from __future__ import annotations
import builtins as __builtins__
import cupy as cp
import cupy._core.raw
# Names exported by a star-import of this module.
__all__ = ['cp', 'mitchell_affine_kernel', 'mitchell_affine_kernel_code', 'mitchell_kernel', 'mitchell_kernel_code']
# Empty mapping: no additional doctest targets are declared for this module.
__test__: dict = {}
# Declared (not assigned here) as a cupy RawKernel object.
# NOTE(review): presumably built from `mitchell_affine_kernel_code` under the
# entry-point name "mitchell_affine_kernel" -- confirm against the implementing module.
mitchell_affine_kernel: cupy._core.raw.RawKernel  # value = <cupy._core.raw.RawKernel object>
# CUDA C source for the affine-warp Mitchell filter kernel.
#
# The source defines two symbols:
#   * `mitchell_weight(x, B, C)` -- device helper returning a piecewise cubic in
#     |x| parameterised by B and C; it returns 0 for |x| >= 2.
#   * `mitchell_affine_kernel(input, output, matrix, input_height, input_width,
#     output_height, output_width, B, C)` -- one thread per output pixel (x, y).
#     The source position is `matrix[0..2]` . (x, y, 1) for src_x and
#     `matrix[3..5]` . (x, y, 1) for src_y.  Output pixels whose source position
#     falls outside [1, input_width - 2) x [1, input_height - 2) are written as
#     0.0f.  Otherwise a 4x4 window starting at floor(src - 1) is accumulated
#     and divided by the sum of the weights.
#
# The kernel hard-codes 3 interleaved channels: pixels are addressed as
# `(row * width + col) * 3 + c`.
#
# The literal is written as a single triple-quoted string so that it can span
# physical lines; the text starts with a newline and ends with "}\n".
# NOTE(review): if this file is generated from another module, mirror the
# in-kernel comment correction there as well.
mitchell_affine_kernel_code: str = """
extern "C" __device__ float mitchell_weight(float x, float B, float C) {
    const float ax = fabsf(x);
    float result;
    if (ax < 1.0f) {
        result = ((12.0f - 9.0f * B - 6.0f * C) * ax * ax * ax
            + (-18.0f + 12.0f * B + 6.0f * C) * ax * ax
            + (6.0f - 2.0f * B)) / 6.0f;
    } else if (ax < 2.0f) {
        result = ((-B - 6.0f * C) * ax * ax * ax
            + (6.0f * B + 30.0f * C) * ax * ax
            + (-12.0f * B - 48.0f * C) * ax
            + (8.0f * B + 24.0f * C)) / 6.0f;
    } else {
        result = 0.0f;
    }
    return result;
}

extern "C" __global__
void mitchell_affine_kernel(
    const float* __restrict__ input,
    float* __restrict__ output,
    const float* __restrict__ matrix,
    const int input_height,
    const int input_width,
    const int output_height,
    const int output_width,
    const float B,
    const float C
) {
    const int x = blockIdx.x * blockDim.x + threadIdx.x;
    const int y = blockIdx.y * blockDim.y + threadIdx.y;
    if (x >= output_width || y >= output_height) return;

    // Calculate input coordinates using affine transformation
    const float src_x = matrix[0] * x + matrix[1] * y + matrix[2];
    const float src_y = matrix[3] * x + matrix[4] * y + matrix[5];

    // First tap of the 4x4 sampling window: floor(src - 1), not rounding
    const int x1 = floorf(src_x - 1);
    const int y1 = floorf(src_y - 1);

    // Check bounds
    if (src_x < 1 || src_x >= input_width - 2 || src_y < 1 || src_y >= input_height - 2) {
        const int output_idx = (y * output_width + x) * 3;
        #pragma unroll
        for (int c = 0; c < 3; c++) {
            output[output_idx + c] = 0.0f;
        }
        return;
    }

    // Optimization 1: Store results in temporary variables for each channel
    float channel_results[3];

    #pragma unroll
    for (int c = 0; c < 3; c++) {
        float sum = 0.0f;
        float weight_sum = 0.0f;

        // Optimization 2: Dynamically limit y loop range
        const int start_y = max(0, y1);
        const int end_y = min(input_height, y1 + 4);

        for (int iy = start_y; iy < end_y; iy++) {
            const float dy_dist = fabsf(src_y - iy);
            if (dy_dist >= 2.0f) continue;
            const float wy = mitchell_weight(dy_dist, B, C);

            // Optimization 3: Dynamically limit x loop range
            const int start_x = max(0, x1);
            const int end_x = min(input_width, x1 + 4);

            for (int ix = start_x; ix < end_x; ix++) {
                const float dx_dist = fabsf(src_x - ix);
                if (dx_dist >= 2.0f) continue;
                const float wx = mitchell_weight(dx_dist, B, C);
                const float weight = wx * wy;
                const float pixel = input[(iy * input_width + ix) * 3 + c];
                sum += pixel * weight;
                weight_sum += weight;
            }
        }
        channel_results[c] = (weight_sum > 0.0f) ? sum / weight_sum : 0.0f;
    }

    // Optimization 4: Write results in a single operation
    const int output_idx = (y * output_width + x) * 3;
    #pragma unroll
    for (int c = 0; c < 3; c++) {
        output[output_idx + c] = channel_results[c];
    }
}
"""
# Declared (not assigned here) as a cupy RawKernel object.
# NOTE(review): presumably built from `mitchell_kernel_code` under the
# entry-point name "mitchell_kernel" -- confirm against the implementing module.
mitchell_kernel: cupy._core.raw.RawKernel  # value = <cupy._core.raw.RawKernel object>
# CUDA C source for the Mitchell filter resize kernel.
#
# The source defines two symbols:
#   * `mitchell_weight(x, B, C)` -- device helper returning a piecewise cubic in
#     |x| parameterised by B and C; it returns 0 for |x| >= 2.
#   * `mitchell_kernel(input, output, input_height, input_width, output_height,
#     output_width, channels, B, C)` -- one thread per output pixel (x, y).
#     The source position is `(x + 0.5) * input_width / output_width - 0.5`
#     (likewise for y).  A 4x4 window starting at floor(src - 1), clamped to
#     the input extent, is accumulated and divided by the sum of the weights.
#
# Pixels are addressed as `(row * width + col) * channels + c`, and only the
# first `min(channels, 4)` channels are computed and written.
#
# Fix: the per-thread result buffer used to be `float channel_results[3]` while
# both channel loops run up to `min(channels, 4)`, so a 4-channel image wrote
# and read one element past the end of the array.  The buffer now holds 4
# elements, matching the loop bound.
#
# The literal is written as a single triple-quoted string so that it can span
# physical lines; the text starts with a newline and ends with "}\n".
# NOTE(review): if this file is generated from another module, apply the same
# buffer-size fix there.
mitchell_kernel_code: str = """
extern "C" __device__ float mitchell_weight(float x, float B, float C) {
    const float ax = fabsf(x);
    float result;
    if (ax < 1.0f) {
        result = ((12.0f - 9.0f * B - 6.0f * C) * ax * ax * ax
            + (-18.0f + 12.0f * B + 6.0f * C) * ax * ax
            + (6.0f - 2.0f * B)) / 6.0f;
    } else if (ax < 2.0f) {
        result = ((-B - 6.0f * C) * ax * ax * ax
            + (6.0f * B + 30.0f * C) * ax * ax
            + (-12.0f * B - 48.0f * C) * ax
            + (8.0f * B + 24.0f * C)) / 6.0f;
    } else {
        result = 0.0f;
    }
    return result;
}

extern "C" __global__
void mitchell_kernel(
    const float* __restrict__ input,
    float* __restrict__ output,
    const int input_height,
    const int input_width,
    const int output_height,
    const int output_width,
    const int channels,
    const float B,
    const float C
) {
    const int x = blockIdx.x * blockDim.x + threadIdx.x;
    const int y = blockIdx.y * blockDim.y + threadIdx.y;
    if (x >= output_width || y >= output_height) return;

    const float src_x = (x + 0.5f) * input_width / output_width - 0.5f;
    const float src_y = (y + 0.5f) * input_height / output_height - 0.5f;

    const int x1 = floorf(src_x - 1);
    const int y1 = floorf(src_y - 1);

    // Per-channel results; sized to match the min(channels, 4) loop bound
    float channel_results[4];  // Support up to 4 channels

    #pragma unroll
    for (int c = 0; c < min(channels, 4); c++) {
        float sum = 0.0f;
        float weight_sum = 0.0f;

        // Optimization 2: Dynamically limit y loop range
        const int start_y = max(0, y1);
        const int end_y = min(input_height, y1 + 4);

        for (int iy = start_y; iy < end_y; iy++) {
            const float dy_dist = fabsf(src_y - iy);
            if (dy_dist >= 2.0f) continue;
            const float wy = mitchell_weight(dy_dist, B, C);

            // Optimization 3: Dynamically limit x loop range
            const int start_x = max(0, x1);
            const int end_x = min(input_width, x1 + 4);

            for (int ix = start_x; ix < end_x; ix++) {
                const float dx_dist = fabsf(src_x - ix);
                if (dx_dist >= 2.0f) continue;
                const float wx = mitchell_weight(dx_dist, B, C);
                const float weight = wx * wy;

                const float pixel = input[(iy * input_width + ix) * channels + c];
                sum += pixel * weight;
                weight_sum += weight;
            }
        }

        channel_results[c] = (weight_sum > 0.0f) ? sum / weight_sum : 0.0f;
    }

    // Optimization 4: Write results in a single operation
    const int output_idx = (y * output_width + x) * channels;
    #pragma unroll
    for (int c = 0; c < min(channels, 4); c++) {
        output[output_idx + c] = channel_results[c];
    }
}
"""
