from __future__ import annotations
import builtins as __builtins__
import cupy as cp
import cupy._core.raw
# Public names exported by a star-import of this module.
__all__ = ['bicubic_affine_kernel', 'bicubic_affine_kernel_code', 'bicubic_kernel', 'bicubic_kernel_code', 'cp']
# Empty mapping.
# NOTE(review): looks like the doctest `__test__` hook emitted by the build tool -- confirm.
__test__: dict = {}
# Kernel handle declared as a CuPy RawKernel.
# NOTE(review): presumably built from `bicubic_affine_kernel_code`; the construction
# is not visible in this file -- confirm against the implementation module.
bicubic_affine_kernel: cupy._core.raw.RawKernel  # value = <cupy._core.raw.RawKernel object>
# CUDA C++ source text defining the `bicubic_affine` kernel.
#
# What the kernel text does:
#   * Each thread derives one output pixel (x, y) from blockIdx/blockDim/threadIdx
#     and returns immediately if it lies outside output_width x output_height.
#   * `matrix` is read as six floats: [0..2] produce src_x and [3..5] produce src_y
#     from the output (x, y), i.e. an output -> input affine mapping.
#   * If src_x or src_y is outside [2, input_size - 2), the three output channels
#     are set to 0 and the thread returns.
#   * Otherwise a 4x4 neighbourhood starting at (int(src_x) - 1, int(src_y) - 1) is
#     combined with separable piecewise-cubic weights and divided by the weight sum
#     (falling back to 0 when the sum is not positive).
#   * Pixels are addressed as (row * width + col) * 3 + c, so three interleaved
#     channels are hard-coded for both input and output.
#
# NOTE(review): the in-kernel comment says "rounding", but `int(src_x)` truncates.
#
# Kept as one triple-quoted literal: a quoted string cannot span physical lines,
# and this form lets the embedded source be read as CUDA. The text starts with a
# newline and ends with a newline after the closing brace.
bicubic_affine_kernel_code: str = """
extern "C" __global__
void bicubic_affine(
    const float* __restrict__ input,
    float* __restrict__ output,
    const float* __restrict__ matrix,
    const int input_height,
    const int input_width,
    const int output_height,
    const int output_width
) {
    const int x = blockIdx.x * blockDim.x + threadIdx.x;
    const int y = blockIdx.y * blockDim.y + threadIdx.y;
    if (x >= output_width || y >= output_height) return;

    // Define bicubic interpolation kernel as a lambda
    auto cubic_kernel = [](float x) -> float {
        x = abs(x);
        if (x <= 1.0f)
            return 1.5f * x * x * x - 2.5f * x * x + 1.0f;
        else if (x < 2.0f)
            return -0.5f * x * x * x + 2.5f * x * x - 4.0f * x + 2.0f;
        return 0.0f;
    };

    // Calculate input coordinates using affine transformation
    const float src_x = matrix[0] * x + matrix[1] * y + matrix[2];
    const float src_y = matrix[3] * x + matrix[4] * y + matrix[5];

    // Calculate nearest integer coordinates (rounding)
    const int x0 = int(src_x) - 1;
    const int y0 = int(src_y) - 1;

    // Check bounds
    if (src_x < 2.0f || src_x >= input_width - 2.0f ||
        src_y < 2.0f || src_y >= input_height - 2.0f) {
        const int out_pos = (y * output_width + x) * 3;
        #pragma unroll
        for (int c = 0; c < 3; c++) {
            output[out_pos + c] = 0.0f;
        }
        return;
    }

    // Position of the output pixel in the output array
    const int out_pos = (y * output_width + x) * 3;

    // Calculate interpolation for each channel
    #pragma unroll
    for (int c = 0; c < 3; c++) {
        float result = 0.0f;
        float weight_sum = 0.0f;

        // Calculate weighted sum for 16 neighboring pixels
        #pragma unroll
        for (int i = 0; i < 4; i++) {
            const int src_y_idx = max(0, min(y0 + i, input_height - 1));
            const float dy = abs(src_y - (y0 + i));
            const float wy = cubic_kernel(dy);
            if (wy == 0.0f) continue;

            #pragma unroll
            for (int j = 0; j < 4; j++) {
                const int src_x_idx = max(0, min(x0 + j, input_width - 1));
                const float dx = abs(src_x - (x0 + j));
                const float wx = cubic_kernel(dx);
                if (wx == 0.0f) continue;

                const float weight = wx * wy;
                const int src_idx = (src_y_idx * input_width + src_x_idx) * 3 + c;
                result += input[src_idx] * weight;
                weight_sum += weight;
            }
        }

        // Write result
        output[out_pos + c] = weight_sum > 0.0f ? result / weight_sum : 0.0f;
    }
}
"""
# Kernel handle declared as a CuPy RawKernel.
# NOTE(review): presumably built from `bicubic_kernel_code`; the construction is
# not visible in this file -- confirm against the implementation module.
bicubic_kernel: cupy._core.raw.RawKernel  # value = <cupy._core.raw.RawKernel object>
# CUDA C++ source text defining the `bicubic_kernel` kernel (image resize).
#
# What the kernel text does:
#   * Each thread derives one output pixel (x, y) from blockIdx/blockDim/threadIdx
#     and returns immediately if it lies outside output_width x output_height.
#   * Source coordinates are x * (input_width / output_width) and
#     y * (input_height / output_height); no half-pixel offset is applied.
#   * A 4x4 neighbourhood starting at (int(src_x) - 1, int(src_y) - 1) is sampled,
#     with neighbour indices clamped to [0, size - 1] at the image edges.
#   * Weights are the product of a piecewise-cubic function of the x and y
#     distances; the accumulated value is divided by the weight sum with no
#     zero check.
#   * Pixels are addressed as (row * width + col) * channels + c, with the channel
#     count supplied at launch time through the `channels` argument.
#
# Kept as one triple-quoted literal: a quoted string cannot span physical lines,
# and this form lets the embedded source be read as CUDA. The text starts with a
# newline and ends with a newline after the closing brace.
bicubic_kernel_code: str = """
extern "C" __global__
void bicubic_kernel(
    const float* __restrict__ input,
    float* __restrict__ output,
    const int input_height,
    const int input_width,
    const int output_height,
    const int output_width,
    const int channels
) {
    const int x = blockIdx.x * blockDim.x + threadIdx.x;
    const int y = blockIdx.y * blockDim.y + threadIdx.y;

    if (x >= output_width || y >= output_height) return;

    const float scale_x = (float)input_width / output_width;
    const float scale_y = (float)input_height / output_height;

    const float src_x = x * scale_x;
    const float src_y = y * scale_y;

    const int x0 = int(src_x) - 1;
    const int y0 = int(src_y) - 1;

    auto cubic_kernel = [](float x) -> float {
        x = abs(x);
        if (x <= 1.0f)
            return 1.5f * x * x * x - 2.5f * x * x + 1.0f;
        else if (x < 2.0f)
            return -0.5f * x * x * x + 2.5f * x * x - 4.0f * x + 2.0f;
        return 0.0f;
    };

    // Position of the output pixel in the output array
    const int out_pos = (y * output_width + x) * channels;

    // Calculate interpolation for each channel
    #pragma unroll
    for (int c = 0; c < channels; c++) {
        float result = 0.0f;
        float weight_sum = 0.0f;

        // Calculate weighted sum for 16 neighboring pixels
        for (int i = 0; i < 4; i++) {
            for (int j = 0; j < 4; j++) {
                const int src_y_idx = max(0, min(y0 + i, input_height - 1));
                const int src_x_idx = max(0, min(x0 + j, input_width - 1));

                const float dx = abs(src_x - (x0 + j));
                const float dy = abs(src_y - (y0 + i));

                const float weight = cubic_kernel(dx) * cubic_kernel(dy);

                // Calculate direct index from input image
                const int src_idx = (src_y_idx * input_width + src_x_idx) * channels + c;
                result += input[src_idx] * weight;
                weight_sum += weight;
            }
        }

        // Write result
        output[out_pos + c] = result / weight_sum;
    }
}
"""
