from __future__ import annotations
import builtins as __builtins__
import cupy as cp
import cupy._core.raw
# Names exported by `from <module> import *`. Besides the two kernel handles
# and their source strings, the `cp` alias for cupy is exported as well.
__all__ = [
    'bilinear_affine_kernel',
    'bilinear_affine_kernel_code',
    'bilinear_kernel',
    'bilinear_kernel_code',
    'cp',
]
# `__test__` is the mapping doctest consults for extra tests; it is empty here.
__test__: dict = dict()
# Module-level CuPy RawKernel handle. Only its type and repr are recorded
# here; presumably it is built from `bilinear_affine_kernel_code` -- confirm
# against the implementing module, since the construction is not visible here.
bilinear_affine_kernel: cupy._core.raw.RawKernel  # value = <cupy._core.raw.RawKernel object>
# CUDA C source text for `bilinear_affine_kernel`.
#
# What the kernel text does, per destination pixel (dst_x, dst_y):
#   * reads six coefficients matrix[0..5] and maps the destination pixel to
#     source coordinates (tx, ty);
#   * if (tx, ty) lies outside [0, src_width) x [0, src_height), writes 0 to
#     all three channels and returns;
#   * otherwise blends the four neighbouring source pixels with bilinear
#     weights, clamping neighbour indices to the source image bounds.
# The channel count is hard-coded to 3 and both buffers are indexed as
# (row * width + col) * 3 + channel.
bilinear_affine_kernel_code: str = """
extern "C" __global__
void bilinear_affine_kernel(const float *src, float *dst, const float *matrix, int src_height, int src_width, int dst_height, int dst_width) {
    unsigned int dst_x = blockIdx.x * blockDim.x + threadIdx.x;
    unsigned int dst_y = blockIdx.y * blockDim.y + threadIdx.y;
    if (dst_x >= dst_width || dst_y >= dst_height) return;

    // Affine transformation
    float tx = matrix[0] * dst_x + matrix[1] * dst_y + matrix[2];
    float ty = matrix[3] * dst_x + matrix[4] * dst_y + matrix[5];

    int x0 = floor(tx);
    int y0 = floor(ty);
    int x1 = x0 + 1;
    int y1 = y0 + 1;

    // Out of bounds
    if (tx < 0 || tx >= src_width || ty < 0 || ty >= src_height) {
        for (int c = 0; c < 3; c++) {
            dst[(dst_y * dst_width + dst_x) * 3 + c] = 0.0;
        }
        return;
    }

    // Bilinear interpolation
    float wa = (x1 - tx) * (y1 - ty);
    float wb = (x1 - tx) * (ty - y0);
    float wc = (tx - x0) * (y1 - ty);
    float wd = (tx - x0) * (ty - y0);

    x0 = max(0, min(x0, src_width - 1));
    y0 = max(0, min(y0, src_height - 1));
    x1 = max(0, min(x1, src_width - 1));
    y1 = max(0, min(y1, src_height - 1));

    // Output
    for (int c = 0; c < 3; c++) {
        float val00 = src[(y0 * src_width + x0) * 3 + c];
        float val01 = src[(y0 * src_width + x1) * 3 + c];
        float val10 = src[(y1 * src_width + x0) * 3 + c];
        float val11 = src[(y1 * src_width + x1) * 3 + c];

        dst[(dst_y * dst_width + dst_x) * 3 + c] = wa * val00 + wb * val10 + wc * val01 + wd * val11;
    }
}
"""
# Module-level CuPy RawKernel handle. Only its type and repr are recorded
# here; presumably it is built from `bilinear_kernel_code` -- confirm against
# the implementing module, since the construction is not visible here.
bilinear_kernel: cupy._core.raw.RawKernel  # value = <cupy._core.raw.RawKernel object>
# CUDA C source text for `bilinear_kernel`, kept as a single triple-quoted
# literal so the whole kernel is one valid Python string.
#
# What the kernel text does, per output pixel (x_out, y_out):
#   * maps the output pixel to input coordinates with
#     scale = (in - 1) / (out - 1) on each axis;
#   * takes the floor pixel and its +1 neighbour on each axis, capping the
#     +1 neighbour at the last input row/column;
#   * for each of `channels` channels, interpolates along x and then along y
#     and writes the result.
# Both buffers are indexed as (row * width + col) * channels + channel.
#
# NOTE(review): the scale factors divide by (width_out - 1) and
# (height_out - 1), which is a division by zero when an output dimension is 1.
# Confirm callers never request a 1-pixel output dimension, or guard the
# division in the module this text comes from. The kernel text is left
# unchanged here.
bilinear_kernel_code: str = """
extern "C" __global__ void bilinear_kernel(
    const float* __restrict__ input,
    float* __restrict__ output,
    const int width_in,
    const int height_in,
    const int width_out,
    const int height_out,
    const int channels
) {
    // Get the output pixel coordinates
    const int x_out = blockIdx.x * blockDim.x + threadIdx.x;
    const int y_out = blockIdx.y * blockDim.y + threadIdx.y;

    // Check if the thread is within the output image bounds
    if (x_out >= width_out || y_out >= height_out) {
        return;
    }

    // Calculate scaling factors
    const float scale_x = (float)(width_in - 1) / (float)(width_out - 1);
    const float scale_y = (float)(height_in - 1) / (float)(height_out - 1);

    // Calculate the corresponding position in the input image
    const float x_in = x_out * scale_x;
    const float y_in = y_out * scale_y;

    // Get the four neighboring pixels
    const int x0 = __float2int_rd(x_in);  // floor
    const int x1 = min(x0 + 1, width_in - 1);
    const int y0 = __float2int_rd(y_in);  // floor
    const int y1 = min(y0 + 1, height_in - 1);

    // Calculate interpolation weights
    const float wx = x_in - x0;
    const float wy = y_in - y0;

    // Interpolate for each channel
    for (int c = 0; c < channels; c++) {
        // Get the values of the four neighboring pixels for this channel
        const float f00 = input[(y0 * width_in + x0) * channels + c];
        const float f10 = input[(y0 * width_in + x1) * channels + c];
        const float f01 = input[(y1 * width_in + x0) * channels + c];
        const float f11 = input[(y1 * width_in + x1) * channels + c];

        // Perform bilinear interpolation
        // First interpolate in x direction
        const float fx0 = f00 * (1.0f - wx) + f10 * wx;
        const float fx1 = f01 * (1.0f - wx) + f11 * wx;

        // Then interpolate in y direction
        const float result = fx0 * (1.0f - wy) + fx1 * wy;

        // Write the result to the output image
        output[(y_out * width_out + x_out) * channels + c] = result;
    }
}
"""
