from __future__ import annotations
import builtins as __builtins__
import cupy as cp
import cupy._core.raw
# Names exported by a star-import of this module, in their original order.
__all__ = [
    'cp',
    'nearest_affine_kernel',
    'nearest_affine_kernel_code',
    'nearest_kernel',
    'nearest_kernel_code',
]
# Empty doctest registry.
__test__: dict = {}
# Kernel object for the affine nearest-neighbour resampler.
# NOTE(review): presumably compiled from `nearest_affine_kernel_code` below
# under the entry-point name "nearest_affine" -- confirm against the
# implementation module; this file only declares the attribute.
nearest_affine_kernel: cupy._core.raw.RawKernel  # value = <cupy._core.raw.RawKernel object>

# CUDA C source text of the `nearest_affine` kernel.
#
# What the source below does, per thread:
#   * derives one output pixel (x_out, y_out) from the 2-D block/thread
#     indices and returns early when it lies outside the output image;
#   * maps that pixel to source coordinates with six floats read from
#     `matrix` (src_x uses matrix[0..2], src_y uses matrix[3..5]);
#   * rounds both coordinates to integers with `__float2int_rn`;
#   * writes 0.0f to the three output channels when the rounded coordinate
#     falls outside the input image, otherwise copies three channels.
# Pixels are addressed as (y * width + x) * 3 + c, so the channel count is
# fixed at 3 in this kernel.
#
# The literal starts and ends with a newline; its value must stay
# byte-identical to the text shipped in the implementation.
nearest_affine_kernel_code: str = """
extern "C" __global__ void nearest_affine(
    const float* __restrict__ input,
    float* __restrict__ output,
    const float* __restrict__ matrix,
    const int input_height,
    const int input_width,
    const int output_height,
    const int output_width
) {
    // Calculate global thread index
    const int x_out = blockIdx.x * blockDim.x + threadIdx.x;
    const int y_out = blockIdx.y * blockDim.y + threadIdx.y;

    // Check output image bounds
    if (x_out >= output_width || y_out >= output_height) {
        return;
    }

    // Calculate input coordinates using affine transformation
    const float src_x = matrix[0] * x_out + matrix[1] * y_out + matrix[2];
    const float src_y = matrix[3] * x_out + matrix[4] * y_out + matrix[5];

    // Calculate nearest integer coordinates (rounding)
    const int x_in = __float2int_rn(src_x);
    const int y_in = __float2int_rn(src_y);

    // Check bounds
    if (x_in < 0 || x_in >= input_width || y_in < 0 || y_in >= input_height) {
        const int idx_out = (y_out * output_width + x_out) * 3;
        #pragma unroll
        for (int c = 0; c < 3; c++) {
            output[idx_out + c] = 0.0f;
        }
        return;
    }

    // Calculate base indices for input and output
    const int idx_in_base = (y_in * input_width + x_in) * 3;
    const int idx_out_base = (y_out * output_width + x_out) * 3;

    // Optimize: Process all 3 channels at once
    #pragma unroll
    for (int c = 0; c < 3; c++) {
        output[idx_out_base + c] = input[idx_in_base + c];
    }
}
"""
# Kernel object for the plain nearest-neighbour resize.
# NOTE(review): presumably compiled from `nearest_kernel_code` below under
# the entry-point name "nearest_kernel" -- confirm against the
# implementation module; this file only declares the attribute.
nearest_kernel: cupy._core.raw.RawKernel  # value = <cupy._core.raw.RawKernel object>

# CUDA C source text of the `nearest_kernel` kernel.
#
# What the source below does, per thread:
#   * derives one output pixel (x_out, y_out) from the 2-D block/thread
#     indices and returns early when it lies outside the output image;
#   * computes scale_x = width_in / width_out and
#     scale_y = height_in / height_out as floats;
#   * picks the source pixel by truncating x_out * scale_x and
#     y_out * scale_y to int, clamped to width_in - 1 and height_in - 1;
#   * copies `channels` values, addressing pixels as
#     (y * width + x) * channels + c.
# Note the argument order: this kernel takes width before height, unlike
# `nearest_affine`, which takes height before width.
#
# The literal starts and ends with a newline; its value must stay
# byte-identical to the text shipped in the implementation.
nearest_kernel_code: str = """
extern "C" __global__ void nearest_kernel(
    const float* __restrict__ input,
    float* __restrict__ output,
    const int width_in,
    const int height_in,
    const int width_out,
    const int height_out,
    const int channels
) {
    // Calculate global thread index
    const int x_out = blockIdx.x * blockDim.x + threadIdx.x;
    const int y_out = blockIdx.y * blockDim.y + threadIdx.y;

    // Check output image bounds
    if (x_out >= width_out || y_out >= height_out) {
        return;
    }

    // Calculate scale ratio
    const float scale_x = static_cast<float>(width_in) / width_out;
    const float scale_y = static_cast<float>(height_in) / height_out;

    // Calculate corresponding coordinates in input image
    const int x_in = min(static_cast<int>(x_out * scale_x), width_in - 1);
    const int y_in = min(static_cast<int>(y_out * scale_y), height_in - 1);

    // Process for each channel
    for (int c = 0; c < channels; c++) {
        // Calculate input image index
        const int idx_in = (y_in * width_in + x_in) * channels + c;
        // Calculate output image index
        const int idx_out = (y_out * width_out + x_out) * channels + c;

        // Copy pixel value
        output[idx_out] = input[idx_in];
    }
}
"""
