from __future__ import annotations
import builtins as __builtins__
import cupy as cp
import cupy._core.raw
# Public names of this module: the `cp` alias for cupy, the two kernel
# objects, and the two CUDA source strings declared below.
__all__ = ['cp', 'lanczos_affine_kernel', 'lanczos_affine_kernel_code', 'lanczos_kernel', 'lanczos_kernel_code']
# Empty `__test__` mapping (the module-level name the doctest machinery
# consults for extra tests); nothing is registered here.
__test__: dict = {}
# Kernel object for the affine Lanczos resampler. Only the type is declared
# here (no value is assigned in this file); the trailing comment records the
# repr of the runtime value.
# NOTE(review): presumably built from `lanczos_affine_kernel_code` with the
# entry point `lanczos_affine` -- confirm against the implementing module.
lanczos_affine_kernel: cupy._core.raw.RawKernel  # value = <cupy._core.raw.RawKernel object>
# CUDA C source text. It defines two `__device__` helpers and one
# `__global__` entry point:
#   * sinc(x)       -- sin(pi*x) / (pi*x), with sinc(0) == 1.
#   * lanczos(x, a) -- sinc(x) * sinc(x / a) for -a <= x <= a, else 0.
#   * lanczos_affine(input, output, matrix, input_height, input_width,
#                    output_height, output_width, a)
# What the entry point does, as written in the source:
#   * Each thread derives one output pixel (x_out, y_out) from
#     blockIdx/blockDim/threadIdx and returns early if it lies outside the
#     output size.
#   * The source position is computed from six coefficients:
#     src_x = m[0]*x_out + m[1]*y_out + m[2],
#     src_y = m[3]*x_out + m[4]*y_out + m[5].
#   * If the source position is closer than `a + 1` to any input border, the
#     three output channels are set to 0 and the thread returns.
#   * Otherwise, for each of exactly 3 channels, samples with
#     sy in [max(0, iy-a+1), min(input_height-1, iy+a)] and the analogous sx
#     range are weighted by lanczos(dx, a) * lanczos(dy, a), accumulated, and
#     divided by the weight sum (0 is written when that sum is not > 0).
#   * Both buffers are indexed as (y * width + x) * 3 + c, i.e. the channel
#     count of 3 is hard-coded.
# NOTE(review): the embedded comment says "nearest integer coordinates
# (rounding)" but the code uses floorf(), not rounding.
# NOTE(review): height precedes width in this argument list; the
# `lanczos_kernel` source takes width before height.
lanczos_affine_kernel_code: str = '\nextern "C" __device__ float sinc(float x) {\n    if (x == 0.0f) return 1.0f;\n    const float pi_x = 3.14159265358979323846f * x;\n    return sinf(pi_x) / pi_x;\n}\n\nextern "C" __device__ float lanczos(float x, int a) {\n    if (x < -a || x > a) return 0.0f;\n    if (x == 0.0f) return 1.0f;\n    return sinc(x) * sinc(x / a);\n}\n\nextern "C" __global__\nvoid lanczos_affine(\n    const float* __restrict__ input,\n    float* __restrict__ output,\n    const float* __restrict__ matrix,\n    const int input_height,\n    const int input_width,\n    const int output_height,\n    const int output_width,\n    const int a\n) {\n    const int x_out = blockIdx.x * blockDim.x + threadIdx.x;\n    const int y_out = blockIdx.y * blockDim.y + threadIdx.y;\n    if (x_out >= output_width || y_out >= output_height) return;\n\n\n    // Calculate affine transformation coordinates\n    const float src_x = matrix[0] * x_out + matrix[1] * y_out + matrix[2];\n    const float src_y = matrix[3] * x_out + matrix[4] * y_out + matrix[5];\n\n    // Check bounds\n    const int margin = a + 1;\n    if (src_x < margin || src_x >= input_width - margin ||\n        src_y < margin || src_y >= input_height - margin) {\n        const int output_idx = (y_out * output_width + x_out) * 3;\n        #pragma unroll\n        for (int c = 0; c < 3; c++) {\n            output[output_idx + c] = 0.0f;\n        }\n        return;\n    }\n\n    // Calculate nearest integer coordinates (rounding)\n    const int ix = floorf(src_x);\n    const int iy = floorf(src_y);\n\n    // Store results in temporary variables for each channel\n    float channel_results[3] = {0.0f, 0.0f, 0.0f};\n\n    #pragma unroll\n    for (int c = 0; c < 3; c++) {\n        float sum = 0.0f;\n        float weight_sum = 0.0f;\n\n        // Limit y loop range\n        const int start_y = max(0, iy - a + 1);\n        const int end_y = min(input_height - 1, iy + a);\n\n        #pragma unroll\n        for (int sy = 
start_y; sy < end_y + 1; sy++) {\n            const float dy = src_y - sy;\n            if (fabsf(dy) >= a) continue;\n            const float wy = lanczos(dy, a);\n\n            // Limit x loop range\n            const int start_x = max(0, ix - a + 1);\n            const int end_x = min(input_width - 1, ix + a);\n\n            #pragma unroll\n            for (int sx = start_x; sx < end_x + 1; sx++) {\n                const float dx = src_x - sx;\n                if (fabsf(dx) >= a) continue;\n                const float wx = lanczos(dx, a);\n\n                const float weight = wx * wy;\n                if (weight == 0.0f) continue;\n\n                const int input_idx = (sy * input_width + sx) * 3 + c;\n                sum += input[input_idx] * weight;\n                weight_sum += weight;\n            }\n        }\n\n        channel_results[c] = (weight_sum > 0.0f) ? sum / weight_sum : 0.0f;\n    }\n\n    // Write results\n    const int output_idx = (y_out * output_width + x_out) * 3;\n    #pragma unroll\n    for (int c = 0; c < 3; c++) {\n        output[output_idx + c] = channel_results[c];\n    }\n}\n'
# Kernel object for the Lanczos resize. Only the type is declared here (no
# value is assigned in this file); the trailing comment records the repr of
# the runtime value.
# NOTE(review): presumably built from `lanczos_kernel_code` with the entry
# point `lanczos_kernel` -- confirm against the implementing module.
lanczos_kernel: cupy._core.raw.RawKernel  # value = <cupy._core.raw.RawKernel object>
# CUDA C source text defining one `__global__` entry point:
#   lanczos_kernel(input, output, width_in, height_in, width_out,
#                  height_out, a, channels)
# What the entry point does, as written in the source:
#   * Each thread derives one output pixel (x_out, y_out) from
#     blockIdx/blockDim/threadIdx and returns early if it lies outside
#     width_out x height_out.
#   * The source position is x_in = x_out / (width_out / width_in), and
#     likewise for y; no half-pixel offset is applied.
#   * The window centre is the (int) truncation of the source position; the
#     window spans [centre - a, centre + a + 1), clamped to the input size.
#   * Per-axis weights are computed inline: 1 when d == 0,
#     a * sin(pi*d) * sin(pi*d / a) / (pi*d)^2 when |d| < a, otherwise 0.
#   * For every channel c in [0, channels), the weighted sum is divided by
#     the weight sum (0 is written when that sum is not > 0).
#   * Both buffers are indexed as (y * width + x) * channels + c, so the
#     channel count is a runtime argument here.
# NOTE(review): this source calls `sin`/`abs`, whereas the affine kernel
# source uses `sinf`/`fabsf`; width also precedes height in this argument
# list, the reverse of the affine kernel.
lanczos_kernel_code: str = '\nextern "C" __global__ void lanczos_kernel(\n    const float* input,\n    float* output,\n    const int width_in,\n    const int height_in,\n    const int width_out,\n    const int height_out,\n    const int a,\n    const int channels\n) {\n    const float PI = 3.14159265358979323846f;\n\n    // Calculate thread indices\n    const int x_out = blockIdx.x * blockDim.x + threadIdx.x;\n    const int y_out = blockIdx.y * blockDim.y + threadIdx.y;\n\n    if (x_out >= width_out || y_out >= height_out) return;\n\n    // Calculate scaling factors\n    const float scale_x = (float)width_out / width_in;\n    const float scale_y = (float)height_out / height_in;\n\n    // Calculate source position\n    const float x_in = x_out / scale_x;\n    const float y_in = y_out / scale_y;\n\n    // Calculate kernel centers\n    const int kernel_center_x = (int)x_in;\n    const int kernel_center_y = (int)y_in;\n\n    // Calculate kernel boundaries\n    const int start_x = max(kernel_center_x - a, 0);\n    const int end_x = min(kernel_center_x + a + 1, width_in);\n    const int start_y = max(kernel_center_y - a, 0);\n    const int end_y = min(kernel_center_y + a + 1, height_in);\n\n    // Process each channel\n    for (int c = 0; c < channels; ++c) {\n        float sum = 0.0f;\n        float weight_sum = 0.0f;\n\n        // Apply Lanczos kernel\n        for (int y_k = start_y; y_k < end_y; ++y_k) {\n            const float dy = y_in - y_k;\n            float wy;\n\n            // Calculate y-direction kernel weight\n            if (dy == 0.0f) {\n                wy = 1.0f;\n            }\n            else if (abs(dy) < a) {\n                const float y_pi = PI * dy;\n                wy = a * sin(y_pi) * sin(y_pi / a) / (y_pi * y_pi);\n            }\n            else {\n                wy = 0.0f;\n            }\n\n            for (int x_k = start_x; x_k < end_x; ++x_k) {\n                const float dx = x_in - x_k;\n                float wx;\n\n                
// Calculate x-direction kernel weight\n                if (dx == 0.0f) {\n                    wx = 1.0f;\n                }\n                else if (abs(dx) < a) {\n                    const float x_pi = PI * dx;\n                    wx = a * sin(x_pi) * sin(x_pi / a) / (x_pi * x_pi);\n                }\n                else {\n                    wx = 0.0f;\n                }\n\n                const float weight = wx * wy;\n                const float pixel_value = input[(y_k * width_in + x_k) * channels + c];\n                sum += weight * pixel_value;\n                weight_sum += weight;\n            }\n        }\n\n        // Normalize and store result\n        if (weight_sum > 0.0f) {\n            output[(y_out * width_out + x_out) * channels + c] = sum / weight_sum;\n        }\n        else {\n            output[(y_out * width_out + x_out) * channels + c] = 0.0f;\n        }\n    }\n}\n'
