from __future__ import annotations
import builtins as __builtins__
import cupy as cp
import cupy._core.raw
# Names exported by `from <module> import *`: the two kernel objects, their
# CUDA source strings, and the re-exported `cp` (cupy) alias.
__all__ = ['area_affine_kernel', 'area_affine_kernel_code', 'area_kernel', 'area_kernel_code', 'cp']
# Empty `__test__` mapping (the module-level name doctest consults for extra
# tests); nothing is registered here.
__test__: dict = {}
# Declared (annotation only, no value assigned in this file) as a cupy
# RawKernel; the trailing comment records the runtime repr.
# NOTE(review): presumably compiled from `area_affine_kernel_code` -- confirm
# against the implementing module.
area_affine_kernel: cupy._core.raw.RawKernel  # value = <cupy._core.raw.RawKernel object>
# CUDA C source text for the `area_affine_kernel` entry point.
#
# What the embedded kernel does, per thread:
#   * Picks one output pixel (output_x, output_y) from the 2-D block/thread
#     indices and returns early when it lies outside output_width x
#     output_height.
#   * Maps the pixel centre (+0.5) through six floats read as matrix[0..5]
#     (two rows of three) to get (center_x, center_y) in input coordinates.
#   * Builds an axis-aligned sampling box around that centre whose width and
#     height are the Euclidean norms of (matrix[0], matrix[1]) and
#     (matrix[3], matrix[4]) respectively, then clamps the covered integer
#     pixel range to the input image.
#   * Writes the weighted mean of the covered input pixels for each of 3
#     channels (channel count is hard-coded; pixels are indexed as
#     (y * width + x) * 3 + c), or 0.0 for all 3 channels when the box lies
#     entirely outside the input.
#
# Kernel parameter order: input, output, matrix, input_height, input_width,
# output_height, output_width (height before width -- the opposite of
# `area_kernel`, which takes width before height).
#
# NOTE(review): when the sampling range is a single pixel (iy0 == iy1 - 1 or
# ix0 == ix1 - 1) the second `if` overwrites the weight set by the first, so
# only the far-edge coverage is used.
# NOTE(review): when the range is clamped at 0 (top or left negative), the
# first-pixel weight `1 - (top - y)` / `1 - (left - x)` exceeds 1.
# The string is runtime data and must not be edited here.
area_affine_kernel_code: str = '\nextern "C" __global__\nvoid area_affine_kernel(\n    const float* __restrict__ input,\n    float* __restrict__ output,\n    const float* __restrict__ matrix,\n    int input_height,\n    int input_width,\n    int output_height,\n    int output_width\n) {\n    unsigned int output_x = blockIdx.x * blockDim.x + threadIdx.x;\n    unsigned int output_y = blockIdx.y * blockDim.y + threadIdx.y;\n\n    if (output_x >= output_width || output_y >= output_height) return;\n\n    // Calculate affine transformation coordinates\n    float center_x = matrix[0] * (output_x + 0.5f) + matrix[1] * (output_y + 0.5f) + matrix[2];\n    float center_y = matrix[3] * (output_x + 0.5f) + matrix[4] * (output_y + 0.5f) + matrix[5];\n\n    // Calculate actual scaling factors from transformation matrix\n    float scale_x = sqrtf(matrix[0] * matrix[0] + matrix[1] * matrix[1]);\n    float scale_y = sqrtf(matrix[3] * matrix[3] + matrix[4] * matrix[4]);\n\n    // Calculate size of sampling area\n    float half_width = 0.5f * scale_x;\n    float half_height = 0.5f * scale_y;\n\n    // Calculate boundaries of sampling area\n    float left = center_x - half_width;\n    float right = center_x + half_width;\n    float top = center_y - half_height;\n    float bottom = center_y + half_height;\n\n    // Convert to integer coordinate range\n    int ix0 = max(0, (int)floor(left));\n    int ix1 = min(input_width, (int)ceil(right));\n    int iy0 = max(0, (int)floor(top));\n    int iy1 = min(input_height, (int)ceil(bottom));\n\n    // If sampling area is outside the image\n    if (ix0 >= input_width || ix1 <= 0 || iy0 >= input_height || iy1 <= 0) {\n        for (int c = 0; c < 3; c++) {\n            output[(output_y * output_width + output_x) * 3 + c] = 0.0f;\n        }\n        return;\n    }\n\n    // Calculate average value for each channel\n    for (int c = 0; c < 3; c++) {\n        float sum = 0.0f;\n        float total_weight = 0.0f;\n\n        // Sample each pixel in the 
area\n        for (int y = iy0; y < iy1; y++) {\n            // Calculate vertical weight for the pixel\n            float wy = 1.0f;\n            if (y == iy0) {\n                wy = 1.0f - (top - y);\n            }\n            if (y == iy1 - 1) {\n                wy = 1.0f - (y + 1 - bottom);\n            }\n\n            for (int x = ix0; x < ix1; x++) {\n                // Calculate horizontal weight for the pixel\n                float wx = 1.0f;\n                if (x == ix0) {\n                    wx = 1.0f - (left - x);\n                }\n                if (x == ix1 - 1) {\n                    wx = 1.0f - (x + 1 - right);\n                }\n\n                // Calculate overall weight\n                float weight = wx * wy;\n\n                // Calculate weighted sum\n                sum += input[(y * input_width + x) * 3 + c] * weight;\n                total_weight += weight;\n            }\n        }\n\n        // Write normalized value\n        output[(output_y * output_width + output_x) * 3 + c] = total_weight > 0.0f ? sum / total_weight : 0.0f;\n    }\n}\n'
# Declared (annotation only, no value assigned in this file) as a cupy
# RawKernel; the trailing comment records the runtime repr.
# NOTE(review): presumably compiled from `area_kernel_code` -- confirm against
# the implementing module.
area_kernel: cupy._core.raw.RawKernel  # value = <cupy._core.raw.RawKernel object>
# CUDA C source text for the `area_kernel` entry point.
#
# What the embedded kernel does, per thread:
#   * Picks one output pixel (x, y) from the 2-D block/thread indices and
#     returns early when it lies outside new_width x new_height.
#   * Computes the source span [x * x_ratio, (x + 1) * x_ratio) (and likewise
#     in y), where x_ratio = input_width / new_width and
#     y_ratio = input_height / new_height, then clamps the covered integer
#     pixel range to the input image.
#   * Weights every covered source pixel by its overlap area with that span
#     and writes the weighted mean; pixels are indexed as
#     (y * width + x) * channels + c. When the accumulated weight is not
#     positive, 0.0 is written for every channel.
#
# Kernel parameter order: input, output, input_width, input_height,
# new_width, new_height, channels (width before height -- the opposite of
# `area_affine_kernel`).
#
# NOTE(review): the `__shared__ tile` array and the BLOCK_SIZE macro are
# declared but never referenced in the kernel body.
# NOTE(review): the accumulator has three slots and the generic branch loops
# with `c < channels && c < 3`, so for channels > 3 only the first three
# channels are written on the normal path.
# The string is runtime data and must not be edited here.
area_kernel_code: str = '\n#define TILE_SIZE 32\n#define BLOCK_SIZE 32\n\nextern "C" __global__\nvoid area_kernel(const float* __restrict__ input,\n                  float* __restrict__ output,\n                  int input_width, int input_height,\n                  int new_width, int new_height,\n                  int channels) {\n    // Shared memory for input tile caching\n    __shared__ float tile[TILE_SIZE + 2][TILE_SIZE + 2];\n    \n    int tx = threadIdx.x;\n    int ty = threadIdx.y;\n    int x = blockIdx.x * blockDim.x + tx;\n    int y = blockIdx.y * blockDim.y + ty;\n\n    if (x >= new_width || y >= new_height) return;\n\n    // Calculate ratio for scaling\n    float x_ratio = (float)input_width / new_width;\n    float y_ratio = (float)input_height / new_height;\n\n    // Calculate source coordinate range in input image\n    float input_x_start = x * x_ratio;\n    float input_y_start = y * y_ratio;\n    float input_x_end = (x + 1) * x_ratio;\n    float input_y_end = (y + 1) * y_ratio;\n\n    // Get integer parts\n    int x_start = (int)floorf(input_x_start);\n    int y_start = (int)floorf(input_y_start);\n    int x_end = (int)ceilf(input_x_end);\n    int y_end = (int)ceilf(input_y_end);\n\n    // Check bounds\n    x_start = max(0, x_start);\n    y_start = max(0, y_start);\n    x_end = min(input_width, x_end);\n    y_end = min(input_height, y_end);\n\n    // Process all channels at once\n    float values[3] = {0.0f, 0.0f, 0.0f};\n    float weight_sum = 0.0f;\n\n    // Calculate weighted sum\n    for (int sy = y_start; sy < y_end; sy++) {\n        for (int sx = x_start; sx < x_end; sx++) {\n            // Calculate pixel overlap area\n            float wx = fminf(input_x_end, sx + 1) - fmaxf(input_x_start, sx);\n            float wy = fminf(input_y_end, sy + 1) - fmaxf(input_y_start, sy);\n            float weight = wx * wy;\n\n            int idx = (sy * input_width + sx) * channels;\n            \n            // Unrolled channel access for better 
performance\n            if (channels == 3) {\n                values[0] += input[idx] * weight;\n                values[1] += input[idx + 1] * weight;\n                values[2] += input[idx + 2] * weight;\n            } else if (channels == 1) {\n                values[0] += input[idx] * weight;\n            } else {\n                for (int c = 0; c < channels && c < 3; ++c) {\n                    values[c] += input[idx + c] * weight;\n                }\n            }\n            weight_sum += weight;\n        }\n    }\n\n    // Write output\n    int out_idx = (y * new_width + x) * channels;\n    if (weight_sum > 0) {\n        float inv_weight = 1.0f / weight_sum;\n        if (channels == 3) {\n            output[out_idx] = values[0] * inv_weight;\n            output[out_idx + 1] = values[1] * inv_weight;\n            output[out_idx + 2] = values[2] * inv_weight;\n        } else if (channels == 1) {\n            output[out_idx] = values[0] * inv_weight;\n        } else {\n            for (int c = 0; c < channels && c < 3; ++c) {\n                output[out_idx + c] = values[c] * inv_weight;\n            }\n        }\n    } else {\n        for (int c = 0; c < channels; ++c) {\n            output[out_idx + c] = 0.0f;\n        }\n    }\n}\n'
