"""
Test: RLE Analysis with Real Parquet Data

This test demonstrates the refactored RLE module using real parquet data.
It analyzes table statistics, NDV, and value frequency patterns.
"""

import duckdb
import sys
from pathlib import Path

# Add parent directory to path to import duckrun
sys.path.insert(0, str(Path(__file__).parent.parent))

from duckrun.rle import get_table_stats, get_value_frequency_details


def _print_banner(title, *, leading_blank=True):
    """Print *title* framed by two 80-character '=' rules.

    Args:
        title: Heading text printed between the rules.
        leading_blank: When true, emit an empty line before the first rule.
    """
    rule = "=" * 80
    print(("\n" if leading_blank else "") + rule)
    print(title)
    print(rule)


def _sql_literal(text):
    """Return *text* as a single-quoted SQL string literal.

    Embedded single quotes are doubled so that a path such as
    ``/home/o'brien/data.parquet`` cannot terminate the literal early.
    """
    return "'" + text.replace("'", "''") + "'"


def _print_summary(stats_df, top_columns):
    """Print the score tiers and the sorting recommendation for *stats_df*.

    Args:
        stats_df: DataFrame with at least ``column_name`` and
            ``repetition_score`` columns.
        top_columns: How many leading rows to list as sort candidates.
    """
    _print_banner("SUMMARY & RECOMMENDATIONS")

    scores = stats_df['repetition_score']
    tiers = [
        ("Excellent (score > 100)", stats_df[scores > 100]),
        ("Good (score 10-100)", stats_df[(scores >= 10) & (scores <= 100)]),
        ("Poor (score < 10)", stats_df[scores < 10]),
    ]

    print("\n📊 RLE Compression Potential:")
    for label, subset in tiers:
        print(f"   {label}: {len(subset)} columns")
        if len(subset) > 0:
            print(f"      {', '.join(subset['column_name'].tolist())}")

    print("\n💡 Sorting Recommendation:")
    print("   For optimal RLE compression, consider sorting by:")
    # Presumably get_table_stats returns rows already ordered by RLE
    # potential, so the first rows are the best sort keys -- TODO confirm.
    for rank, col in enumerate(stats_df.head(top_columns)['column_name'].tolist(), 1):
        print(f"   {rank}. {col}")


def test_rle_with_real_parquet() -> bool:
    """Run the RLE analysis against the bundled parquet file and print a report.

    The report covers the parquet schema, the row count, the per-column
    statistics from ``get_table_stats``, value-frequency details for the
    first three columns of that result, and a summary with score tiers.

    Returns:
        True when every step completed; False when the parquet file is
        missing or any step raised (the traceback is printed first).

    NOTE(review): the boolean is what the script entry point turns into an
    exit code. Test runners such as pytest generally do not treat a returned
    False as a failure -- confirm how this function is meant to be collected.
    """
    # The fixture lives next to this file.
    parquet_path = Path(__file__).parent / "part-00000-19052469-6a9d-4faa-86ac-60efce3e4443-c000.snappy.parquet"

    if not parquet_path.exists():
        print(f"❌ Error: Parquet file not found at {parquet_path}")
        return False

    _print_banner("RLE ANALYSIS TEST: Real Parquet Data", leading_blank=False)
    print(f"File: {parquet_path.name}")
    print(f"Size: {parquet_path.stat().st_size:,} bytes")

    # Single source of truth for the numbers echoed in the printed messages.
    detail_columns = 3
    detail_limit = 15

    # Quote the path once; the directory part comes from the environment and
    # may legitimately contain a single quote.
    path_literal = _sql_literal(str(parquet_path))

    con = duckdb.connect(':memory:')

    try:
        _print_banner("SCHEMA INSPECTION")

        schema_df = con.sql(
            f"SELECT * FROM parquet_schema({path_literal})"
        ).df()

        print(f"\nColumns found: {len(schema_df)}")
        print(schema_df.to_string(index=False))

        row_count = con.sql(
            f"SELECT COUNT(*) FROM read_parquet({path_literal})"
        ).fetchone()[0]

        print(f"\nTotal rows: {row_count:,}")

        _print_banner("COMPREHENSIVE RLE ANALYSIS")

        stats_df = get_table_stats(con, str(parquet_path), is_parquet=True, top_n_values=10)

        _print_banner("RESULTS: Columns Ranked by RLE Potential")

        report_columns = ['column_name', 'data_type', 'ndv', 'cardinality_ratio',
                          'top_value_pct', 'top_n_coverage', 'repetition_score']
        print("\n" + stats_df[report_columns].to_string(index=False))

        _print_banner("DETAILED VALUE FREQUENCY ANALYSIS")

        for idx in range(min(detail_columns, len(stats_df))):
            row = stats_df.iloc[idx]
            col_name = row['column_name']
            score = row['repetition_score']

            print(f"\n[{idx+1}] Column: {col_name} (repetition_score: {score})")
            print("-" * 80)

            freq_df = get_value_frequency_details(con, str(parquet_path), col_name,
                                                  is_parquet=True, limit=detail_limit)
            print(freq_df.to_string(index=False))

            if not freq_df.empty:
                print(f"\n✓ Top value appears {freq_df.iloc[0]['percentage']:.2f}% of the time")
                print(f"✓ Top {detail_limit} values cover {freq_df['cumulative_pct'].iloc[-1]:.2f}% of all data")

        _print_summary(stats_df, detail_columns)

        print("\n✅ Test completed successfully!")

        return True

    except Exception as e:
        # Top-level boundary for the script: report the failure with its
        # traceback and signal it through the return value.
        print(f"\n❌ Error during analysis: {e}")
        import traceback
        traceback.print_exc()
        return False

    finally:
        con.close()


if __name__ == "__main__":
    # Script entry point: run the analysis once, print a framed verdict and
    # exit with status 0 on success or 1 on failure.
    rule = "=" * 80

    print(f"\n{rule}")
    print("STARTING RLE ANALYSIS TEST")
    print(rule)

    passed = test_rle_with_real_parquet()

    print(f"\n{rule}")
    print("✅ TEST PASSED" if passed else "❌ TEST FAILED")
    print(rule)

    sys.exit(0 if passed else 1)
