#!/usr/bin/env python3
"""
StemSage: Uncovering Stem-loop Motifs from RBP Binding Regions
Version: 0.8.6
Author: Zixiang Wang, Lei Yang
Organization: Shandong University
"""

import os
import sys
import argparse
from pathlib import Path

# Put this script's own directory first on sys.path so the `stemsage` import below resolves
sys.path.insert(0, str(Path(__file__).parent))

from stemsage.pipeline import StemSagePipeline

# Script metadata; __version__ is what the --version flag prints.
__version__ = "0.8.6"
# One tuple entry per author.
__author__ = ("Zixiang Wang", "Lei Yang")
__email__ = "wangzixiang@sdu.edu.cn"

def _parse_bool_flag(value):
    """Convert a command-line token into a bool for ``--similar_matching``.

    Accepts (case-insensitively) ``true/1/yes/y/t`` as True and
    ``false/0/no/n/f`` as False.

    Raises:
        argparse.ArgumentTypeError: for any other token, so that a typo is
            reported by argparse instead of silently turning the feature off.
    """
    token = str(value).strip().lower()
    if token in ('true', '1', 'yes', 'y', 't'):
        return True
    if token in ('false', '0', 'no', 'n', 'f'):
        return False
    raise argparse.ArgumentTypeError(
        f"expected a boolean (true/false, yes/no, 1/0), got {value!r}"
    )


def main():
    """Command-line entry point: parse arguments, validate them, run the pipeline.

    Reads its arguments from ``sys.argv``. After validation the output
    directory is created (if missing) and the parsed namespace is handed to
    ``StemSagePipeline``.

    Raises:
        SystemExit: always. Exit code 0 when ``pipeline.run()`` returns a
            truthy value, 1 otherwise, and 2 (argparse's convention) when the
            arguments are invalid.
    """
    BANNER = r"""
     _____ _                  _____                  
    / ____| |                / ____|                 
   | (___ | |_ ___ _ __ ___ | (___   __ _  __ _  ___ 
    \___ \| __/ _ \ '_ ` _ \ \___ \ / _` |/ _` |/ _ \
    ____) | ||  __/ | | | | |____) | (_| | (_| |  __/
   |_____/ \__\___|_| |_| |_|_____/ \__,_|\__, |\___|
                                         __/ |     
                                        |___/      

StemSage: Uncovering Stem-loop Motifs from RBP Binding Regions

"Forever Young. Forever Passionate. Forever Humbled."

Please cite: https://github.com/PrinceWang2018/stemsage
    """

    parser = argparse.ArgumentParser(
        description=BANNER,
        # Raw formatter keeps the ASCII banner and the examples laid out as written.
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
EXAMPLES:
  # Case 1: Only positive BED file
  StemSage --positive_bed peaks.bed --genome_fa hg38.fa --extend 25 --out ./results
  
  # Case 2: Positive and negative BED files  
  StemSage --positive_bed positive.bed --negative_bed negative.bed --genome_fa hg38.fa --out ./results
  
  # Case 3: Only positive FASTA file
  StemSage --positive_fasta positive.fasta --genome_fa hg38.fa --out ./results
  
  # Case 4: Positive and negative FASTA files
  StemSage --positive_fasta positive.fasta --negative_fasta negative.fasta --out ./results
  
  # Case 5: Use Random Forest model instead of XGBoost
  StemSage --positive_bed peaks.bed --genome_fa hg38.fa --model random_forest --out ./results
        """
    )

    # Input files
    input_group = parser.add_argument_group('INPUT OPTIONS')
    input_group.add_argument('--positive_bed',
                             help='BED file of positive (RBP-binding) regions')
    input_group.add_argument('--negative_bed',
                             help='BED file of negative (control) regions')
    input_group.add_argument('--positive_fasta',
                             help='FASTA file of positive sequences')
    input_group.add_argument('--negative_fasta',
                             help='FASTA file of negative sequences')

    # Essential parameters
    required_group = parser.add_argument_group('REQUIRED ARGUMENTS')
    # Not marked required=True: the documented "Case 4" (both FASTA inputs)
    # omits the genome, so the requirement is enforced conditionally below.
    required_group.add_argument('--genome_fa',
                                help='Genome reference FASTA file (recommend to use gene transcript regions); '
                                     'required unless both --positive_fasta and --negative_fasta are given')
    required_group.add_argument('--out', required=True,
                                help='Output directory for all results and temporary files')

    # Optional parameters
    optional_group = parser.add_argument_group('OPTIONAL ARGUMENTS')
    optional_group.add_argument('--extend', type=int, default=50,
                                help='Base pairs to extend BED regions on both ends (default: 50)')
    optional_group.add_argument('--threads', type=int, default=1,
                                help='Number of threads for parallel processing (default: 1)')
    optional_group.add_argument('--model', choices=['xgboost', 'random_forest'], default='xgboost',
                                help='Machine learning model to use for classification (default: xgboost)')
    optional_group.add_argument('--test_size', type=float, default=0.2,
                                help='Test set size ratio for model evaluation (default: 0.2)')
    optional_group.add_argument('--random_state', type=int, default=42,
                                help='Random state for reproducibility (default: 42)')
    optional_group.add_argument('--cv_folds', type=int, default=5,
                                help='Number of cross-validation folds (default: 5)')
    optional_group.add_argument('--verbose', action='store_true',
                                help='Enable verbose logging output')
    optional_group.add_argument('--keep_temp', action='store_true',
                                help='Keep temporary files after processing')
    optional_group.add_argument('--version', '-v', action='version',
                                version=f'StemSage {__version__}',
                                help='Show version information and exit')
    optional_group.add_argument('--max_motifs', type=int, default=5,
                                help='Maximum number of motifs to display in visualization (default: 5)')
    optional_group.add_argument('--min_stem_length', type=int, default=1,
                                help='Minimum stem length for stem-loop detection (default: 1)')
    optional_group.add_argument('--max_stem_length', type=int, default=5,
                                help='Maximum stem length before splitting (default: 5, 0=disable splitting)')
    # Strict boolean parsing: an unrecognised token is an error, not False.
    optional_group.add_argument('--similar_matching',
                                type=_parse_bool_flag,
                                default=True,
                                help='Enable similar sequence matching for motifs (default: True)')
    optional_group.add_argument('--similarity_threshold', type=float, default=0.9,
                                help='Similarity threshold for motif matching (default: 0.9)')
    optional_group.add_argument('--max_length_diff', type=int, default=1,
                                help='Maximum length difference for motif matching (default: 1)')

    args = parser.parse_args()

    # Input validation
    if not any([args.positive_bed, args.positive_fasta]):
        parser.error("At least one of --positive_bed or --positive_fasta is required")

    if args.positive_bed and args.positive_fasta:
        parser.error("Cannot specify both --positive_bed and --positive_fasta")

    # The genome may be omitted only in the all-FASTA case shown in the
    # examples; any BED input, or a missing negative FASTA, still demands it.
    # NOTE(review): assumes the pipeline does not read genome_fa in that case,
    # as the documented Case 4 implies — confirm against StemSagePipeline.
    genome_needed = bool(args.positive_bed or args.negative_bed or not args.negative_fasta)
    if genome_needed and not args.genome_fa:
        parser.error("the following arguments are required: --genome_fa "
                     "(unless both --positive_fasta and --negative_fasta are provided)")

    if not (0 < args.test_size < 1):
        parser.error("--test_size must be between 0 and 1")

    if args.cv_folds < 2:
        parser.error("--cv_folds must be at least 2")

    # Create output folder
    os.makedirs(args.out, exist_ok=True)

    # Run pipeline
    pipeline = StemSagePipeline(args)
    success = pipeline.run()

    sys.exit(0 if success else 1)

# Run the CLI only when this file is executed as a script, not when it is imported.
if __name__ == "__main__":
    main()
