from __future__ import annotations

import argparse
import glob
import json
import os
import pathlib
import sys
from datetime import date, datetime, time

from .core import parse, _guess_meta, parse_folder, parse_folder_unified
from .ai_processor import AIProcessor

class DateTimeEncoder(json.JSONEncoder):
    """JSON encoder that writes date/time values as ISO 8601 strings.

    Handles ``datetime``, ``date`` and ``time`` instances by calling their
    ``isoformat()`` method. Any other unsupported object is passed to the
    base class, which raises ``TypeError``.
    """

    def default(self, obj):
        """Return a JSON-serializable replacement for *obj*.

        Args:
            obj: The object the standard encoder could not serialize.

        Returns:
            The ISO 8601 string for date/time objects.

        Raises:
            TypeError: If *obj* is not a date/time object (raised by the
                base implementation).
        """
        # ``datetime`` is a subclass of ``date``, so it is covered here too.
        if isinstance(obj, (date, time)):
            return obj.isoformat()
        return super().default(obj)

def _build_parser():
    """Build the ``panparsex`` argument parser with its ``parse`` sub-command."""
    ap = argparse.ArgumentParser(prog="panparsex", description="Universal parser for files and websites")
    sub = ap.add_subparsers(dest="cmd", required=True)

    p = sub.add_parser("parse", help="Parse a path, file, or URL")
    p.add_argument("target", help="Path/URL to parse (file, folder, or URL)")
    p.add_argument("--recursive", action="store_true", help="Recurse into folders or follow links")
    p.add_argument("--folder-mode", action="store_true", help="Parse entire folder (alternative to --recursive for folders)")
    p.add_argument("--unified-output", action="store_true", help="Combine all folder files into single document")
    p.add_argument("--file-patterns", nargs="*", help="File patterns to include (e.g., *.pdf *.txt)")
    p.add_argument("--exclude-patterns", nargs="*", help="Patterns to exclude (e.g., *.tmp .git)")
    p.add_argument("--no-progress", action="store_true", help="Disable progress bar for folder parsing")
    p.add_argument("--glob", default="**/*", help="Glob when target is a folder (legacy mode)")
    p.add_argument("--max-links", type=int, default=50, help="Max links/pages when crawling")
    p.add_argument("--max-depth", type=int, default=1, help="Max depth when crawling")
    p.add_argument("--same-origin", action="store_true", help="Restrict crawl to same origin")
    p.add_argument("--pretty", action="store_true", help="Pretty-print JSON")
    p.add_argument("--output", "-o", help="Output file for parsed results (default: print to stdout unless --quiet)")
    p.add_argument("--quiet", "-q", action="store_true", help="Suppress all output to terminal")

    # AI processing options
    p.add_argument("--ai-process", action="store_true", help="Process with AI after parsing")
    p.add_argument("--ai-task", default="analyze and restructure", help="AI task description")
    p.add_argument("--ai-format", default="structured_json", choices=["structured_json", "markdown", "summary"], help="AI output format")
    p.add_argument("--ai-output", help="Output file for AI-processed result")
    p.add_argument("--openai-key", help="OpenAI API key (or set OPENAI_API_KEY env var)")
    p.add_argument("--ai-model", default="gpt-4o-mini", help="OpenAI model to use")
    p.add_argument("--ai-tokens", type=int, default=4000, help="Max tokens for AI response")
    p.add_argument("--ai-temperature", type=float, default=0.3, help="AI temperature (0.0-1.0)")
    p.add_argument("--ai-chunk-size", type=int, help="Override automatic chunk size for large content (in tokens)")

    # Image extraction options (for PDFs)
    p.add_argument("--extract-images", action="store_true", help="Extract images from PDFs")
    p.add_argument("--image-output-dir", help="Directory to save extracted images (default: extracted_images)")
    p.add_argument("--min-image-size", nargs=2, type=int, default=[50, 50], metavar=("WIDTH", "HEIGHT"), help="Minimum image size to extract")

    # Web crawling options
    p.add_argument("--use-selenium", action="store_true", help="Use Selenium for JavaScript-heavy websites")
    p.add_argument("--headless", action="store_true", default=True, help="Run browser in headless mode (Selenium)")
    # ``--headless`` defaults to True, so on its own it could never be turned
    # off; this companion flag writes False to the same destination.
    p.add_argument("--no-headless", dest="headless", action="store_false", help="Run browser with a visible window (Selenium)")
    p.add_argument("--browser-delay", type=float, default=1.0, help="Delay between page loads (Selenium)")
    return ap


def _image_kwargs(args):
    """Return the image-extraction keyword arguments selected on the CLI."""
    if not args.extract_images:
        return {}
    kwargs = {'extract_images': True, 'min_image_size': tuple(args.min_image_size)}
    if args.image_output_dir:
        kwargs['image_output_dir'] = args.image_output_dir
    return kwargs


def _print_summary(summary):
    """Write the folder-parsing summary object's counters and lists to stderr."""
    def err(line=""):
        print(line, file=sys.stderr)

    err("\n📊 Parsing Summary:")
    err(f"   Total files found: {summary.total_files_found}")
    err(f"   Programming files ignored: {summary.programming_files_ignored}")
    err(f"   Files parsed successfully: {summary.files_parsed_successfully}")
    err(f"   Files failed: {summary.files_failed}")
    err(f"   Total sections extracted: {summary.total_sections}")
    err(f"   Total images extracted: {summary.total_images}")

    if summary.file_types_processed:
        err("   File types processed:")
        for ext, count in sorted(summary.file_types_processed.items()):
            err(f"     {ext}: {count} files")

    ignored = summary.programming_files_list
    if ignored:
        # Show at most ten entries, then a count of whatever was left out.
        err("   Programming files ignored:")
        for file_path in ignored[:10]:
            err(f"     {file_path}")
        if len(ignored) > 10:
            err(f"     ... and {len(ignored) - 10} more")

    if summary.failed_files_list:
        err("   Failed files:")
        for file_path, error in summary.failed_files_list:
            err(f"     {file_path}: {error}")
    # Keep the trailing blank line on stderr so stdout carries only JSON.
    err()


def _serialize_docs(docs, pretty):
    """Serialize *docs* to JSON: a lone document as an object, otherwise a list."""
    # An empty result becomes ``[]`` instead of failing on ``docs[0]``.
    payload = docs[0] if len(docs) == 1 else docs
    return json.dumps(payload, indent=2 if pretty else None, ensure_ascii=False, cls=DateTimeEncoder)


def _save_parsed(docs, args, label):
    """Write the parsed documents to ``args.output`` and report it on stderr."""
    # Serialize before opening so a failure cannot truncate an existing file.
    text = _serialize_docs(docs, args.pretty)
    with open(args.output, 'w', encoding='utf-8') as f:
        f.write(text)
    if not args.quiet:
        print(f"{label}: {args.output}", file=sys.stderr)


def _emit_parsed(docs, args):
    """Save the parsed documents to ``--output`` or print them unless ``--quiet``."""
    if args.output:
        _save_parsed(docs, args, "Parsed content saved to")
    elif not args.quiet:
        print(_serialize_docs(docs, args.pretty))


def _default_ai_output(ai_format):
    """Return the default AI result file name for the chosen ``--ai-format``."""
    if ai_format == "structured_json":
        return "ai_processed_result.json"
    if ai_format == "markdown":
        return "ai_processed_result.md"
    return "ai_processed_result.txt"


def main(argv=None):
    """Run the ``panparsex`` command-line interface.

    Parses the target (a folder, a file, or a URL), optionally sends the
    first parsed document to the AI processor, and writes the parsed result
    as JSON to ``--output`` or to stdout. Status messages go to stderr.

    Args:
        argv: Command-line arguments without the program name. ``None``
            means ``sys.argv[1:]``; an explicit empty list is honoured.

    Returns:
        None.

    Raises:
        SystemExit: On argument errors (raised by argparse), or with code 1
            when ``--ai-process`` is requested but nothing was parsed.
    """
    if argv is None:
        argv = sys.argv[1:]
    args = _build_parser().parse_args(argv)

    target = args.target
    pth = pathlib.Path(target)
    docs = []          # JSON-ready dumps of the parsed documents
    parsed_docs = []   # the document objects themselves, for AI processing

    if pth.exists() and pth.is_dir():
        # The new folder parser is used unless a custom legacy glob was given.
        if args.folder_mode or args.glob == "**/*":
            parse_kwargs = {
                'recursive': args.recursive,
                'show_progress': not args.no_progress,
                'file_patterns': args.file_patterns,
                'exclude_patterns': args.exclude_patterns,
            }
            parse_kwargs.update(_image_kwargs(args))

            if args.unified_output:
                # Whole folder combined into a single document.
                d, summary = parse_folder_unified(target, **parse_kwargs)
                parsed_docs.append(d)
                docs.append(d.model_dump())
            else:
                # One document per parsed file.
                folder_docs, summary = parse_folder(target, **parse_kwargs)
                parsed_docs.extend(folder_docs)
                docs.extend([doc.model_dump() for doc in folder_docs])

                total_images = sum(len(doc.images) for doc in folder_docs if hasattr(doc, 'images'))
                if args.extract_images and total_images > 0 and not args.quiet:
                    print(f"Extracted {total_images} images from {len(folder_docs)} files", file=sys.stderr)
                    if args.image_output_dir:
                        print(f"Images saved to: {args.image_output_dir}", file=sys.stderr)

            if not args.quiet:
                _print_summary(summary)
        else:
            # Legacy glob-based parsing: every matching file is parsed on its own.
            parse_kwargs = {'recursive': args.recursive}
            parse_kwargs.update(_image_kwargs(args))
            for fn in glob.glob(str(pth / args.glob), recursive=True):
                fp = pathlib.Path(fn)
                if fp.is_file():
                    d = parse(str(fp), **parse_kwargs)
                    parsed_docs.append(d)
                    docs.append(d.model_dump())
    else:
        if args.use_selenium and target.startswith(('http://', 'https://')):
            # Imported lazily so the Selenium parser is only loaded on request.
            from .parsers.web_selenium import SeleniumWebParser
            selenium_parser = SeleniumWebParser()
            meta = _guess_meta(target, url=target)
            d = selenium_parser.parse(
                target,
                meta,
                recursive=args.recursive,
                max_links=args.max_links,
                max_depth=args.max_depth,
                same_origin=args.same_origin,
                delay=args.browser_delay,
                headless=args.headless,
            )
        else:
            parse_kwargs = {
                'recursive': args.recursive,
                'max_links': args.max_links,
                'max_depth': args.max_depth,
                'same_origin': args.same_origin,
            }
            parse_kwargs.update(_image_kwargs(args))
            d = parse(target, **parse_kwargs)
        parsed_docs.append(d)
        docs.append(d.model_dump())

        if args.extract_images and hasattr(d, 'images') and d.images and not args.quiet:
            print(f"Extracted {len(d.images)} images from PDF", file=sys.stderr)
            if args.image_output_dir:
                print(f"Images saved to: {args.image_output_dir}", file=sys.stderr)

    if not args.ai_process:
        _emit_parsed(docs, args)
        return

    try:
        # Only the first parsed document is sent to the AI processor.
        main_doc = parsed_docs[0] if parsed_docs else None
        if not main_doc:
            if not args.quiet:
                print("No documents to process with AI", file=sys.stderr)
            sys.exit(1)

        output_file = args.ai_output or _default_ai_output(args.ai_format)

        api_key = args.openai_key or os.getenv("OPENAI_API_KEY")
        if not api_key:
            # Without a key the AI step is skipped, but the parse result is still delivered.
            if not args.quiet:
                print("Warning: No OpenAI API key found. Set OPENAI_API_KEY environment variable or use --openai-key", file=sys.stderr)
                print("Skipping AI processing and saving parsed content only.", file=sys.stderr)
            _emit_parsed(docs, args)
            return

        processor = AIProcessor(
            api_key=api_key,
            model=args.ai_model
        )

        if not args.quiet:
            print(f"Processing with AI (model: {args.ai_model})...", file=sys.stderr)
        result = processor.process_and_save(
            main_doc,
            output_file,
            task=args.ai_task,
            output_format=args.ai_format,
            max_tokens=args.ai_tokens,
            temperature=args.ai_temperature,
            chunk_size=args.ai_chunk_size
        )

        if not args.quiet:
            print(f"AI processing complete. Result saved to: {output_file}", file=sys.stderr)

        if args.output:
            _save_parsed(docs, args, "Original parsed content saved to")

        if args.pretty and not args.quiet:
            # With --pretty both the AI result and the original parse are echoed to stdout.
            print("\n=== AI Processed Result ===")
            if args.ai_format == "structured_json" and "raw_response" not in result:
                print(json.dumps(result, indent=2, ensure_ascii=False, cls=DateTimeEncoder))
            else:
                content = result.get("content", result.get("raw_response", str(result)))
                print(content)
            print("\n=== Original Parsed Content ===")
            print(_serialize_docs(docs, True))
        elif not args.quiet and not args.output:
            print(_serialize_docs(docs, False))

    except Exception as e:
        # Top-level boundary: any AI failure degrades to delivering the plain parse result.
        if not args.quiet:
            print(f"AI processing failed: {e}", file=sys.stderr)
            print("Falling back to original parsing result...", file=sys.stderr)
        _emit_parsed(docs, args)

# Run the CLI when this module is executed as a script.
if __name__ == "__main__":
    main()
