"""Factoid extraction and management for pantsonfire v2.0"""

import re
from typing import List, Dict, Any, Optional
from datetime import datetime
from pydantic import BaseModel, Field
from .llm import LLMClient
from .config import Config


class Factoid(BaseModel):
    """Represents an atomic factual claim extracted from content"""

    # ---- Identity and provenance ------------------------------------
    # factoid_id, claim_text, source_url and domain have no default and
    # must be supplied by the caller; every other field is optional.
    factoid_id: str = Field(
        description="Unique identifier for this factoid",
    )
    claim_text: str = Field(
        description="The actual factual claim",
    )
    source_url: str = Field(
        description="URL where this was found",
    )
    extracted_at: datetime = Field(
        description="When we extracted this",
        default_factory=datetime.now,  # evaluated per instance, naive local time
    )
    published_at: Optional[datetime] = Field(
        description="When source was published",
        default=None,
    )
    content_type: str = Field(
        description="blog, docs, api, readme, changelog",
        default="unknown",
    )
    domain: str = Field(
        description="Domain of the source",
    )
    trust_weight: float = Field(
        description="Trust weight based on source type (0-1)",
        default=0.5,
    )

    # ---- Truth assessment -------------------------------------------
    # The 0-1 ranges named in the descriptions are documentation only;
    # no validator on this model enforces them.
    truth_score: float = Field(
        description="LLM-assessed likelihood this is true (0-1)",
        default=0.5,
    )
    truth_sources: List[str] = Field(
        description="URLs that corroborate this claim",
        default_factory=list,
    )
    conflict_sources: List[str] = Field(
        description="URLs that contradict this",
        default_factory=list,
    )
    verification_method: str = Field(
        description="llm, web_search, consensus, manual",
        default="none",
    )
    last_verified: Optional[datetime] = Field(
        description="When truth was last verified",
        default=None,
    )

    # ---- Categorization ---------------------------------------------
    category: str = Field(
        description="api, feature, version, access_program, etc.",
        default="general",
    )
    keywords: List[str] = Field(
        description="Searchable keywords",
        default_factory=list,
    )
    confidence: float = Field(
        description="Overall confidence in extraction (0-1)",
        default=0.5,
    )

    # ---- Context ----------------------------------------------------
    surrounding_text: str = Field(
        description="Context around the claim",
        default="",
    )
    section_title: Optional[str] = Field(
        description="Section/heading this appears under",
        default=None,
    )


class FactoidExtractor:
    """Extract atomic factual claims from content using LLM

    Content is split into chunks (paragraphs, sentences or markdown
    sections). Each chunk is sent to the LLM client; whenever the client is
    missing, errors out, returns nothing, or returns something that cannot
    be turned into factoids, a regex-based fallback is used for that chunk.
    """

    # Content type trust weights
    TRUST_WEIGHTS = {
        "api": 0.95,
        "docs": 0.90,
        "readme": 0.85,
        "changelog": 0.80,
        "tutorial": 0.70,
        "blog": 0.60,
        "announcement": 0.55,
        "unknown": 0.50
    }

    # Categories for factoid classification
    CATEGORIES = [
        "api_endpoint",
        "version_requirement",
        "access_program",
        "feature_availability",
        "deprecation",
        "installation",
        "configuration",
        "performance",
        "pricing",
        "support"
    ]

    # (compiled pattern, category) pairs for the regex fallback. Compiled
    # once at class creation instead of on every chunk.
    _FALLBACK_PATTERNS = [
        (re.compile(r'(?:requires?|needs?) (?:python|node|ruby|java|go|rust) (?:version )?(\d+\.\d+)', re.IGNORECASE), 'version_requirement'),
        (re.compile(r'(?:API|endpoint|route) (?:is|at) ([\/\w\-]+)', re.IGNORECASE), 'api_endpoint'),
        (re.compile(r'(?:early access|get early access|beta access|alpha|preview|coming soon)', re.IGNORECASE), 'access_program'),
        (re.compile(r'(?:deprecated|legacy|obsolete|no longer supported)', re.IGNORECASE), 'deprecation'),
        (re.compile(r'(?:available|released|launched) (?:in|since) (\d{4})', re.IGNORECASE), 'feature_availability'),
        (re.compile(r'(?:sign up|signup|register|join|apply) (?:for|to get)', re.IGNORECASE), 'access_program'),
        (re.compile(r'(?:fine-tun(?:e|ing)|train(?:ing)?|model)', re.IGNORECASE), 'feature_availability'),
        (re.compile(r'v\d+(?:\.\d+){1,2}', re.IGNORECASE), 'version_requirement'),
        (re.compile(r'(?:install|setup|configure)', re.IGNORECASE), 'installation'),
    ]

    def __init__(self, llm_client: LLMClient, config: Config, verbose: bool = False):
        """
        Store the collaborators used during extraction.

        Args:
            llm_client: Wrapper whose ``client`` attribute is used for chat
                completions; a falsy ``client`` selects the regex fallback
            config: Configuration object; ``config.model`` names the model
            verbose: When True, print request/response diagnostics
        """
        self.llm = llm_client
        self.config = config
        self.verbose = verbose

    def extract_factoids(
        self,
        content: str,
        source_url: str,
        content_type: str = "unknown",
        granularity: str = "paragraph"
    ) -> List[Factoid]:
        """
        Extract atomic factual claims from content.

        Args:
            content: The text content to analyze
            source_url: URL of the source
            content_type: Type of content (blog, docs, api, etc.)
            granularity: Level of extraction (sentence, paragraph, section).
                Any value other than "paragraph" or "sentence" is treated
                as "section".

        Returns:
            List of extracted factoids (empty when content is empty)
        """
        # Nothing to split; also keeps the regex splitters from being
        # handed None.
        if not content:
            return []

        if granularity == "paragraph":
            chunks = self._split_into_paragraphs(content)
        elif granularity == "sentence":
            chunks = self._split_into_sentences(content)
        else:  # section
            chunks = self._split_into_sections(content)

        factoids = []

        for i, chunk in enumerate(chunks):
            # Skip very short chunks
            if len(chunk.strip()) < 20:
                continue

            factoids.extend(
                self._extract_from_chunk(
                    chunk=chunk,
                    source_url=source_url,
                    content_type=content_type,
                    chunk_index=i
                )
            )

        return factoids

    def _extract_from_chunk(
        self,
        chunk: str,
        source_url: str,
        content_type: str,
        chunk_index: int
    ) -> List[Factoid]:
        """
        Extract factoids from a single chunk using LLM.

        Falls back to pattern-based extraction when no client is available,
        when the request raises, when the response is empty, or when the
        response yields no factoids.

        Args:
            chunk: Text of the chunk
            source_url: URL of the source
            content_type: Type of content
            chunk_index: Position of the chunk in the content (accepted for
                interface compatibility; not used by this method)

        Returns:
            Factoids from the LLM, or from the regex fallback
        """
        # getattr tolerates a missing wrapper as well as a missing client.
        client = getattr(self.llm, "client", None)
        if not client:
            # Fallback to pattern-based extraction
            return self._pattern_based_extraction(chunk, source_url, content_type)

        prompt = self._build_extraction_prompt(chunk, content_type)

        try:
            # Imported here so that a failing import is handled like any
            # other LLM-path failure (regex fallback) instead of breaking
            # module import.
            from .models_config import supports_json_format
            supports_json = supports_json_format(self.config.model)

            request_params = {
                "model": self.config.model,
                "messages": [
                    {
                        "role": "system",
                        "content": "You are a technical documentation analyst. Extract verifiable factual claims from technical content. Focus on APIs, versions, features, access programs, and technical requirements. Ignore opinions and subjective statements."
                    },
                    {
                        "role": "user",
                        "content": prompt
                    }
                ],
                "temperature": 0.1,
                "max_tokens": 2000
            }

            # Only add response_format for models that support it
            if supports_json:
                request_params["response_format"] = {"type": "json_object"}

            if self.verbose:
                print(f"\n🔍 LLM Request:")
                print(f"  Model: {self.config.model}")
                print(f"  Supports JSON: {supports_json}")
                print(f"  Chunk length: {len(chunk)} chars")
                print(f"  Prompt preview: {prompt[:200]}...")

            response = client.chat.completions.create(**request_params)
            result = response.choices[0].message.content

            if self.verbose:
                print(f"\n📥 LLM Response:")
                print(f"  Status: {'Success' if result else 'Empty'}")
                print(f"  Length: {len(result) if result else 0} chars")
                if result:
                    print(f"  Preview: {result[:200]}...")
                else:
                    print(f"  Full response object: {response}")

            if not result or not result.strip():
                print(f"⚠️  Empty response from {self.config.model}, using pattern extraction")
                return self._pattern_based_extraction(chunk, source_url, content_type)

            parsed = self._parse_factoid_response(result, source_url, content_type, chunk)

            # If parsing failed or returned nothing, use pattern matching
            if not parsed:
                if self.verbose:
                    print(f"  ⚠️  Parsing returned empty list for {len(result)} char response")
                else:
                    print(f"⚠️  LLM extraction returned no factoids (model: {self.config.model}), using pattern extraction")
                return self._pattern_based_extraction(chunk, source_url, content_type)

            return parsed

        except Exception as e:
            # Deliberate best-effort boundary: any failure on the LLM path
            # degrades to the regex fallback rather than aborting the run.
            print(f"⚠️  LLM error with {self.config.model}: {str(e)[:100]}")
            return self._pattern_based_extraction(chunk, source_url, content_type)

    def _build_extraction_prompt(self, chunk: str, content_type: str) -> str:
        """Build the prompt for factoid extraction"""

        return f"""
Extract all verifiable factual claims from this {content_type} content. Focus on technical facts only.

CONTENT:
{chunk}

INSTRUCTIONS:
- Extract ANY mention of technical concepts, features, tools, or capabilities
- Include: product names, technology mentions, feature descriptions, processes, tools
- For marketing/blog content: extract what products/features are being discussed
- Even vague claims like "we help with X" or "supports Y" are valuable
- Be LIBERAL in extraction - we'll verify truth later
- Categories: api_endpoint, version_requirement, access_program, feature_availability, deprecation, etc.

Respond with a JSON object in this format:
{{
    "factoids": [
        {{
            "claim_text": "specific mention or claim",
            "category": "feature_availability",
            "keywords": ["keyword1", "keyword2"],
            "confidence": 0.0-1.0,
            "section_title": "optional heading"
        }}
    ]
}}

IMPORTANT: Be generous - extract everything that could be verified. Return EMPTY array only if there's truly nothing technical.
"""

    @staticmethod
    def _load_json_payload(text: str) -> Any:
        """
        Decode JSON from an LLM reply, tolerating fences or prose around it.

        Tries the stripped text as-is first; if that fails, retries on the
        span from the first "{" to the last "}".

        Raises:
            ValueError: If neither attempt yields valid JSON
        """
        import json

        cleaned = text.strip()
        try:
            return json.loads(cleaned)
        except ValueError:
            start = cleaned.find("{")
            end = cleaned.rfind("}")
            if start == -1 or end <= start:
                raise
            return json.loads(cleaned[start:end + 1])

    @staticmethod
    def _coerce_confidence(raw: Any, default: float = 0.5) -> float:
        """Turn an LLM-supplied confidence into a float clamped to [0, 1]."""
        try:
            value = float(raw)
        except (TypeError, ValueError):
            return default
        if value != value:  # NaN compares unequal to itself
            return default
        return min(1.0, max(0.0, value))

    def _build_factoid(
        self,
        claim_text: str,
        source_url: str,
        content_type: str,
        chunk: str,
        category: str,
        confidence: float,
        keywords: Optional[List[str]] = None,
        section_title: Optional[str] = None
    ) -> Factoid:
        """Create a Factoid with the id, domain, trust weight and context
        fields filled in the same way for both extraction paths."""
        import uuid
        from urllib.parse import urlparse

        return Factoid(
            factoid_id=str(uuid.uuid4())[:8],
            claim_text=claim_text,
            source_url=source_url,
            extracted_at=datetime.now(),
            content_type=content_type,
            domain=urlparse(source_url).netloc,
            trust_weight=self.TRUST_WEIGHTS.get(content_type, 0.5),
            category=category,
            keywords=list(keywords) if keywords else [],
            confidence=confidence,
            surrounding_text=chunk[:500],  # Store context
            section_title=section_title
        )

    def _parse_factoid_response(
        self,
        response: str,
        source_url: str,
        content_type: str,
        chunk: str
    ) -> List[Factoid]:
        """
        Parse LLM response into Factoid objects.

        Never raises: undecodable responses give an empty list, and a
        malformed item is skipped without discarding the others.

        Args:
            response: Raw text returned by the LLM
            source_url: URL of the source
            content_type: Type of content
            chunk: Chunk the response was generated from (first 500 chars
                are stored as context)

        Returns:
            List of factoids built from the response (possibly empty)
        """
        # Handle empty or whitespace-only responses
        if not response or not response.strip():
            if self.verbose:
                print(f"  ⚠️  Empty response received")
            return []

        if self.verbose:
            print(f"  🔄 Parsing JSON response ({len(response)} chars)...")

        try:
            data = self._load_json_payload(response)
        except ValueError:
            # Don't print error, just return empty - fallback will handle it
            return []

        if self.verbose:
            print(f"  ✅ JSON parsed successfully")

        # Accept the documented {"factoids": [...]} object, and also a bare
        # top-level array of items.
        if isinstance(data, dict):
            factoids_list = data.get("factoids", [])
        else:
            factoids_list = data
        if not isinstance(factoids_list, list):
            return []

        if self.verbose:
            print(f"  📊 LLM JSON contains {len(factoids_list)} factoid items")

        factoids = []

        for item in factoids_list:
            # One stray string/number must not cost the whole response.
            if not isinstance(item, dict):
                continue

            # Skip items without claim_text
            claim_text = item.get("claim_text")
            if not isinstance(claim_text, str) or not claim_text.strip():
                continue

            category = item.get("category")
            if not isinstance(category, str) or not category.strip():
                category = "general"

            raw_keywords = item.get("keywords")
            if isinstance(raw_keywords, str):
                keywords = [raw_keywords]
            elif isinstance(raw_keywords, (list, tuple)):
                keywords = [str(k) for k in raw_keywords if k is not None]
            else:
                keywords = []  # covers JSON null and other shapes

            section_title = item.get("section_title")
            if section_title is not None and not isinstance(section_title, str):
                section_title = str(section_title)

            try:
                factoids.append(
                    self._build_factoid(
                        claim_text=claim_text,
                        source_url=source_url,
                        content_type=content_type,
                        chunk=chunk,
                        category=category,
                        confidence=self._coerce_confidence(item.get("confidence", 0.5)),
                        keywords=keywords,
                        section_title=section_title
                    )
                )
            except Exception as e:
                # Model validation (or bad source_url) for this one item;
                # keep going with the rest.
                if self.verbose:
                    print(f"  ⚠️  Failed to create factoid: {e}")
                continue

        if self.verbose:
            print(f"  ✅ Successfully created {len(factoids)} Factoid objects")

        return factoids

    def _pattern_based_extraction(
        self,
        chunk: str,
        source_url: str,
        content_type: str
    ) -> List[Factoid]:
        """
        Fallback: Extract factoids using pattern matching.

        Each regex match becomes one factoid whose claim text is the matched
        span, with a fixed confidence of 0.6. Results are ordered by pattern,
        then by position in the chunk.
        """
        factoids = []

        for pattern, category in self._FALLBACK_PATTERNS:
            for match in pattern.finditer(chunk):
                factoids.append(
                    self._build_factoid(
                        claim_text=match.group(0),
                        source_url=source_url,
                        content_type=content_type,
                        chunk=chunk,
                        category=category,
                        confidence=0.6  # Lower confidence for pattern matching
                    )
                )

        return factoids

    def _split_into_paragraphs(self, content: str) -> List[str]:
        """Split content into paragraphs (separated by blank lines)"""
        paragraphs = re.split(r'\n\s*\n', content)
        return [p.strip() for p in paragraphs if p.strip()]

    def _split_into_sentences(self, content: str) -> List[str]:
        """Split content into sentences, keeping only those longer than 20 chars"""
        # Simple sentence splitting (can be enhanced with NLTK)
        sentences = re.split(r'[.!?]+\s+', content)
        return [s.strip() for s in sentences if s.strip() and len(s.strip()) > 20]

    def _split_into_sections(self, content: str) -> List[str]:
        """Split content into sections (by markdown headings)"""
        # The pattern needs a preceding newline, so a heading on the very
        # first line stays attached to the first section.
        sections = re.split(r'\n#{1,6}\s+', content)
        return [s.strip() for s in sections if s.strip()]


class TimestampDetector:
    """Detect publication and update timestamps in content"""

    # Common date patterns in blogs/docs, tried in this order. Patterns with
    # a capture group yield the group; the others yield the whole match.
    DATE_PATTERNS = [
        r'(?:published|posted|updated|last modified)[:\s]+(\d{4}-\d{2}-\d{2})',
        r'(\d{4}-\d{2}-\d{2})',
        r'(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]*\s+\d{1,2},?\s+\d{4}',
        r'\d{1,2}/\d{1,2}/\d{4}',
        r'Q[1-4]\s+\d{4}',
    ]

    def detect_timestamp(self, content: str, html_meta: Optional[Dict[str, str]] = None) -> Optional[datetime]:
        """
        Detect the publication or last-update timestamp.

        HTML metadata is consulted first; its values are parsed as ISO 8601
        and may therefore be timezone-aware. Content patterns are tried
        next, moving on to the next pattern whenever a match cannot be
        parsed.

        Args:
            content: The text content
            html_meta: Optional HTML metadata dict

        Returns:
            Detected datetime or None
        """
        # First check HTML meta tags if available
        if html_meta:
            for key in ['published_time', 'modified_time', 'article:published_time']:
                value = html_meta.get(key)
                if not isinstance(value, str):
                    continue
                try:
                    # fromisoformat on older Pythons rejects a trailing "Z"
                    return datetime.fromisoformat(value.strip().replace('Z', '+00:00'))
                except ValueError:
                    continue

        if not content:
            return None

        # Then try content patterns
        for pattern in self.DATE_PATTERNS:
            match = re.search(pattern, content, re.IGNORECASE)
            if not match:
                continue
            date_str = match.group(1) if match.groups() else match.group(0)
            parsed = self._parse_date_string(date_str)
            # An unparseable match must not end the search: later patterns
            # may still find a usable date.
            if parsed is not None:
                return parsed

        return None

    def _parse_date_string(self, date_str: str) -> Optional[datetime]:
        """
        Parse various date string formats.

        Handles quarter notation ("Q1 2024" -> first day of the quarter),
        then python-dateutil when it is importable, then a small set of
        stdlib formats.

        Args:
            date_str: Candidate date text

        Returns:
            Parsed datetime, or None when the text cannot be parsed
        """
        if not isinstance(date_str, str) or not date_str.strip():
            return None
        text = date_str.strip()

        # Handle quarter format (Q1 2024). Case-insensitive because the
        # caller's search is case-insensitive too.
        quarter_match = re.match(r'Q([1-4])\s+(\d{4})$', text, re.IGNORECASE)
        if quarter_match:
            quarter = int(quarter_match.group(1))
            year = int(quarter_match.group(2))
            month = (quarter - 1) * 3 + 1
            try:
                return datetime(year, month, 1)
            except ValueError:  # e.g. year 0000
                return None

        try:
            from dateutil import parser
        except ImportError:
            parser = None

        if parser is not None:
            try:
                return parser.parse(text)
            except (ValueError, OverflowError, TypeError):
                pass  # fall through to the stdlib formats

        # Best-effort stdlib parsing for the shapes DATE_PATTERNS can match.
        for fmt in ('%Y-%m-%d', '%B %d, %Y', '%B %d %Y', '%b %d, %Y', '%b %d %Y', '%m/%d/%Y'):
            try:
                return datetime.strptime(text, fmt)
            except ValueError:
                continue

        return None

    def calculate_staleness(self, published_at: datetime) -> float:
        """
        Calculate staleness score (0 = very stale, 1 = fresh).

        Args:
            published_at: Publication timestamp (naive or timezone-aware)

        Returns:
            Staleness score (0-1); 0.5 when the timestamp is missing
        """
        if not published_at:
            return 0.5  # Unknown age

        # Build "now" in the timestamp's own timezone (naive when tzinfo is
        # None); subtracting a naive and an aware datetime raises TypeError.
        now = datetime.now(published_at.tzinfo)
        days_old = (now - published_at).days

        # Decay over 2 years
        if days_old > 730:
            return 0.0  # Very stale
        if days_old > 365:
            return 0.3  # Stale
        if days_old > 180:
            return 0.6  # Moderately fresh
        if days_old > 90:
            return 0.8  # Fresh
        return 1.0  # Very fresh


class ContentTypeClassifier:
    """Classify content type and assign trust weights"""

    # URL patterns for content type detection. Types are checked in the
    # order listed here, so an earlier type wins when several match.
    CONTENT_TYPE_PATTERNS = {
        "api": [r'/api(?:/|$)', r'/reference(?:/|$)', r'openapi', r'swagger'],
        "docs": [r'/docs(?:/|$)', r'/documentation(?:/|$)', r'readthedocs'],
        "readme": [r'readme\.md', r'/blob/main/README'],
        "changelog": [r'changelog', r'release(?:-notes)?', r'whats-new'],
        "tutorial": [r'/tutorial(?:s)?(?:/|$)', r'/guide(?:s)?(?:/|$)', r'/examples?(?:/|$)'],
        "blog": [r'/blog(?:/|$)', r'/post(?:s)?(?:/|$)', r'/article(?:s)?(?:/|$)'],
        "announcement": [r'/announcement(?:s)?', r'/news(?:/|$)', r'/entry(?:/|$)'],
    }

    def classify(self, url: str, content: str) -> str:
        """
        Classify content type based on URL and content analysis.

        Args:
            url: Source URL
            content: Content text

        Returns:
            Content type string
        """
        # Check URL patterns first. Matching is case-insensitive on the raw
        # URL: lower-casing the URL and matching case-sensitively made the
        # mixed-case '/blob/main/README' pattern impossible to hit.
        for content_type, patterns in self.CONTENT_TYPE_PATTERNS.items():
            for pattern in patterns:
                if re.search(pattern, url, re.IGNORECASE):
                    return content_type

        # Fallback: analyze content
        return self._classify_by_content(content)

    def _classify_by_content(self, content: str) -> str:
        """
        Classify based on content patterns.

        Checks run in a fixed order (api, tutorial, changelog, readme) and
        the first hit wins; "blog" is returned when nothing matches,
        including for empty content.
        """
        content = content or ""
        content_lower = content.lower()

        # Check for API documentation indicators
        if re.search(r'(?:endpoint|curl|request|response|http method)', content_lower):
            return "api"

        # Check for tutorial indicators
        if re.search(r'(?:step \d|tutorial|getting started|quick start)', content_lower):
            return "tutorial"

        # Check for changelog indicators
        if re.search(r'(?:version \d+\.\d+|released|fixed|added|deprecated)', content_lower):
            return "changelog"

        # Check for README indicators (only for shorter documents)
        if re.search(r'(?:installation|features|usage|contributing)', content_lower) and len(content) < 5000:
            return "readme"

        # Default to blog
        return "blog"

    def get_trust_weight(self, content_type: str) -> float:
        """Get trust weight for a content type (0.5 for unrecognised types)"""
        return FactoidExtractor.TRUST_WEIGHTS.get(content_type, 0.5)


class FactoidDeduplicator:
    """Deduplicate similar factoids across sources"""

    def __init__(self, similarity_threshold: float = 0.85):
        """
        Args:
            similarity_threshold: Minimum word-overlap ratio (0-1) for two
                claims to count as duplicates
        """
        self.similarity_threshold = similarity_threshold

    def deduplicate(self, factoids: List[Factoid]) -> List[Factoid]:
        """
        Remove duplicate factoids, keeping the highest-trust version.

        Each not-yet-grouped factoid seeds a group and collects every later
        factoid similar to that seed. The member with the highest
        ``trust_weight * confidence`` survives, and its ``truth_sources`` is
        updated in place to the ordered, duplicate-free union of the group's
        source URLs and already-recorded truth sources.

        Args:
            factoids: List of factoids to deduplicate

        Returns:
            Deduplicated list, in order of each group's first appearance
        """
        if not factoids:
            return []

        # Group by similar claims
        groups = []
        assigned = set()

        for i, seed in enumerate(factoids):
            if i in assigned:
                continue

            group = [seed]
            assigned.add(i)

            # Find similar factoids among the ones that follow
            for j in range(i + 1, len(factoids)):
                if j in assigned:
                    continue
                if self._are_similar(seed.claim_text, factoids[j].claim_text):
                    group.append(factoids[j])
                    assigned.add(j)

            groups.append(group)

        # For each group, keep the highest-trust factoid
        deduplicated = []
        for group in groups:
            best = max(group, key=lambda f: f.trust_weight * f.confidence)

            # A dict keeps first-seen order, so the merged list is stable
            # across runs (a set of str is not), and sources recorded
            # earlier are preserved rather than overwritten.
            merged = {}
            for member in group:
                merged[member.source_url] = None
                for url in member.truth_sources:
                    merged[url] = None

            best.truth_sources = list(merged)
            deduplicated.append(best)

        return deduplicated

    def _are_similar(self, text1: str, text2: str) -> bool:
        """
        Check if two claim texts are similar (simple implementation).

        Texts are similar when, after normalization, one contains the other
        or their word-overlap ratio reaches the threshold.
        """
        norm1 = self._normalize_text(text1)
        norm2 = self._normalize_text(text2)

        # An empty string is a substring of everything; without this guard a
        # punctuation-only claim would swallow every other claim.
        if not norm1 or not norm2:
            return text1.strip() == text2.strip()

        if norm1 in norm2 or norm2 in norm1:
            return True

        # Word overlap, relative to the longer claim
        words1 = set(norm1.split())
        words2 = set(norm2.split())
        overlap = len(words1 & words2) / max(len(words1), len(words2))
        return overlap >= self.similarity_threshold

    def _normalize_text(self, text: str) -> str:
        """Normalize text for comparison: lowercase, drop punctuation,
        collapse whitespace"""
        text = re.sub(r'[^\w\s]', '', text.lower())
        text = re.sub(r'\s+', ' ', text).strip()
        return text

