"""
BM42 pre-scoring for improved candidate ranking.

Uses BM42 (fastembed) for semantic-aware scoring of candidates before reranking.
BM42 combines transformer attention weights with IDF for better semantic matching.
"""

from __future__ import annotations

from typing import List, Optional
from .base import CandidateMatch

# Try to import fastembed for BM42
try:
    from fastembed import SparseTextEmbedding
    HAS_BM42 = True
except ImportError:
    HAS_BM42 = False


class BM42Scorer:
    """
    BM42 scoring for candidate pre-ranking with semantic understanding.

    Uses transformer attention weights + IDF for better semantic matching
    than pure lexical BM25. The fastembed model is created lazily on the
    first call that needs it and is then cached on the instance.
    """

    # Sparse model loaded when the caller does not ask for another one.
    DEFAULT_MODEL_NAME = "Qdrant/bm42-all-minilm-l6-v2-attentions"

    def __init__(self, model_name: str = DEFAULT_MODEL_NAME):
        """
        Create a scorer without loading the model yet.

        Args:
            model_name: Name passed to ``SparseTextEmbedding`` when the model
                is first needed.
        """
        self._model_name = model_name
        self._model = None

    def _get_model(self):
        """
        Return the sparse embedding model, loading it on first use.

        Raises:
            ImportError: If no model is cached yet and fastembed could not
                be imported.
        """
        if self._model is None:
            # The dependency check lives here so that every path needing the
            # model fails with a clear ImportError instead of a NameError on
            # the missing SparseTextEmbedding symbol.
            if not HAS_BM42:
                raise ImportError("fastembed is required for BM42 scoring. Install with: pip install fastembed")
            self._model = SparseTextEmbedding(model_name=self._model_name)
        return self._model

    def _sparse_dot_product(self, emb1, emb2) -> float:
        """
        Compute the dot product of two sparse embeddings.

        Args:
            emb1: Object whose ``indices`` and ``values`` both provide
                ``tolist()``; position ``k`` of each describes one entry.
            emb2: Same layout as ``emb1``.

        Returns:
            Sum of ``value1 * value2`` over the indices present in both
            embeddings; ``0.0`` when they share no index.
        """
        # Index -> value maps; a repeated index keeps its last value.
        idx1_map = dict(zip(emb1.indices.tolist(), emb1.values.tolist()))
        idx2_map = dict(zip(emb2.indices.tolist(), emb2.values.tolist()))

        # Only indices present in both embeddings contribute.
        common = idx1_map.keys() & idx2_map.keys()
        # The 0.0 start keeps the result a float even when nothing overlaps.
        return sum((idx1_map[idx] * idx2_map[idx] for idx in common), 0.0)

    @staticmethod
    def _candidate_text(candidate) -> str:
        """Build the text embedded for one candidate: match, context, path words."""
        parts = [candidate.matched_text]
        if candidate.context_before:
            parts.append(candidate.context_before)
        if candidate.context_after:
            parts.append(candidate.context_after)
        # Path separators become spaces so each path component is its own token.
        parts.append(candidate.path.replace('\\', ' ').replace('/', ' '))
        return ' '.join(parts)

    def score_candidates(
        self,
        query: str,
        candidates: List["CandidateMatch"],
        keywords: Optional[dict] = None
    ) -> List["CandidateMatch"]:
        """
        Score and sort candidates by BM42 relevance.

        Each scored candidate gets its score stored in ``pre_filter_score``.
        With fewer than two candidates there is nothing to rank, so the input
        is returned as-is: the model is not loaded and ``pre_filter_score``
        is left untouched.

        Args:
            query: The search query
            candidates: List of candidates to score
            keywords: Optional extracted keywords; the entries under
                ``'primary_terms'`` are appended to the query text

        Returns:
            A new list of the candidates sorted by BM42 score (highest
            first); candidates with equal scores keep their input order.

        Raises:
            ImportError: If the model must be loaded and fastembed is not
                installed.
        """
        if not candidates or len(candidates) < 2:
            return candidates

        model = self._get_model()

        corpus_texts = [self._candidate_text(candidate) for candidate in candidates]

        # Build query text with primary_terms
        query_text = query
        if keywords and keywords.get('primary_terms'):
            query_text += ' ' + ' '.join(keywords['primary_terms'])

        # Get embeddings
        query_embedding = list(model.query_embed(query_text))[0]
        doc_embeddings = list(model.embed(corpus_texts))

        for position, candidate in enumerate(candidates):
            # Indexing (rather than zip) keeps a too-short embedding list a
            # loud IndexError instead of silently unscored candidates.
            candidate.pre_filter_score = self._sparse_dot_product(
                query_embedding, doc_embeddings[position]
            )

        # sorted() is stable, also with reverse=True, so ties keep input order.
        return sorted(candidates, key=lambda c: c.pre_filter_score, reverse=True)


# Module-wide scorer shared by all callers; stays None until first requested.
_scorer = None


def get_scorer() -> BM42Scorer:
    """Return the shared BM42Scorer, constructing it on the first call."""
    global _scorer
    if _scorer is not None:
        return _scorer
    _scorer = BM42Scorer()
    return _scorer


def score_candidates(
    query: str,
    candidates: List[CandidateMatch],
    keywords: Optional[dict] = None
) -> List[CandidateMatch]:
    """
    Rank candidates with the shared module-level scorer.

    Thin wrapper that forwards all arguments to the ``score_candidates``
    method of the instance returned by ``get_scorer()``.

    Args:
        query: The search query
        candidates: List of candidates to score
        keywords: Optional extracted keywords

    Returns:
        Whatever the shared scorer returns: the candidates ordered by
        BM42 score
    """
    shared_scorer = get_scorer()
    ranked = shared_scorer.score_candidates(query, candidates, keywords)
    return ranked
