"""
Workspace Content Service - Structured Content Caching.

This service manages structured caching of publications, datasets, and metadata
in the DataManagerV2 workspace. It provides schema validation, professional naming,
and flexible content retrieval with level-based filtering.

Phase 4a: Workspace Tools Implementation
- Structured caching for publications, datasets, metadata
- Pydantic schema validation for content types
- Professional naming convention enforcement
- Level-based content retrieval (summary/methods/samples/platform)
- Integration with DataManagerV2 workspace
"""

import json
from enum import Enum
from pathlib import Path
from typing import Any, Dict, List, Optional, Union

from pydantic import BaseModel, Field, field_validator

from lobster.core.data_manager_v2 import DataManagerV2
from lobster.utils.logger import get_logger

logger = get_logger(__name__)


# ===============================================================================
# Content Type Enums
# ===============================================================================


class ContentType(str, Enum):
    """Content types for workspace caching.

    The ``str`` mixin makes every member a string equal to its value. Each
    member is mapped to one workspace subdirectory by
    ``WorkspaceContentService._get_content_dir``.
    """

    PUBLICATION = "publication"  # stored under <workspace>/literature
    DATASET = "dataset"  # stored under <workspace>/data
    METADATA = "metadata"  # stored under <workspace>/metadata
    DOWNLOAD_QUEUE = "download_queue"  # stored under <workspace>/download_queue


class RetrievalLevel(str, Enum):
    """Content retrieval detail levels.

    A level selects which fields of a cached record are returned by
    ``WorkspaceContentService.read_content``. Not every level applies to every
    content type: the field tables in ``_filter_by_level`` define SUMMARY and
    METHODS for publications, SUMMARY, SAMPLES and PLATFORM for datasets, and
    only SUMMARY for metadata. FULL returns the record unfiltered.
    """

    SUMMARY = "summary"  # Basic overview (title, authors, sample count, etc.)
    METHODS = "methods"  # Methods section (for publications)
    SAMPLES = "samples"  # Sample metadata (for datasets)
    PLATFORM = "platform"  # Platform/technology info
    FULL = "full"  # All available content


# ===============================================================================
# Pydantic Content Schemas
# ===============================================================================


class PublicationContent(BaseModel):
    """
    Schema for cached publication content.

    Stores publication metadata, abstract, methods, and full-text
    retrieved from PubMed, PMC, bioRxiv, or other sources.

    Only ``identifier``, ``source`` and ``cached_at`` are required; all other
    fields default to ``None`` or an empty list. ``identifier`` and ``source``
    must not be blank and are stored with surrounding whitespace removed.
    """

    # Pydantic v2 configuration. The nested ``class Config`` form is deprecated
    # in v2, which this model already requires (it uses ``field_validator``).
    model_config = {
        "json_schema_extra": {
            "example": {
                "identifier": "PMID:35042229",
                "title": "Single-cell RNA-seq reveals...",
                "authors": ["Smith J", "Jones A"],
                "journal": "Nature",
                "year": 2022,
                "abstract": "We performed single-cell RNA-seq...",
                "methods": "Cells were processed using 10X Chromium...",
                "source": "PMC",
                "cached_at": "2025-01-12T10:30:00",
            }
        }
    }

    identifier: str = Field(
        ..., description="Publication identifier (PMID, DOI, or bioRxiv ID)"
    )
    title: Optional[str] = Field(None, description="Publication title")
    authors: List[str] = Field(default_factory=list, description="List of author names")
    journal: Optional[str] = Field(None, description="Journal name")
    year: Optional[int] = Field(None, description="Publication year")
    abstract: Optional[str] = Field(None, description="Abstract text")
    methods: Optional[str] = Field(None, description="Methods section text")
    full_text: Optional[str] = Field(None, description="Full publication text")
    keywords: List[str] = Field(
        default_factory=list, description="Publication keywords"
    )
    source: str = Field(..., description="Source provider (PMC, PubMed, bioRxiv, etc.)")
    cached_at: str = Field(
        ..., description="ISO 8601 timestamp when content was cached"
    )
    url: Optional[str] = Field(None, description="Publication URL")

    @field_validator("identifier")
    @classmethod
    def validate_identifier(cls, v: str) -> str:
        """Reject blank identifiers and return the value without surrounding whitespace."""
        if not v or not v.strip():
            raise ValueError("Publication identifier cannot be empty")
        return v.strip()

    @field_validator("source")
    @classmethod
    def validate_source(cls, v: str) -> str:
        """Reject blank sources and return the value without surrounding whitespace."""
        if not v or not v.strip():
            raise ValueError("Source cannot be empty")
        # Normalise the same way as ``identifier`` so " PMC " and "PMC" are
        # stored identically.
        return v.strip()


class DatasetContent(BaseModel):
    """
    Schema for cached dataset content.

    Stores dataset metadata from GEO, SRA, or other repositories,
    including platform info, sample metadata, and experimental design.

    ``identifier``, ``sample_count``, ``source`` and ``cached_at`` are required;
    all other fields default to ``None`` or an empty list.
    """

    # Pydantic v2 configuration. The nested ``class Config`` form is deprecated
    # in v2, which this model already requires (it uses ``field_validator``).
    model_config = {
        "json_schema_extra": {
            "example": {
                "identifier": "GSE123456",
                "title": "Single-cell RNA-seq of aging brain",
                "platform": "Illumina NovaSeq 6000",
                "platform_id": "GPL24676",
                "organism": "Homo sapiens",
                "sample_count": 12,
                "samples": {"GSM1": {"age": 25, "tissue": "brain"}},
                "source": "GEO",
                "cached_at": "2025-01-12T10:30:00",
            }
        }
    }

    identifier: str = Field(..., description="Dataset identifier (GSE, SRA, etc.)")
    title: Optional[str] = Field(None, description="Dataset title")
    platform: Optional[str] = Field(
        None, description="Platform/technology (e.g., Illumina NovaSeq)"
    )
    platform_id: Optional[str] = Field(None, description="Platform ID (e.g., GPL570)")
    organism: Optional[str] = Field(None, description="Organism (e.g., Homo sapiens)")
    sample_count: int = Field(..., description="Number of samples in dataset")
    samples: Optional[Dict[str, Any]] = Field(
        None, description="Sample metadata (GSM IDs → metadata)"
    )
    experimental_design: Optional[str] = Field(
        None, description="Experimental design description"
    )
    summary: Optional[str] = Field(None, description="Dataset summary/abstract")
    pubmed_ids: List[str] = Field(
        default_factory=list, description="Associated PubMed IDs"
    )
    source: str = Field(..., description="Source repository (GEO, SRA, PRIDE, etc.)")
    cached_at: str = Field(
        ..., description="ISO 8601 timestamp when content was cached"
    )
    url: Optional[str] = Field(None, description="Dataset URL")

    @field_validator("identifier")
    @classmethod
    def validate_identifier(cls, v: str) -> str:
        """Reject blank identifiers and return the value without surrounding whitespace."""
        if not v or not v.strip():
            raise ValueError("Dataset identifier cannot be empty")
        return v.strip()

    @field_validator("sample_count")
    @classmethod
    def validate_sample_count(cls, v: int) -> int:
        """Validate sample count is non-negative (zero is allowed)."""
        if v < 0:
            raise ValueError("Sample count must be non-negative")
        return v


class MetadataContent(BaseModel):
    """
    Schema for cached metadata content.

    Stores arbitrary metadata like sample mappings, validation results,
    quality control reports, or other structured analysis outputs.

    ``identifier`` and ``content_type`` must not be blank and are stored with
    surrounding whitespace removed. ``data`` carries the payload itself.
    """

    # Pydantic v2 configuration. The nested ``class Config`` form is deprecated
    # in v2, which this model already requires (it uses ``field_validator``).
    model_config = {
        "json_schema_extra": {
            "example": {
                "identifier": "gse12345_to_gse67890_mapping",
                "content_type": "sample_mapping",
                "description": "Sample ID mapping between two datasets",
                "data": {"exact_matches": 10, "fuzzy_matches": 5},
                "related_datasets": ["GSE12345", "GSE67890"],
                "source": "SampleMappingService",
                "cached_at": "2025-01-12T10:30:00",
            }
        }
    }

    identifier: str = Field(..., description="Unique metadata identifier")
    content_type: str = Field(
        ..., description="Metadata type (sample_mapping, validation, qc_report, etc.)"
    )
    description: Optional[str] = Field(None, description="Human-readable description")
    data: Dict[str, Any] = Field(..., description="Metadata content (arbitrary JSON)")
    related_datasets: List[str] = Field(
        default_factory=list, description="Related dataset identifiers"
    )
    source: str = Field(..., description="Source of metadata (tool or service name)")
    cached_at: str = Field(
        ..., description="ISO 8601 timestamp when content was cached"
    )

    @field_validator("identifier")
    @classmethod
    def validate_identifier(cls, v: str) -> str:
        """Reject blank identifiers and return the value without surrounding whitespace."""
        if not v or not v.strip():
            raise ValueError("Metadata identifier cannot be empty")
        return v.strip()

    @field_validator("content_type")
    @classmethod
    def validate_content_type(cls, v: str) -> str:
        """Reject blank content types and return the value without surrounding whitespace."""
        if not v or not v.strip():
            raise ValueError("Content type cannot be empty")
        # Normalise the same way as ``identifier`` so padded and unpadded
        # spellings of a type are stored identically.
        return v.strip()


# ===============================================================================
# Workspace Content Service
# ===============================================================================


class WorkspaceContentService:
    """
    Workspace content management service.

    Manages structured caching of publications, datasets, and metadata in the
    DataManagerV2 workspace with schema validation, professional naming, and
    flexible content retrieval.

    **Features:**
    - Schema validation via Pydantic models
    - Professional naming conventions (lowercase, underscores, timestamps)
    - Level-based content retrieval (summary/methods/samples/platform/full)
    - Content listing and discovery
    - Integration with DataManagerV2 workspace

    **Content Types:**
    - Publications: Papers from PubMed, PMC, bioRxiv
    - Datasets: GEO, SRA, PRIDE datasets with metadata
    - Metadata: Sample mappings, validation results, QC reports

    **Storage Structure:**
    - workspace_path/literature/*.json (publications)
    - workspace_path/data/*.json (datasets)
    - workspace_path/metadata/*.json (metadata)
    - workspace_path/download_queue/ (directory reserved for ContentType.DOWNLOAD_QUEUE)

    Examples:
        >>> service = WorkspaceContentService(data_manager)
        >>>
        >>> # Cache publication
        >>> pub_content = PublicationContent(
        ...     identifier="PMID:35042229",
        ...     title="Single-cell analysis...",
        ...     authors=["Smith J"],
        ...     source="PMC",
        ...     cached_at=datetime.now().isoformat()
        ... )
        >>> path = service.write_content(pub_content, ContentType.PUBLICATION)
        >>>
        >>> # Retrieve summary level
        >>> summary = service.read_content("PMID:35042229",
        ...     ContentType.PUBLICATION, level=RetrievalLevel.SUMMARY)
        >>>
        >>> # List all cached publications
        >>> publications = service.list_content(ContentType.PUBLICATION)
    """

    def __init__(self, data_manager: DataManagerV2):
        """
        Initialize WorkspaceContentService.

        Args:
            data_manager: DataManagerV2 instance for workspace access
        """
        self.data_manager = data_manager
        # Use workspace_path directly for backward compatibility with existing tools
        self.workspace_base = Path(data_manager.workspace_path)

        # Create content subdirectories (aligned with existing research_agent tools)
        self.publications_dir = self.workspace_base / "literature"
        self.datasets_dir = self.workspace_base / "data"
        self.metadata_dir = self.workspace_base / "metadata"
        self.download_queue_dir = self.workspace_base / "download_queue"

        # Create directories if they don't exist
        self.publications_dir.mkdir(parents=True, exist_ok=True)
        self.datasets_dir.mkdir(parents=True, exist_ok=True)
        self.metadata_dir.mkdir(parents=True, exist_ok=True)
        self.download_queue_dir.mkdir(parents=True, exist_ok=True)

        logger.info(
            f"WorkspaceContentService initialized with workspace at {self.workspace_base}"
        )

    def _get_content_dir(self, content_type: ContentType) -> Path:
        """
        Get directory for content type.

        Args:
            content_type: Content type enum

        Returns:
            Path: Directory path for content type
        """
        if content_type == ContentType.PUBLICATION:
            return self.publications_dir
        elif content_type == ContentType.DATASET:
            return self.datasets_dir
        elif content_type == ContentType.METADATA:
            return self.metadata_dir
        elif content_type == ContentType.DOWNLOAD_QUEUE:
            return self.download_queue_dir
        else:
            raise ValueError(f"Unknown content type: {content_type}")

    def _sanitize_filename(self, identifier: str) -> str:
        """
        Sanitize identifier for use as filename.

        Converts identifiers to lowercase, replaces special characters with
        underscores, and enforces professional naming conventions.

        Args:
            identifier: Content identifier (PMID, GSE, etc.)

        Returns:
            str: Sanitized filename (without .json extension)

        Examples:
            >>> service._sanitize_filename("PMID:35042229")
            "pmid_35042229"
            >>> service._sanitize_filename("GSE123456")
            "gse123456"
            >>> service._sanitize_filename("gse12345_to_gse67890_mapping")
            "gse12345_to_gse67890_mapping"
        """
        # Convert to lowercase
        sanitized = identifier.lower()

        # Replace special characters with underscores
        sanitized = sanitized.replace(":", "_").replace("/", "_").replace("\\", "_")
        sanitized = sanitized.replace(" ", "_").replace("-", "_")

        # Remove consecutive underscores
        while "__" in sanitized:
            sanitized = sanitized.replace("__", "_")

        # Strip leading/trailing underscores
        sanitized = sanitized.strip("_")

        return sanitized

    def write_content(
        self,
        content: Union[PublicationContent, DatasetContent, MetadataContent],
        content_type: ContentType,
    ) -> str:
        """
        Write content to workspace with schema validation.

        Args:
            content: Content model (PublicationContent, DatasetContent, or MetadataContent)
            content_type: Content type enum

        Returns:
            str: Path to cached content file

        Raises:
            ValueError: If content type doesn't match content model
            ValidationError: If content fails schema validation

        Examples:
            >>> pub = PublicationContent(
            ...     identifier="PMID:35042229",
            ...     title="Paper title",
            ...     source="PMC",
            ...     cached_at=datetime.now().isoformat()
            ... )
            >>> path = service.write_content(pub, ContentType.PUBLICATION)
            >>> print(path)
            "/workspace/cache/content/publications/pmid_35042229.json"
        """
        # Validate content type matches model
        if content_type == ContentType.PUBLICATION and not isinstance(
            content, PublicationContent
        ):
            raise ValueError(
                "Content type PUBLICATION requires PublicationContent model"
            )
        elif content_type == ContentType.DATASET and not isinstance(
            content, DatasetContent
        ):
            raise ValueError("Content type DATASET requires DatasetContent model")
        elif content_type == ContentType.METADATA and not isinstance(
            content, MetadataContent
        ):
            raise ValueError("Content type METADATA requires MetadataContent model")

        # Get content directory
        content_dir = self._get_content_dir(content_type)

        # Sanitize filename
        filename = self._sanitize_filename(content.identifier)
        file_path = content_dir / f"{filename}.json"

        # Convert content to dict and write to file
        content_dict = content.model_dump()

        with open(file_path, "w") as f:
            json.dump(content_dict, f, indent=2, default=str)

        logger.info(
            f"Cached {content_type.value} '{content.identifier}' to {file_path}"
        )

        return str(file_path)

    def read_content(
        self,
        identifier: str,
        content_type: ContentType,
        level: Optional[RetrievalLevel] = None,
    ) -> Dict[str, Any]:
        """
        Read content from workspace with level-based filtering.

        Args:
            identifier: Content identifier (PMID, GSE, etc.)
            content_type: Content type enum
            level: Retrieval level (summary/methods/samples/platform/full)

        Returns:
            Dict[str, Any]: Content dictionary (filtered by level if specified)

        Raises:
            FileNotFoundError: If content not found in workspace
            ValueError: If level is invalid for content type

        Examples:
            >>> # Get publication summary
            >>> summary = service.read_content(
            ...     "PMID:35042229",
            ...     ContentType.PUBLICATION,
            ...     level=RetrievalLevel.SUMMARY
            ... )
            >>> print(summary.keys())
            dict_keys(['identifier', 'title', 'authors', 'journal', 'year'])
            >>>
            >>> # Get dataset samples
            >>> samples = service.read_content(
            ...     "GSE123456",
            ...     ContentType.DATASET,
            ...     level=RetrievalLevel.SAMPLES
            ... )
            >>> print(samples.keys())
            dict_keys(['identifier', 'sample_count', 'samples'])
        """
        # Get content directory
        content_dir = self._get_content_dir(content_type)

        # Sanitize filename
        filename = self._sanitize_filename(identifier)
        file_path = content_dir / f"{filename}.json"

        # Check if file exists
        if not file_path.exists():
            raise FileNotFoundError(
                f"{content_type.value.capitalize()} '{identifier}' not found in workspace"
            )

        # Read content from file
        with open(file_path, "r") as f:
            content_dict = json.load(f)

        # Apply level-based filtering if specified
        if level is None or level == RetrievalLevel.FULL:
            return content_dict

        # Filter by level
        filtered_dict = self._filter_by_level(content_dict, content_type, level)

        return filtered_dict

    def _filter_by_level(
        self, content: Dict[str, Any], content_type: ContentType, level: RetrievalLevel
    ) -> Dict[str, Any]:
        """
        Filter content dictionary by retrieval level.

        Args:
            content: Full content dictionary
            content_type: Content type
            level: Retrieval level

        Returns:
            Dict[str, Any]: Filtered content dictionary

        Raises:
            ValueError: If level is invalid for content type
        """
        if level == RetrievalLevel.FULL:
            return content

        # Define fields for each level and content type
        level_fields = {
            ContentType.PUBLICATION: {
                RetrievalLevel.SUMMARY: [
                    "identifier",
                    "title",
                    "authors",
                    "journal",
                    "year",
                    "keywords",
                    "source",
                    "cached_at",
                    "url",
                ],
                RetrievalLevel.METHODS: [
                    "identifier",
                    "title",
                    "methods",
                    "source",
                    "cached_at",
                ],
            },
            ContentType.DATASET: {
                RetrievalLevel.SUMMARY: [
                    "identifier",
                    "title",
                    "sample_count",
                    "organism",
                    "source",
                    "cached_at",
                    "url",
                ],
                RetrievalLevel.SAMPLES: [
                    "identifier",
                    "sample_count",
                    "samples",
                    "experimental_design",
                    "source",
                    "cached_at",
                ],
                RetrievalLevel.PLATFORM: [
                    "identifier",
                    "platform",
                    "platform_id",
                    "organism",
                    "source",
                    "cached_at",
                ],
            },
            ContentType.METADATA: {
                RetrievalLevel.SUMMARY: [
                    "identifier",
                    "content_type",
                    "description",
                    "related_datasets",
                    "source",
                    "cached_at",
                ],
            },
        }

        # Check if level is valid for content type
        if content_type not in level_fields:
            raise ValueError(f"Level filtering not supported for {content_type.value}")

        if level not in level_fields[content_type]:
            raise ValueError(
                f"Level '{level.value}' not valid for {content_type.value}. "
                f"Valid levels: {', '.join(k.value for k in level_fields[content_type].keys())}"
            )

        # Filter content to include only specified fields
        fields = level_fields[content_type][level]
        filtered = {k: v for k, v in content.items() if k in fields}

        return filtered

    def list_content(
        self, content_type: Optional[ContentType] = None
    ) -> List[Dict[str, Any]]:
        """
        List all cached content, optionally filtered by type.

        Args:
            content_type: Content type to filter by (None = all types)

        Returns:
            List[Dict[str, Any]]: List of content summaries

        Examples:
            >>> # List all cached content
            >>> all_content = service.list_content()
            >>>
            >>> # List only publications
            >>> publications = service.list_content(ContentType.PUBLICATION)
            >>>
            >>> for pub in publications:
            ...     print(f"{pub['identifier']}: {pub['title']}")
        """
        content_list = []

        # Determine which content types to list
        types_to_list = [content_type] if content_type else list(ContentType)

        for ctype in types_to_list:
            content_dir = self._get_content_dir(ctype)

            for json_file in content_dir.glob("*.json"):
                try:
                    with open(json_file, "r") as f:
                        content_dict = json.load(f)

                    # Add content type and file path
                    content_dict["_content_type"] = ctype.value
                    content_dict["_file_path"] = str(json_file)

                    content_list.append(content_dict)
                except Exception as e:
                    logger.warning(f"Could not read {json_file}: {e}")

        # Sort by cached_at (most recent first)
        content_list.sort(key=lambda x: x.get("cached_at", ""), reverse=True)

        logger.info(
            f"Listed {len(content_list)} cached items"
            + (f" of type {content_type.value}" if content_type else "")
        )

        return content_list

    def delete_content(self, identifier: str, content_type: ContentType) -> bool:
        """
        Delete content from workspace.

        Args:
            identifier: Content identifier
            content_type: Content type

        Returns:
            bool: True if content was deleted, False if not found

        Examples:
            >>> service.delete_content("PMID:35042229", ContentType.PUBLICATION)
            True
        """
        content_dir = self._get_content_dir(content_type)
        filename = self._sanitize_filename(identifier)
        file_path = content_dir / f"{filename}.json"

        if file_path.exists():
            file_path.unlink()
            logger.info(f"Deleted {content_type.value} '{identifier}' from workspace")
            return True
        else:
            logger.warning(
                f"{content_type.value.capitalize()} '{identifier}' not found in workspace"
            )
            return False

    def read_download_queue_entry(self, entry_id: str) -> Dict[str, Any]:
        """
        Read a specific download queue entry.

        Args:
            entry_id: Queue entry identifier

        Returns:
            Dict[str, Any]: Entry details

        Raises:
            FileNotFoundError: If entry not found in queue
            AttributeError: If DataManager download_queue not available

        Examples:
            >>> entry = service.read_download_queue_entry("queue_entry_123")
            >>> print(entry['dataset_id'])
            GSE180759
        """
        if not self.data_manager or not hasattr(self.data_manager, "download_queue"):
            raise AttributeError("DataManager download_queue not available")

        if self.data_manager.download_queue is None:
            raise AttributeError("DataManager download_queue not available")

        try:
            entry = self.data_manager.download_queue.get_entry(entry_id)
            return entry.model_dump(mode="json")  # Pydantic v2 method
        except Exception as e:
            raise FileNotFoundError(
                f"Download queue entry '{entry_id}' not found"
            ) from e

    def list_download_queue_entries(
        self, status_filter: Optional[str] = None
    ) -> List[Dict[str, Any]]:
        """
        List all download queue entries with optional status filtering.

        Args:
            status_filter: Optional status to filter by (PENDING, IN_PROGRESS, COMPLETED, FAILED)

        Returns:
            List[Dict[str, Any]]: List of entry dictionaries

        Examples:
            >>> # List all entries
            >>> entries = service.list_download_queue_entries()
            >>> print(len(entries))
            5
            >>>
            >>> # Filter by status
            >>> pending = service.list_download_queue_entries(status_filter="PENDING")
            >>> print(len(pending))
            2
        """
        if not self.data_manager or not hasattr(self.data_manager, "download_queue"):
            return []

        if self.data_manager.download_queue is None:
            return []

        from lobster.core.schemas.download_queue import DownloadStatus

        # Convert string to enum if provided
        status_enum = None
        if status_filter:
            try:
                # DownloadStatus enum values are lowercase ("pending", "completed")
                status_enum = DownloadStatus(status_filter.lower())
            except ValueError:
                # Invalid status, return empty list
                logger.warning(
                    f"Invalid status filter '{status_filter}', returning empty list"
                )
                return []

        entries = self.data_manager.download_queue.list_entries(status=status_enum)
        return [entry.model_dump(mode="json") for entry in entries]

    def get_workspace_stats(self) -> Dict[str, Any]:
        """
        Get statistics about cached workspace content.

        Returns:
            Dict[str, Any]: Statistics including counts by type and total size

        Examples:
            >>> stats = service.get_workspace_stats()
            >>> print(stats)
            {
                'total_items': 42,
                'publications': 15,
                'datasets': 20,
                'metadata': 7,
                'total_size_mb': 12.5,
                'cache_dir': '/workspace/cache/content'
            }
        """
        stats = {
            "total_items": 0,
            "publications": 0,
            "datasets": 0,
            "metadata": 0,
            "total_size_mb": 0.0,
            "cache_dir": str(self.cache_dir),
        }

        # Count items by type and calculate total size
        for content_type in ContentType:
            content_dir = self._get_content_dir(content_type)
            json_files = list(content_dir.glob("*.json"))

            count = len(json_files)
            stats[content_type.value + "s"] = count  # pluralize
            stats["total_items"] += count

            # Calculate size
            for json_file in json_files:
                stats["total_size_mb"] += json_file.stat().st_size / (1024 * 1024)

        # Round size to 2 decimals
        stats["total_size_mb"] = round(stats["total_size_mb"], 2)

        return stats
