# phredator/parser/fastqc_parser.py
import os
import zipfile
from typing import Dict, List
from dataclasses import dataclass, field

@dataclass
class FastQCReport:
    """Structured values extracted from one FastQC ``fastqc_data.txt`` report.

    Only ``sample_name`` is required; every other field starts out empty (or
    ``0.0``) and is filled in by ``FastQCParser`` while it reads the report.
    """

    # Sample identifier; FastQCParser derives it from the report path.
    sample_name: str
    # "Per base sequence quality" rows: first column of the row ->
    # {'mean': ..., 'median': ...}.
    per_base_quality: Dict[str, Dict[str, float]] = field(default_factory=dict)
    # GC figure taken from the "Per sequence GC content" module; stays 0.0
    # when that module was not parsed.
    gc_content: float = 0.0
    # "Sequence Duplication Levels" rows: first column (as a string) ->
    # second column as a float.
    duplication_levels: Dict[str, float] = field(default_factory=dict)
    # "Adapter Content" rows: first column -> second column as a float.
    adapter_content: Dict[str, float] = field(default_factory=dict)
    # First column of every "Overrepresented sequences" row, in file order.
    overrepresented_sequences: List[str] = field(default_factory=list)

class FastQCParser:
    """Parse the ``fastqc_data.txt`` report written by FastQC.

    The report may be supplied either as a FastQC ``.zip`` archive or as the
    extracted output folder.  Parsed values are accumulated on ``self.data``
    and returned by :meth:`parse`.
    """

    # Name of the report file inside a FastQC archive or output folder.
    _DATA_FILENAME = 'fastqc_data.txt'

    # (header prefix, section key) pairs for the modules this parser reads.
    _SECTION_HEADERS = (
        ('>>Per base sequence quality', 'per_base_quality'),
        ('>>Per sequence GC content', 'gc_content'),
        ('>>Sequence Duplication Levels', 'duplication_levels'),
        ('>>Adapter Content', 'adapter_content'),
        ('>>Overrepresented sequences', 'overrepresented_sequences'),
    )

    def __init__(self, filepath: str):
        """Remember *filepath* and prepare an empty report for its sample.

        Args:
            filepath: Path to a FastQC ``.zip`` archive or output folder.
        """
        self.filepath = filepath
        self.sample_name = self._derive_sample_name(filepath)
        self.data = FastQCReport(sample_name=self.sample_name)

    @staticmethod
    def _derive_sample_name(filepath: str) -> str:
        """Return the text before ``_fastqc`` in the last path component."""
        # Trailing separators would make basename() return '' and lose the
        # sample name for folder paths such as "run/sample_fastqc/".
        trimmed = filepath.rstrip(os.sep + (os.altsep or ''))
        return os.path.basename(trimmed).split('_fastqc')[0]

    @staticmethod
    def _locate_data_member(names: List[str]) -> str:
        """Pick the ``fastqc_data.txt`` entry out of a zip member listing.

        FastQC archives normally keep the report inside a
        ``<sample>_fastqc/`` folder, so the file is matched by basename.  The
        shallowest match wins, which keeps a root-level file preferred.

        Raises:
            ValueError: If no member is named ``fastqc_data.txt``.
        """
        matches = [
            name for name in names
            if name.rsplit('/', 1)[-1] == FastQCParser._DATA_FILENAME
        ]
        if not matches:
            raise ValueError("fastqc_data.txt missing in zip")
        return min(matches, key=lambda name: name.count('/'))

    @staticmethod
    def _duplication_key(label: str) -> str:
        """Normalise a duplication-level label into a dictionary key.

        Numeric labels keep their integer spelling (``"1.0"`` -> ``"1"``);
        binned labels such as ``">10"`` are kept verbatim instead of being
        dropped.
        """
        try:
            return str(int(float(label)))
        except (ValueError, OverflowError):
            return label

    def parse(self) -> "FastQCReport":
        """Parse the FastQC zip or folder and return the structured data.

        Returns:
            The report object stored on ``self.data``.

        Raises:
            FileNotFoundError: If ``self.filepath`` does not exist, or the
                folder does not contain ``fastqc_data.txt``.
            ValueError: If the zip archive has no ``fastqc_data.txt`` member.
        """
        if not os.path.exists(self.filepath):
            raise FileNotFoundError(f"File not found: {self.filepath}")

        # Only regular files are opened as archives; a directory whose name
        # happens to end in ".zip" is handled as an output folder.
        is_archive = (
            os.path.isfile(self.filepath)
            and self.filepath.lower().endswith('.zip')
        )
        if is_archive:
            with zipfile.ZipFile(self.filepath, 'r') as archive:
                member = self._locate_data_member(archive.namelist())
                with archive.open(member) as handle:
                    self._parse_fastqc_data(handle.read().decode('utf-8'))
        else:
            data_path = os.path.join(self.filepath, self._DATA_FILENAME)
            if not os.path.exists(data_path):
                raise FileNotFoundError(f"{data_path} missing")
            # Same encoding as the zip branch, independent of the locale.
            with open(data_path, 'r', encoding='utf-8') as handle:
                self._parse_fastqc_data(handle.read())

        return self.data

    def _parse_fastqc_data(self, content: str):
        """Fill ``self.data`` from the text of a ``fastqc_data.txt`` file.

        Rows that are too short or whose numeric columns do not parse are
        skipped rather than aborting the whole report.

        Args:
            content: Full text of the report.
        """
        section = None
        # Accumulators for the count-weighted mean of the GC distribution.
        gc_weighted_sum = 0.0
        gc_total_count = 0.0

        for raw_line in content.splitlines():
            line = raw_line.strip()

            if line.startswith('>>'):
                # Any module header (including >>END_MODULE and modules this
                # parser does not read) ends the current section.
                section = next(
                    (key for prefix, key in self._SECTION_HEADERS
                     if line.startswith(prefix)),
                    None,
                )
                continue

            if section is None or not line or line.startswith('#'):
                continue

            parts = line.split()

            if section == 'per_base_quality':
                if len(parts) >= 3:
                    try:
                        mean_quality = float(parts[1])
                        median_quality = float(parts[2])
                    except ValueError:
                        continue  # skip malformed lines
                    self.data.per_base_quality[parts[0]] = {
                        'mean': mean_quality,
                        'median': median_quality
                    }

            elif section == 'gc_content':
                # Rows are "<GC %> <count>": one histogram bin per line, so a
                # single row is not the sample's GC content.
                if len(parts) >= 2:
                    try:
                        gc_percent = float(parts[0])
                        count = float(parts[1])
                    except ValueError:
                        continue
                    gc_weighted_sum += gc_percent * count
                    gc_total_count += count

            elif section == 'duplication_levels':
                if len(parts) >= 2:
                    try:
                        percentage = float(parts[1])
                    except ValueError:
                        continue
                    key = self._duplication_key(parts[0])
                    self.data.duplication_levels[key] = percentage

            elif section == 'adapter_content':
                # NOTE(review): in FastQC output the first column of this
                # module appears to be the read position and the second the
                # first adapter's percentage, so keys are probably positions
                # rather than adapter names -- confirm before relying on it.
                if len(parts) >= 2:
                    try:
                        self.data.adapter_content[parts[0]] = float(parts[1])
                    except ValueError:
                        continue

            elif section == 'overrepresented_sequences':
                # The sequence is the first column.
                self.data.overrepresented_sequences.append(parts[0])

        # Leave gc_content untouched when no usable GC rows were seen.
        if gc_total_count > 0:
            self.data.gc_content = gc_weighted_sum / gc_total_count