from pathlib import Path
from typing import Optional
from rdkit import Chem

def split_sdf_to_individuals(
    input_sdf: Path,
    output_dir: Path,
    prefix: str = "ligand_",
    start_index: int = 1,
) -> None:
    """Split an SDF file into individual SDF files, one molecule per file.

    Every record that RDKit can parse and convert to SMILES is written to
    ``output_dir / f"{prefix}{idx}.sdf"``, where ``idx`` is ``start_index``
    plus the record's zero-based position in the input. Skipped records
    still consume their index, so the output numbering may contain gaps.

    A one-line status message is printed for every record (created or
    skipped).

    Args:
        input_sdf: Path of the multi-record SDF file to read.
        output_dir: Directory that receives the per-molecule files; it is
            created (including parents) if it does not exist.
        prefix: Filename prefix placed before the index.
        start_index: Index assigned to the first record of the input.

    Errors raised while creating ``output_dir`` or while opening
    ``input_sdf`` are not caught here and propagate to the caller.
    Per-molecule failures are reported and skipped.
    """
    output_dir.mkdir(parents=True, exist_ok=True)

    supplier = Chem.SDMolSupplier(str(input_sdf), removeHs=False, sanitize=False)

    for idx, mol in enumerate(supplier, start=start_index):
        if mol is None:
            print(f"Skipped molecule {idx}: RDKit failed to parse.")
            continue

        # Use SMILES generation as a cheap validity probe; the string itself
        # is discarded.
        # NOTE(review): molecules are loaded with sanitize=False, so how many
        # structural problems (e.g. valence) this actually catches should be
        # confirmed against the RDKit version in use.
        try:
            Chem.MolToSmiles(mol, allHsExplicit=True)
        except Exception as e:
            print(f"Skipped molecule {idx}: SMILES conversion failed ({e})")
            continue

        output_file = output_dir / f"{prefix}{idx}.sdf"
        try:
            writer = Chem.SDWriter(str(output_file))
            try:
                writer.write(mol)
            finally:
                # Always release the file handle, even when write() fails,
                # so a long run with many bad records does not leak handles.
                writer.close()
        except Exception as e:
            print(f"Skipped molecule {idx}: Failed to write: {e}")
            continue

        print(f"Created {output_file}")

def main():
    """Command-line entry point: parse arguments and run the SDF splitter.

    Positional arguments are the input SDF path and the output directory;
    ``--prefix`` and ``--start-index`` control the output filenames.
    """
    import argparse

    cli = argparse.ArgumentParser(
        description="Split a large SDF file into individual files"
    )

    # Let argparse build the Path objects directly from the raw strings.
    cli.add_argument("input_sdf", type=Path, help="Path to input SDF file")
    cli.add_argument("output_dir", type=Path, help="Path to output directory")
    cli.add_argument(
        "--prefix",
        type=str,
        default="ligand_",
        help="Prefix for output filenames",
    )
    cli.add_argument(
        "--start-index",
        type=int,
        default=1,
        help="Starting index for output filenames",
    )

    opts = cli.parse_args()

    split_sdf_to_individuals(
        input_sdf=opts.input_sdf,
        output_dir=opts.output_dir,
        prefix=opts.prefix,
        start_index=opts.start_index,
    )

# Run the command-line interface only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()