import os
import zipfile
import shutil
import gzip
import logging
import pandas as pd
import argparse

# Configure the root logger at import time: INFO level and above, with each
# record formatted as "<timestamp> - <level name> - <message>".
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")

def extract_and_flatten(zip_path, target_dir):
    """Extract every file of a ZIP archive directly into ``target_dir``.

    The archive's internal folder structure is discarded: each file member is
    written to ``target_dir/<basename of the member>``. Members are streamed
    straight to their final location instead of being extracted into nested
    folders and moved afterwards, so a member whose basename equals the name
    of one of the archive's folders (e.g. ``a/a``) no longer fails or ends up
    nested inside a leftover folder.

    If several members share a basename, the one appearing last in the
    archive wins and a warning is logged. After extraction, every empty
    directory below ``target_dir`` is removed.

    Args:
        zip_path: Path to the ZIP archive to read.
        target_dir: Directory that receives the files; created if missing.

    Raises:
        zipfile.BadZipFile: If ``zip_path`` is not a valid ZIP archive.
        OSError: If the archive cannot be read or a file cannot be written.
    """
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(target_dir, exist_ok=True)

    seen_names = set()
    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        for member in zip_ref.infolist():
            # Directory entries carry no data; flattening ignores them.
            if member.is_dir():
                continue

            base_name = os.path.basename(member.filename)
            # Guard against names that do not denote a regular file name.
            if base_name in ('', os.curdir, os.pardir):
                continue

            if base_name in seen_names:
                logging.warning(
                    "Duplicate file name %r while flattening %s; overwriting "
                    "the earlier copy with %s",
                    base_name, zip_path, member.filename,
                )
            seen_names.add(base_name)

            flattened_path = os.path.join(target_dir, base_name)
            with zip_ref.open(member) as src, open(flattened_path, 'wb') as dst:
                shutil.copyfileobj(src, dst)

    # Remove all empty folders. Walking bottom-up lets a parent that only
    # contained empty folders be removed in the same pass.
    for root, dirs, _ in os.walk(target_dir, topdown=False):
        for dir_name in dirs:
            dir_path = os.path.join(root, dir_name)
            if not os.listdir(dir_path):
                os.rmdir(dir_path)

    print(f"Files extracted, flattened, and empty folders removed in: {target_dir}")

def extract_db2_gz(file_path):
    """Decompress a single ``.db2.gz`` file into a sibling ``extracted`` folder.

    The output is written to ``<dir of file_path>/extracted/<name>`` where
    ``<name>`` is the input file name with the trailing ``.gz`` removed. The
    ``extracted`` folder is created when it does not exist yet.

    Args:
        file_path: Path to the ``.db2.gz`` file. Paths with any other suffix
            are rejected with a printed message and nothing is written.

    Returns:
        None.
    """
    is_db2_archive = file_path.endswith('.db2.gz')
    if not is_db2_archive:
        print("The provided file is not a .db2.gz file.")
        return

    # Split once into the containing directory and the archive's file name.
    parent_dir, archive_name = os.path.split(file_path)

    extracted_folder = os.path.join(parent_dir, 'extracted')
    os.makedirs(extracted_folder, exist_ok=True)

    # Drop the ".gz" suffix to obtain the decompressed file name.
    decompressed_name = archive_name[:-len('.gz')]
    output_path = os.path.join(extracted_folder, decompressed_name)

    # Stream the decompressed bytes to disk without loading them all at once.
    with gzip.open(file_path, 'rb') as compressed, open(output_path, 'wb') as plain:
        shutil.copyfileobj(compressed, plain)

    print(f"Extracted: {file_path} -> {output_path}")

def extract_db2_gz_from_folder(input_folder):
    """Decompress every ``.db2.gz`` file found directly inside ``input_folder``.

    Only the folder's immediate entries are considered (no recursion). Each
    matching entry is handed to ``extract_db2_gz`` with its full path.

    Args:
        input_folder: Directory whose entries are scanned.
    """
    db2_archives = (
        entry for entry in os.listdir(input_folder) if entry.endswith('.db2.gz')
    )
    for entry in db2_archives:
        archive_path = os.path.join(input_folder, entry)
        extract_db2_gz(archive_path)


def tldr_batch_download(csv_file, api_manager, output_path=".", overwrite=False):
    """Download and unpack decoys for every job listed in a CSV file.

    The CSV is read with pandas and must provide a ``job_no`` and a
    ``receptor`` column. For each row the download goes to
    ``<output_path>/<receptor>/tldr_download`` and, when the archive
    ``decoys_<job_no>.zip`` is present there afterwards, its contents are
    flattened into ``<output_path>/<receptor>/decoys`` and any ``.db2.gz``
    files in that folder are decompressed.

    Args:
        csv_file: Path to the CSV file describing the jobs.
        api_manager: Object passed through unchanged to ``download_decoys``.
        output_path: Root directory under which per-receptor folders live.
        overwrite: When false, rows whose ``tldr_download`` folder already
            exists are skipped.
    """
    jobs = pd.read_csv(csv_file)

    for _, record in jobs.iterrows():
        job_number = str(record['job_no'])  # job numbers are handled as strings
        receptor_folder = record['receptor']  # presumably a folder name; verify against the CSV producer

        receptor_root = os.path.join(output_path, receptor_folder)
        rec_output_path = os.path.join(receptor_root, "tldr_download")
        extract_dir = os.path.join(receptor_root, "decoys")

        # An existing download folder counts as "already done" unless the
        # caller explicitly asked to overwrite.
        if not overwrite and os.path.exists(rec_output_path):
            print(f"Skipping job {job_number} in {receptor_folder} as output already exists")
            continue

        print(f"Downloading decoys for job {job_number} in {receptor_folder}")

        # NOTE(review): download_decoys is not defined or imported in the
        # visible part of this file - confirm where it is provided.
        download_decoys(api_manager, job_number, rec_output_path, retries=5)

        # TODO: Might need to not hard code this, since this might conflict with future modules
        decoys_zip = f"decoys_{job_number}.zip"
        decoys_zip_path = os.path.join(rec_output_path, decoys_zip)

        if not os.path.exists(decoys_zip_path):
            print(f"No {decoys_zip} found for job {job_number} in {rec_output_path}")
            continue

        extract_and_flatten(decoys_zip_path, extract_dir)
        extract_db2_gz_from_folder(extract_dir)

def main(argv=None):
    """CLI parser for handling different extraction and download tasks.

    Sub-commands:
        extract-zip: Extract a ZIP file and flatten its contents.
        extract-db2: Decompress a single ``.db2.gz`` file.
        extract-db2-folder: Decompress every ``.db2.gz`` file in a folder.
        tldr-batch: Download and process TLDR batch jobs listed in a CSV.

    When no sub-command is given, the help text is printed.

    Args:
        argv: Optional list of argument strings, without the program name.
            When ``None`` (the default), argparse reads ``sys.argv[1:]``.
    """
    parser = argparse.ArgumentParser(description="Utility for extracting and processing molecular data.")

    subparsers = parser.add_subparsers(dest="command")

    # Extract ZIP files
    zip_parser = subparsers.add_parser("extract-zip", help="Extract a ZIP file and flatten contents")
    zip_parser.add_argument("--zip_path", required=True, help="Path to the ZIP file")
    zip_parser.add_argument("--target_dir", required=True, help="Directory to extract contents into")

    # Extract a single .db2.gz file
    db2_parser = subparsers.add_parser("extract-db2", help="Extract a .db2.gz file")
    db2_parser.add_argument("--file_path", required=True, help="Path to the .db2.gz file")

    # Extract all .db2.gz files from a folder
    db2_folder_parser = subparsers.add_parser("extract-db2-folder", help="Extract all .db2.gz files from a folder")
    db2_folder_parser.add_argument("--input_folder", required=True, help="Folder containing .db2.gz files")

    # TLDR Batch Download
    tldr_parser = subparsers.add_parser("tldr-batch", help="Download and process TLDR batch jobs from a CSV")
    tldr_parser.add_argument("--csv_file", required=True, help="Path to the CSV file")
    tldr_parser.add_argument("--output_path", default=".", help="Output directory (default: current directory)")
    tldr_parser.add_argument("--overwrite", action="store_true", help="Overwrite existing files")

    args = parser.parse_args(argv)

    if args.command == "extract-zip":
        extract_and_flatten(args.zip_path, args.target_dir)
    elif args.command == "extract-db2":
        extract_db2_gz(args.file_path)
    elif args.command == "extract-db2-folder":
        extract_db2_gz_from_folder(args.input_folder)
    elif args.command == "tldr-batch":
        api_manager = None  # Replace with actual API manager instance
        tldr_batch_download(args.csv_file, api_manager, args.output_path, args.overwrite)
    else:
        # No sub-command supplied: show usage instead of failing silently.
        parser.print_help()

# Run the command-line interface only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
