#!/usr/bin/env python

"""
Copyright (c) 2015 Kwangbom Choi, The Jackson Laboratory
This software was developed by Kwangbom "KB" Choi in Gary Churchill's Lab.
This is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This software is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this software. If not, see <http://www.gnu.org/licenses/>.
"""

import os
import sys
import getopt
import string
import subprocess
from collections import defaultdict
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio.Alphabet import generic_dna
from itertools import dropwhile


# Usage text for the command line tool. main() raises Usage(help_message)
# when -h/--help is given, and the Usage handler prints it to stderr.
help_message = '''
Usage:
    create-hybrid -F <fasta_files> [ -s <hap_list> -o <out_file> --create-bowtie-index ]

Input:
    -F <fasta_files> : List of fasta files (comma delimited)
    -s <hap_list>    : Names of haplotypes to be used instead (comma delimited, in the order of genomes)
    -o <out_file>    : Output file name (default: './emase.pooled.targets.fa')

Parameters:
    --help, -h            : shows this help message
    --create-bowtie-index : builds bowtie1 index
'''


class Usage(Exception):
    """Signal that the command line was invalid or that help was requested.

    Attributes:
        msg: The text to show the user. It may be a plain string or the
            exception object raised by getopt; callers convert it with str().
    """

    def __init__(self, msg):
        # Forward the message to Exception so that str(exc), exc.args and
        # tracebacks carry it instead of being empty.
        super(Usage, self).__init__(msg)
        self.msg = msg


def _append_haplotype(fasta, hapname, seqout, lenout):
    """Append one FASTA file to the pooled outputs, tagging IDs with hapname.

    The first line of the file is treated as the first header. Every header
    line is cut at its first whitespace and given the suffix '_<hapname>'
    before being written to seqout; sequence lines are copied through. For
    each record, a tab-separated '<id> <length>' line is written to lenout,
    where <id> is the suffixed header without its leading character and
    <length> is the total length of the record's sequence lines with
    trailing whitespace stripped.

    Args:
        fasta: Path of the FASTA file to read.
        hapname: Haplotype name appended to every sequence ID.
        seqout: Writable file object receiving the pooled sequences.
        lenout: Writable file object receiving the ID/length table.

    Returns:
        The number of records copied (0 if the file has no lines).
    """
    suffix = '_' + hapname
    num_records = 0
    with open(fasta) as fh:
        first = next(fh, None)
        if first is None:
            # Empty input: nothing to copy, let the caller decide what to say.
            return num_records
        header = first.rstrip().split()[0] + suffix
        seqout.write('%s\n' % header)
        lenout.write('%s\t' % header[1:])
        num_records = 1
        seqlen = 0
        for line in fh:
            if line[0] == '>':
                header = line.rstrip().split()[0] + suffix
                # Close the previous record's length and open the next one.
                lenout.write('%d\n%s\t' % (seqlen, header[1:]))
                seqlen = 0
                num_records += 1
                seqout.write(header + '\n')
            else:
                seqlen += len(line.rstrip())
                # A final line without a newline must not be glued to the
                # header of the next haplotype's first record.
                seqout.write(line if line.endswith('\n') else line + '\n')
        lenout.write('%d\n' % seqlen)
    return num_records


def main(argv=None):
    """Pool several haplotype FASTA files into one hybrid target file.

    Parses the command line (-F, -s, -o, --create-bowtie-index, -h/--help),
    writes the pooled FASTA to the output file and a tab-separated
    ID/length table to '<output base>.info', and optionally runs
    bowtie-build on the pooled FASTA.

    Args:
        argv: Full argument vector including the program name; defaults to
            sys.argv.

    Returns:
        None on success; 2 for usage or input errors; 1 if bowtie-build
        could not be run or exited with a non-zero status.
    """
    if argv is None:
        argv = sys.argv
    try:
        try:
            opts, _ = getopt.getopt(argv[1:], "hF:s:o:", ["help", "create-bowtie-index"])
        except getopt.error as msg:
            raise Usage(msg)

        # Default values of vars
        fastalist = list()
        haplist = None
        num_haps = 0
        outfile = './emase.pooled.targets.fa'
        build_bowtie_index = False

        # option processing (change this later with optparse)
        for option, value in opts:
            if option in ("-h", "--help"):
                raise Usage(help_message)
            if option == "-F":
                fastalist = value.split(',')
                num_haps = len(fastalist)
            if option == "-s":
                haplist = value.split(',')
            if option == "--create-bowtie-index":
                build_bowtie_index = True
            if option == "-o":
                outfile = value

        # Check if the required options are given
        if num_haps < 2:
            sys.stderr.write("At least two fasta files should be given.\n")
            return 2
        if haplist is None:
            # ascii_uppercase is locale-independent and exists on Python 2 and 3.
            if num_haps > len(string.ascii_uppercase):
                sys.stderr.write("Too many fasta files for default haplotype names; "
                                 "please specify names with -s.\n")
                return 2
            haplist = list(string.ascii_uppercase[:num_haps])
            sys.stderr.write("Default haplotype names will be used: %s\n" % ', '.join(haplist))
        if len(haplist) != num_haps:
            sys.stderr.write("The number of specified haplotypes is not matching to the number of fasta files.\n")
            return 2

        # Fail before creating any output if an input is missing.
        for fasta in fastalist:
            if not os.path.exists(fasta):
                sys.stderr.write("Fasta file not found: %s\n" % fasta)
                return 2

        #
        # Main body
        #

        # dirname is '' when the output file has no directory component;
        # there is nothing to create in that case.
        outdir = os.path.dirname(outfile)
        if outdir and not os.path.isdir(outdir):
            os.makedirs(outdir)

        #
        # Get pooled transcriptome
        outbase = os.path.splitext(outfile)[0]
        lenfile = outbase + '.info'
        with open(outfile, 'w') as seqout, open(lenfile, 'w') as lenout:
            for fasta, hapname in zip(fastalist, haplist):
                sys.stderr.write("Adding suffix '_%s' to the sequence ID's of %s...\n" % (hapname, fasta))
                if _append_haplotype(fasta, hapname, seqout, lenout) == 0:
                    sys.stderr.write("Warning: no sequences found in %s\n" % fasta)

        #
        # Build bowtie index for the pooled transcriptome
        if build_bowtie_index:
            out_index = outbase + '.bowtie1'
            sys.stderr.write("Building bowtie1 index...\n")
            try:
                # Argument list, no shell: paths with spaces or shell
                # metacharacters are passed through untouched.
                status = subprocess.call(["bowtie-build", outfile, out_index])
            except OSError as err:
                sys.stderr.write("Could not run bowtie-build: %s\n" % err)
                return 1
            if status != 0:
                sys.stderr.write("bowtie-build failed with exit status %d.\n" % status)
                return 1

        #
        # End of main body
        #

    except Usage as err:
        sys.stderr.write(sys.argv[0].split("/")[-1] + ": " + str(err.msg) + "\n")
        return 2


if __name__ == "__main__":
    # Run the tool and hand main()'s return value to the interpreter as the
    # process exit status.
    exit_status = main()
    sys.exit(exit_status)
