
#################################################
### THIS FILE WAS AUTOGENERATED! DO NOT EDIT! ###
#################################################
# file to edit: ./nb/main.ipynb

import sys
if __name__ == '__main__': sys.path.append('..')
import download_youtube_subtitle.common as common
from pprint import pprint
def pj(*args, **kargs):
    """Pretty-print the given arguments, but only when ``common.IN_JUPYTER`` is truthy.

    All positional and keyword arguments are forwarded unchanged to
    ``pprint``; outside of Jupyter the call is a no-op.
    """
    if not common.IN_JUPYTER:
        return
    pprint(*args, **kargs)

from functools import partial
import sys

# Shortcut for print() that passes "ERR: " as the first printed argument of every message.
perr = partial(print, "ERR: ")

import requests
import socket
# Use a 5-second default timeout for socket objects created from here on.
socket.setdefaulttimeout(5.)

# dealing with xml.dom
def getVal(dom, key):
    """Return the value of the attribute named *key* on the DOM node *dom*.

    The attribute is looked up through ``dom.attributes``; a lookup failure
    for a missing attribute is not caught here.
    """
    return dom.attributes[key].value

def eachTxt(txt):
    """Convert one ``<text>`` DOM element into a caption dictionary.

    Args:
        txt: DOM element carrying ``start`` and ``dur`` attributes and,
            optionally, a text child holding the HTML-escaped caption.

    Returns:
        A dict with the keys ``"start"`` and ``"dur"`` (attribute values as
        strings) and ``"text"`` (the caption with HTML entities unescaped).
        An element without any content yields an empty ``"text"``.
    """
    start = getVal(txt, 'start')
    dur = getVal(txt, 'dur')
    # An empty element (<text .../>) has no child node, so firstChild is None;
    # treat it as an empty caption instead of failing on `.data`.
    first = txt.firstChild
    raw = first.data if first is not None else ''
    return {
        "start": start,
        "dur": dur,
        "text": html.unescape(raw),
    }

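# Note: appending `&tlang=<lang code>` (e.g. `tlang=zh-Hans`) to a caption
# track's baseUrl makes the server return the translated transcript;
# main() uses this to fetch the translation.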

# getting track info

def get_data(link, timeout=5.0):
    """Fetch *link* with an HTTP GET and return the response body as text.

    Args:
        link: URL to download.
        timeout: Seconds to wait for the server before giving up. requests
            applies no timeout unless one is passed explicitly, so without
            it a stalled connection would block forever. Pass ``None`` to
            wait indefinitely.

    Returns:
        The decoded response body (``response.text``).
    """
    response = requests.get(link, timeout=timeout)
    return response.text

import re
import json
import urllib
def get_tracks_title(data):
    """Extract the caption track list and the video title from raw video info.

    Args:
        data: URL-encoded text of the video-info response.

    Returns:
        A ``(captionTracks, title)`` tuple: the parsed ``captionTracks`` JSON
        list and the first ``title`` string found in the decoded data.

    Exits the process with status 1 (after printing an error) when the data
    mentions no caption tracks or when the expected structure is not found.
    """
    # Import the submodule explicitly: a bare `import urllib` does not
    # guarantee that `urllib.parse` has been loaded.
    from urllib.parse import unquote

    decodedData = unquote(data)
    if 'captionTracks' not in decodedData:
        perr("no caption available. ;(")
        sys.exit(1)

    tracks_match = re.search(r'({"captionTracks":.*isTranslatable":(true|false)}])', decodedData)
    title_match = re.search(r'title":"(.*?)"', decodedData)
    if tracks_match is None or title_match is None:
        # Previously a failed search crashed with AttributeError on None.
        perr("could not parse caption tracks or title from server response")
        sys.exit(1)

    # The pattern stops right after the closing "]" of the track list, so the
    # enclosing object has to be closed by hand before it is valid JSON.
    captionTracks = json.loads(tracks_match.group(1) + '}')['captionTracks']
    return captionTracks, title_match.group(1)

# dealing with transcript
import math
from functools import partial
import sys
from xml.dom.minidom import parseString
import html
def parseTranscript(transcript):
    """Parse a transcript HTTP response into a list of caption dicts.

    Args:
        transcript: Response object whose ``text`` attribute holds the
            transcript XML.

    Returns:
        A list with one ``eachTxt`` result per ``<text>`` element, in
        document order (empty when the document has no such element).

    Exits the process with status 1, after printing the server response,
    when the body cannot be parsed as XML.
    """
    try:
        dom = parseString(transcript.text)
    except Exception:
        # `Exception` rather than a bare except, so that Ctrl-C
        # (KeyboardInterrupt) and SystemExit are not swallowed here.
        perr("check your lang code")
        perr("server response")
        perr(transcript.text)
        sys.exit(1)
    return [eachTxt(node) for node in dom.getElementsByTagName('text')]

def each_sent(o, file=None):
    """Print one caption entry as a timestamp header followed by its text.

    Args:
        o: Caption dict with ``'start'`` (seconds, anything ``float`` accepts)
            and ``'text'``; an optional truthy ``'translate_text'`` is printed
            on its own line after the text.
        file: Writable text stream. ``None`` (the default) means the
            ``sys.stdout`` in effect when the function is called, so stdout
            redirection is honoured instead of being frozen at import time.
    """
    out = sys.stdout if file is None else file
    start = float(o['start'])
    minute = math.floor(start / 60)
    second = math.floor(start % 60)
    p = partial(print, file=out)
    p(f"---------{minute:02d}:{second:02d}----------")
    p(o['text'])
    translate_text = o.get('translate_text')
    if translate_text:
        p(translate_text)


# dealing with valid filename
# https://github.com/django/django/blob/master/django/utils/text.py
import re
def get_valid_filename(s):
    """
    Turn an arbitrary value into a string that is safe to use as a filename.

    The value is converted with ``str`` and stripped of surrounding
    whitespace, each remaining space becomes an underscore, and every
    character that is not a word character, dash or dot is dropped.
    >>> get_valid_filename("john's portrait in 2004.jpg")
    'johns_portrait_in_2004.jpg'
    """
    trimmed = str(s).strip()
    underscored = '_'.join(trimmed.split(' '))
    return re.sub(r'(?u)[^-\w.]', '', underscored)

def parseVideoID(videoID):
    """Normalise a video id or YouTube URL and build the related links.

    Args:
        videoID: Either a bare video id or a URL containing ``youtu``
            (``watch?v=ID``, ``youtu.be/ID``, ``/embed/ID``, ``/shorts/ID``
            or ``/live/ID``).

    Returns:
        A ``(videoID, video_link, data_link)`` tuple: the extracted id, the
        watch-page URL and the video-info URL for that id.

    Raises:
        ValueError: if the input contains ``youtu`` but no id can be found.
    """
    if 'youtu' in videoID:
        found = re.search('v=([^&]+)', videoID)
        if found is None:
            # Short and path-style links carry the id in the path, not in a
            # `v=` query parameter.
            found = re.search(r'(?:youtu\.be|/embed|/shorts|/live)/([\w-]+)', videoID)
        if found is not None:
            videoID = found.group(1)
        elif not re.fullmatch(r'[\w-]+', videoID):
            # Not a URL we understand and not a plain id that merely happens
            # to contain "youtu".
            raise ValueError(f"cannot find a video id in {videoID!r}")

    video_link = f'https://www.youtube.com/watch?v={videoID}'
    data_link = f"https://youtube.com/get_video_info?video_id={videoID}"
    return videoID, video_link, data_link

import fire
import sys
from functools import partial
import json
import re
def main(videoID, output_file=None, save_to_file=True, translation='zh-Hans', to_json=False):
    """
    download youtube closed caption(subtitles) by videoID

    Examples:
    dl-youtube-cc 5tKOV0KqPlg --save_to_file=False # print stuff in console
    dl-youtube-cc 5tKOV0KqPlg --output_file='test.txt' # print stuff in named file
    dl-youtube-cc 5tKOV0KqPlg --to_json=True # print stuff in json
    dl-youtube-cc 5tKOV0KqPlg --translation 'ja' # translate captions to Japanese
    dl-youtube-cc 5tKOV0KqPlg --translation False # without translation

    Argument:
    videoID: string, the id of youtube video, the string after 'v=' in a youtube video link
    output_file: string, default to video title
    save_to_file: bool, default to True, True or False
    translation: bool or string, default to 'zh-Hans' for simplified Chinese, False or lang code, see ./lang_code.json for full list
    to_json: bool, default to False, export caption to json

    """

    videoID, video_link, data_link = parseVideoID(videoID)
    data = get_data(data_link)
    captionTracks, title = get_tracks_title(data)

    info = partial(print, "INFO: ")

    info("available caption(s) are")

    for caption in captionTracks:
        info(caption['name']['simpleText'], '')

    # Only the first listed caption track is downloaded.
    track = captionTracks[0]
    info('using', track['name']['simpleText'])

    # requests applies no timeout unless one is passed explicitly, so hand it
    # the process-wide socket default to avoid blocking forever.
    timeout = socket.getdefaulttimeout()

    baseUrl = track['baseUrl']
    subtitle = parseTranscript(requests.get(baseUrl, timeout=timeout))

    if translation:
        # Adding `tlang` to the track URL returns the translated transcript.
        translated_url = baseUrl + '&tlang=' + translation
        translated = parseTranscript(requests.get(translated_url, timeout=timeout))
        for sub, tr in zip(subtitle, translated):
            sub['translate_text'] = tr['text']

    f = sys.stdout
    if save_to_file:
        if output_file is None:
            extension = 'json' if to_json else 'txt'
            output_file = get_valid_filename(f'{title}.{extension}')
        f = open(output_file, 'w', encoding='UTF-8')
        info("save to ", output_file)

    try:
        if to_json:
            json.dump(subtitle, f, indent=4, ensure_ascii=False)
            return

        pfile = partial(print, file=f)
        pfile(video_link)
        for sent in subtitle:
            each_sent(sent, file=f)
            pfile()
    finally:
        # Close the output file on every exit path (including the early JSON
        # return and errors); stdout itself must stay open.
        if f is not sys.stdout:
            f.close()



from functools import partial
def set_fire(fn):
    """Hand *fn* to ``fire.Fire``.

    Nothing happens when ``common.IN_TRAVIS`` or ``common.IN_JUPYTER`` is
    truthy. The function always returns ``None``.
    """
    skip = common.IN_TRAVIS or common.IN_JUPYTER
    if not skip:
        fire.Fire(fn)
# When executed as a script, start the command line interface unless one of
# the common.IN_TRAVIS / common.IN_JUPYTER flags is set.
if __name__ == '__main__' and not (common.IN_TRAVIS or common.IN_JUPYTER):
    set_fire(main)
# Zero-argument callable that launches `main` through set_fire
# (presumably the package's console entry point; confirm in the packaging config).
fire_main = partial(set_fire, main)