# Author : berru@riseup.net
# Source : https://git.sr.ht/~berru/url2epub

import pandoc
import requests
import argparse
import re
import string
import random
import tempfile
from readability import Document
from sys import exit, stderr
from unidecode import unidecode
from bs4 import BeautifulSoup
from typing import Dict, Optional
from os import fdopen


def sanitize_argument(cli_argument: str) -> str:
    """Return ``cli_argument`` with every single-quote character removed."""
    # Splitting on the quote and re-joining drops each occurrence.
    return "".join(cli_argument.split("'"))


def to_cli_argument(metadata_name: str, metadata_value: Optional[str]) -> str:
    """Format a metadata name/value pair as a ``--metadata`` option string.

    Single quotes are stripped from both the name and the value. When the
    value is empty or ``None`` only ``--metadata=<name>`` is produced;
    otherwise the value is appended as ``:'<value>'``.
    """
    flag = f"--metadata={sanitize_argument(metadata_name)}"
    if not metadata_value:
        return flag
    return f"{flag}:'{sanitize_argument(metadata_value)}'"


def download_cover_image(img_url: str) -> Optional[str]:
    """Download the image at ``img_url`` to a temporary file and return its path.

    The temporary file's suffix is the last path segment of the URL with any
    query string removed. The file is not deleted by this function.

    Returns ``None`` when the download or the write fails (network error,
    invalid URL, HTTP error status, disk error, ...).
    """
    img_suffix = img_url.split("/")[-1].split("?")[0]

    try:
        img_response = requests.get(img_url)
        # Do not store an HTTP error page as if it were the cover image.
        img_response.raise_for_status()
        fd, path = tempfile.mkstemp(suffix=img_suffix)
        with fdopen(fd, "wb") as f:
            f.write(img_response.content)
        return path
    except Exception:
        # Best effort: any failure simply means "no cover image". Unlike a
        # bare ``except:``, this lets KeyboardInterrupt and SystemExit
        # propagate so the user can still abort the program.
        return None


def extract_opengraph_tags(html: str) -> Dict[str, str]:
    """Extract OpenGraph metadata from ``html``.

    Returns a mapping from the property name without its ``og:`` prefix
    (e.g. ``"title"``, ``"image"``) to the tag's ``content`` attribute, or
    ``""`` when the tag has no ``content``.

    Structured properties such as ``og:image:width`` are skipped: reducing
    them to their root name would overwrite the root value (the image URL
    would be replaced by its width). When the same property appears several
    times, the first occurrence is kept.
    """
    og_tags: Dict[str, str] = {}
    for meta in BeautifulSoup(html, "html.parser").find_all("meta", property=True):
        prefix, _, name = meta["property"].partition(":")
        # Keep only direct ``og:<name>`` properties with a non-empty name.
        if prefix != "og" or not name or ":" in name:
            continue
        og_tags.setdefault(name, meta.get("content", ""))
    return og_tags


def random_alphanum(size: int) -> str:
    """Return a random string of ``size`` ASCII letters and digits.

    The characters are drawn with the ``random`` module, so the result is
    not suitable for security-sensitive tokens.
    """
    alphabet = string.ascii_uppercase + string.ascii_lowercase + string.digits
    # Honour the requested length; it was previously hard-coded to 16.
    return "".join(random.choice(alphabet) for _ in range(size))


def sanitize_filename(page_title: str) -> str:
    """Turn a page title into a lowercase ASCII ``.epub`` file name.

    The title is lowercased and transliterated with ``unidecode``; every run
    of characters other than ``a-z``, ``0-9`` and ``-`` becomes a single
    ``_`` separator, and leading/trailing separators are dropped. If nothing
    is left, a random alphanumeric name is used instead.

    >>> sanitize_filename("élève")
    'eleve.epub'
    >>> sanitize_filename("à l'école")
    'a_l_ecole.epub'
    >>> sanitize_filename("Éco-paturage : pourquoi ? économie / écologie / société")
    'eco-paturage_pourquoi_economie_ecologie_societe.epub'
    """
    ascii_title = unidecode(page_title.lower())
    # Splitting on "_" and discarding empty pieces collapses repeated
    # separators and trims them from both ends.
    parts = [
        part
        for part in re.sub(r"[^a-z0-9\-_]+", "_", ascii_title).split("_")
        if part
    ]
    sane_name: str = "_".join(parts)
    if not sane_name:
        sane_name = random_alphanum(8)
    return f"{sane_name}.epub"


def main():
    """Command-line entry point.

    Parses the arguments, downloads the page, extracts its readable content
    with Readability, forwards the page's OpenGraph tags to Pandoc as
    metadata (using ``og:image`` as the cover image when it can be
    downloaded) and writes the resulting epub.

    Exits with status -1 when the page request does not return HTTP 200.
    """
    parser = argparse.ArgumentParser(
        prog="url2epub",
        usage="url2epub.py --url myverygoodwebsite.com/article --outfile /mnt/my-ereader/article.epub",
        description="Downloads a webpage, extracts the readable content using Readability, and saves it to an epub to be read on your favorite e-reader using Pandoc.",
    )
    parser.add_argument(
        metavar="URL",
        dest="url",
        help="URL to get the epub from. If not present, 'http://' will be prepended to the argument.",
    )
    # NOTE: FileType("w") makes argparse open the outfile at parse time; only
    # its name is used below, Pandoc writes the actual content.
    parser.add_argument(
        "-o",
        "--outfile",
        type=argparse.FileType("w"),
        help="Outfile to save the ebook to. If not present, will try to make up a name from the webpage's title. Might overwrite existing file, use with caution.",
    )
    parser.add_argument(
        "--epub3",
        action="store_true",
        default=False,
        help="Saves the book in epub3 format (default=False)",
    )
    parser.add_argument('--version', action='version', version='%(prog)s 1.1.0')

    options = parser.parse_args()
    url = options.url if options.url.startswith("http") else f"http://{options.url}"

    response = requests.get(url)
    if response.status_code != 200:
        print(
            f"Error fetching '{url}': HTTP status {response.status_code}",
            file=stderr,
        )
        exit(-1)

    document = Document(response.content.decode())
    og_tags: Dict[str, str] = extract_opengraph_tags(response.content)
    pandoc_options = [to_cli_argument(k, v) for k, v in og_tags.items()]

    # The cover image is optional: pages without an og:image tag are still
    # converted, just without a cover.
    if "image" in og_tags:
        image_filepath = download_cover_image(og_tags["image"])
        if image_filepath:
            pandoc_options.append(f"--epub-cover-image={image_filepath}")
        else:
            print(
                f"Error downloading cover image '{og_tags['image']}'", file=stderr
            )

    outfile_path = (
        options.outfile.name
        if options.outfile
        else sanitize_filename(document.short_title())
    )
    pandoc.write(
        pandoc.read(source=document.summary(), format="HTML"),
        file=outfile_path,
        format="epub3" if options.epub3 else "epub",
        options=pandoc_options,
    )
    print(f"Saved to '{outfile_path}'")


# Run the command-line interface only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
