import re
import nltk
from nltk.tokenize import word_tokenize
# Fetch the NLTK data packages this module relies on (quietly, at import time).
for _resource in ('punkt', 'stopwords', 'punkt_tab'):
    nltk.download(_resource, quiet=True)


class CleanTweet:
    """
    Clean and preprocess the text stored in a plain-text file.

    Every ``remove_*`` method re-reads and re-cleans the file through
    :meth:`clean`, so each call reflects the file's current content.
    """

    # The literal tokens "id", "text" and "RT", matched as whole words only so
    # that words merely containing them (e.g. "video", "context") stay intact.
    _TAG_PATTERN = re.compile(r'\b(?:id|text|RT)\b')
    # Special characters, punctuation and digits that are deleted outright.
    _NOISE_PATTERN = re.compile(r'[#{}:_@\[\]0-9,%&*"?!/-]')
    # Any run of whitespace (spaces, tabs, newlines).
    _WHITESPACE_PATTERN = re.compile(r'\s+')
    # Words masked by remove_curse_words (compared case-insensitively).
    _CURSE_WORDS = frozenset({'fuck', 'hell', 'damn'})

    def __init__(self, text, *args):
        """
        a class for cleaning and preprocessing the text
        :param text: path of the .txt file containing the textual data you want to use for
        the Natural Language Processing task.
        :param args: accepted for backward compatibility and ignored (leave this empty).
        """
        self.text = text
        # Number of tokens produced by the most recent clean() call.
        self.word_count = 0
        self.special_characters = ""

    @classmethod
    def _normalise(cls, raw):
        """Strip tag tokens and special characters from *raw* and collapse whitespace."""
        # Tags are removed first, while quotes/colons/digits still delimit them.
        without_tags = cls._TAG_PATTERN.sub('', raw)
        without_noise = cls._NOISE_PATTERN.sub('', without_tags)
        # Collapse to a single space (instead of deleting) so that words
        # separated only by a newline or a double space are not fused together.
        return cls._WHITESPACE_PATTERN.sub(' ', without_noise).strip()

    def _remove_words(self, unwanted):
        """Return the cleaned text without the words whose lowercase form is in *unwanted*."""
        return ' '.join(
            word for word in self.clean().split() if word.lower() not in unwanted)

    def clean(self, tokenize_method: bool = False) -> str:
        """
        method to clean the text
        :param tokenize_method: if set to True, the text is sentence tokenized, else it is
        word tokenized and only purely alphabetic tokens are kept. The default is word tokenize
        :return: the tokens joined by single spaces as a string

        As a side effect ``self.word_count`` is set to the number of tokens
        produced (sentences when ``tokenize_method`` is True, words otherwise).
        :raises OSError: if the file at ``self.text`` cannot be opened.
        """
        with open(self.text, "r", encoding='utf8', errors='ignore') as read_object:
            raw = read_object.read()

        normalised = self._normalise(raw)
        if tokenize_method:
            tokens = nltk.sent_tokenize(normalised)
        else:
            tokens = [token for token in word_tokenize(normalised) if token.isalpha()]
        self.word_count = len(tokens)
        return ' '.join(tokens)

    def show_word_collocations(self):
        """
        method to show the corresponding Word Collocations in the document
        :return: a list of the word collocations found in the cleaned text
        (element type depends on the installed NLTK version)
        """
        # nltk.Text expects a sequence of tokens; handing it the file path
        # string would make it analyse the individual characters of the path.
        tokens = self.clean().split()
        return nltk.Text(tokens).collocation_list()

    def remove_curse_words(self):
        """
        method to mask curse words like 'fuck', 'hell', and 'damn'
        :return: a list of the words in the cleaned text, with every curse word
        (matched case-insensitively) replaced by '****'
        """
        return [
            '****' if word.lower() in self._CURSE_WORDS else word
            for word in self.clean().split()
        ]

    def remove_stopwords(self):
        """
        Method to remove common English stopwords from the text
        :return: text with stopwords removed as a string
        """
        from nltk.corpus import stopwords
        try:
            stop_words = set(stopwords.words('english'))
        except LookupError:
            # Corpus not available locally: fetch it once instead of hitting
            # the downloader on every call.
            nltk.download('stopwords', quiet=True)
            stop_words = set(stopwords.words('english'))
        return self._remove_words(stop_words)

    def remove_conjunctions(self):
        """
        Method to remove coordinating conjunctions (and, but, or, nor, for, yet, so) from the text
        :return: text with conjunctions removed as a string
        """
        return self._remove_words({'and', 'but', 'or', 'nor', 'for', 'yet', 'so'})

    def remove_pronouns(self):
        """
        Method to remove common pronouns (I, me, my, mine, we, us, our, etc.) from the text
        :return: text with pronouns removed as a string
        """
        pronouns = {
            'i', 'me', 'my', 'mine', 'myself',
            'we', 'us', 'our', 'ours', 'ourselves',
            'you', 'your', 'yours', 'yourself', 'yourselves',
            'he', 'him', 'his', 'himself',
            'she', 'her', 'hers', 'herself',
            'it', 'its', 'itself',
            'they', 'them', 'their', 'theirs', 'themselves'
        }
        return self._remove_words(pronouns)

    def remove_custom_words(self, custom_words):
        """
        Method to remove any custom list of words provided by the user
        :param custom_words: set, frozenset, list or tuple of words to remove
        (matched case-insensitively)
        :return: text with custom words removed as a string
        :raises ValueError: if custom_words is not one of the accepted
        collections or contains a non-string element
        """
        # A bare string is rejected on purpose: iterating it would silently
        # treat every character as a word to remove.
        if not isinstance(custom_words, (set, frozenset, list, tuple)) or not all(
                isinstance(word, str) for word in custom_words):
            raise ValueError("custom_words must be a set or list of strings")

        return self._remove_words({word.lower() for word in custom_words})

    def remove_articles(self):
        """
        Method to remove articles (a, an, the) from the text
        :return: text with articles removed as a string
        """
        return self._remove_words({'a', 'an', 'the'})

    def remove_prepositions(self):
        """
        Method to remove common prepositions from the text
        :return: text with prepositions removed as a string
        """
        prepositions = {
            'in', 'on', 'at', 'to', 'for', 'with', 'by', 'of',
            'from', 'about', 'against', 'between', 'into', 'through',
            'during', 'before', 'after', 'above', 'below', 'up',
            'down', 'under', 'over', 'again', 'further', 'beyond'
        }
        return self._remove_words(prepositions)
