import re
import nltk
from cleantweet import CleanTweet
from nltk.tokenize import word_tokenize, sent_tokenize
from collections import Counter
# Fetch the NLTK data packages at import time (quietly, so importing the
# module does not print download progress). 'punkt' / 'punkt_tab' are the
# tokenizer models used by word_tokenize and sent_tokenize; 'stopwords' and
# 'vader_lexicon' are not referenced by the code in this module and are
# presumably needed by the CleanTweet base class -- confirm before removing.
for _nltk_resource in ('punkt', 'stopwords', 'punkt_tab', 'vader_lexicon'):
    nltk.download(_nltk_resource, quiet=True)


class ClassificationTweet(CleanTweet):
    """
    Keyword-based text classifiers layered on top of ``CleanTweet``.

    Each public method analyses the string returned by the inherited
    ``clean()`` method, tokenised with NLTK, and returns a plain dictionary:
    spam detection, hate speech detection, religious undertones, racial tone
    classification and a simple extractive question answering helper.

    All classifiers are heuristic (exact, lower-cased token matching against
    small keyword lists); inflected forms such as plurals are not matched.
    """

    # Words that mark a hostile stance when they appear next to racial keywords.
    _NEGATIVE_CONTEXT_WORDS = frozenset(
        {'hate', 'against', 'oppose', 'reject', 'deny', 'blame'}
    )

    # Tokens carrying no content in a question; ignored when scoring sentences.
    _QUESTION_STOPWORDS = frozenset({
        'what', 'who', 'where', 'when', 'why', 'how', 'is', 'are', 'was',
        'were', 'do', 'does', 'did', '?', 'the', 'a', 'an',
    })

    # Marker tokens used to name the religion(s) a text refers to. Insertion
    # order defines the order of the 'religions_mentioned' result list.
    _RELIGION_MARKERS = {
        'Christianity': frozenset({'god', 'jesus', 'christ', 'bible', 'christian'}),
        'Islam': frozenset({'allah', 'islam', 'quran', 'prophet', 'muslim'}),
        'Buddhism': frozenset({'buddha', 'buddhist', 'buddhism'}),
        'Hinduism': frozenset({'hindu', 'hinduism', 'veda'}),
    }

    def __init__(self, text, *args):
        """
        Initialise the base ``CleanTweet`` and the per-instance keyword sets.

        :param text: forwarded unchanged to ``CleanTweet.__init__``
        :param args: extra positional arguments forwarded to ``CleanTweet.__init__``
        """
        super().__init__(text, *args)
        # Entries containing a space are phrases; spam_detection matches them
        # against consecutive tokens rather than against single tokens.
        self._spam_keywords = {
            'free', 'click', 'now', 'limited', 'offer', 'winner', 'congratulations',
            'prize', 'claim', 'urgent', 'act now', 'buy now', 'discount', 'save',
            'guaranteed', 'risk free', 'no credit check', 'make money', 'work from home'
        }

        self._hate_keywords = {
            'hate', 'kill', 'destroy', 'attack', 'violence', 'threat', 'harm',
            'stupid', 'idiot', 'moron', 'worthless', 'pathetic', 'disgusting'
        }

        self._religious_keywords = {
            'god', 'jesus', 'christ', 'allah', 'buddha', 'hindu', 'islam', 'christian',
            'bible', 'quran', 'prayer', 'pray', 'worship', 'faith', 'holy', 'sacred',
            'divine', 'blessed', 'amen', 'hallelujah', 'prophet', 'saint', 'angel',
            'heaven', 'hell', 'sin', 'salvation', 'redemption', 'spiritual'
        }

        self._racial_keywords = {
            'race', 'racial', 'ethnic', 'racist', 'discrimination', 'prejudice',
            'stereotype', 'minority', 'majority', 'white', 'black', 'asian', 'hispanic',
            'african', 'european', 'indigenous', 'native', 'immigrant', 'refugee'
        }

    def _clean_and_tokenize(self):
        """
        Run ``clean()`` once and tokenise its lower-cased result.

        :return: a ``(cleaned_text, tokens)`` tuple; ``tokens`` keeps order and duplicates
        """
        cleaned = self.clean()
        return cleaned, word_tokenize(cleaned.lower())

    @staticmethod
    def _punctuation_per_sentence(cleaned):
        """
        Average number of '!' and '?' characters per sentence of ``cleaned``.

        :param cleaned: the text to inspect
        :return: mark count divided by sentence count (treated as 1 when there are no sentences)
        """
        marks = sum(1 for ch in cleaned if ch in '!?')
        return marks / max(len(sent_tokenize(cleaned)), 1)

    def spam_detection(self, threshold: float = 0.3) -> dict:
        """
        Method to detect spam content in the text

        The score combines three signals: the share of unique tokens that are
        spam keywords (weight 0.5), the share of upper-case characters
        (capped at 0.3, weight 0.3) and the number of '!'/'?' per sentence
        (capped at 0.2, weight 0.2).

        :param threshold: threshold for spam detection (0.0 to 1.0), default is 0.3
        :return: a dictionary with the keys 'is_spam', 'spam_score',
                 'keyword_matches', 'total_words' and 'confidence'
        """
        cleaned, tokens = self._clean_and_tokenize()
        unique_tokens = set(tokens)

        # Single-word keywords are matched per token. Multi-word keywords
        # ('act now', 'work from home', ...) can never equal a single token,
        # so they are searched as space-delimited phrases in the token stream.
        single_words = {kw for kw in self._spam_keywords if ' ' not in kw}
        phrases = self._spam_keywords - single_words
        token_stream = ' ' + ' '.join(tokens) + ' '
        phrase_hits = sum(1 for phrase in phrases if ' ' + phrase + ' ' in token_stream)

        spam_matches = len(unique_tokens & single_words) + phrase_hits
        total_words = len(unique_tokens)

        # Clamp so that phrase hits can never push the ratio above 1.0.
        spam_score = min(spam_matches / max(total_words, 1), 1.0)

        # NOTE(review): both ratios below are measured on the cleaned text; if
        # clean() lower-cases the text or strips punctuation they are always 0
        # -- confirm against CleanTweet.clean().
        caps_ratio = sum(1 for ch in cleaned if ch.isupper()) / max(len(cleaned), 1)
        punct_ratio = self._punctuation_per_sentence(cleaned)

        final_score = (
            (spam_score * 0.5)
            + (min(caps_ratio, 0.3) * 0.3)
            + (min(punct_ratio, 0.2) * 0.2)
        )

        return {
            'is_spam': final_score >= threshold,
            'spam_score': round(final_score, 3),
            'keyword_matches': spam_matches,
            'total_words': total_words,
            # 'high' when the score is clearly on one side of the threshold.
            'confidence': 'high' if abs(final_score - threshold) > 0.2 else 'medium'
        }

    def hate_speech_detection(self, threshold: float = 0.2) -> dict:
        """
        Method to detect hateful or harmful speech in the text

        The score is 0.7 times the share of unique tokens that are hate
        keywords plus 0.3 times the number of '!'/'?' per sentence (capped at 0.3).

        :param threshold: threshold for hate speech detection (0.0 to 1.0), default is 0.2
        :return: a dictionary with the keys 'is_hateful', 'hate_score',
                 'keyword_matches', 'total_words' and 'severity'
        """
        cleaned, tokens = self._clean_and_tokenize()
        unique_tokens = set(tokens)

        hate_matches = len(unique_tokens & self._hate_keywords)
        total_words = len(unique_tokens)
        hate_score = hate_matches / max(total_words, 1)

        punct_score = min(self._punctuation_per_sentence(cleaned), 0.3)
        final_score = (hate_score * 0.7) + (punct_score * 0.3)

        is_hateful = final_score >= threshold

        # Severity is derived from is_hateful so the two fields never disagree
        # (text that is not flagged is always 'low'; flagged text is at least 'medium').
        if not is_hateful:
            severity = 'low'
        elif final_score > 0.4:
            severity = 'high'
        else:
            severity = 'medium'

        return {
            'is_hateful': is_hateful,
            'hate_score': round(final_score, 3),
            'keyword_matches': hate_matches,
            'total_words': total_words,
            'severity': severity
        }

    def religious_undertones(self) -> dict:
        """
        Method to detect religious undertones in the text

        Matching is done on whole tokens, so words that merely contain a
        keyword (e.g. "Tallahassee", "Christmas") are not counted. The words
        that identify a specific religion also count as religious keywords,
        which keeps 'has_religious_content' consistent with 'religions_mentioned'.

        :return: a dictionary with the keys 'has_religious_content',
                 'religious_score', 'keyword_matches', 'matched_keywords'
                 (sorted), 'religions_mentioned' and 'intensity'
        """
        _, tokens = self._clean_and_tokenize()
        unique_tokens = set(tokens)

        vocabulary = self._religious_keywords.union(*self._RELIGION_MARKERS.values())
        religious_matches = unique_tokens & vocabulary
        match_count = len(religious_matches)
        total_words = len(unique_tokens)

        religious_score = match_count / max(total_words, 1)

        religions_mentioned = [
            religion
            for religion, markers in self._RELIGION_MARKERS.items()
            if not unique_tokens.isdisjoint(markers)
        ]

        if religious_score > 0.1:
            intensity = 'high'
        elif religious_score > 0.05:
            intensity = 'medium'
        else:
            intensity = 'low'

        return {
            'has_religious_content': match_count > 0,
            'religious_score': round(religious_score, 3),
            'keyword_matches': match_count,
            # Sorted so the output does not depend on set iteration order.
            'matched_keywords': sorted(religious_matches),
            'religions_mentioned': religions_mentioned,
            'intensity': intensity
        }

    def racial_tone_classification(self) -> dict:
        """
        Method to classify the racial tone of the text

        The tone is 'neutral' unless more than 5% of the unique tokens are
        racial keywords; above that it is 'negative' when a negative-context
        word is also present and 'discussion' otherwise.

        :return: a dictionary with the keys 'has_racial_content',
                 'racial_score', 'keyword_matches', 'matched_keywords'
                 (sorted), 'tone' and 'has_negative_context'
        """
        _, tokens = self._clean_and_tokenize()
        unique_tokens = set(tokens)

        racial_matches = unique_tokens & self._racial_keywords
        match_count = len(racial_matches)
        total_words = len(unique_tokens)

        racial_score = match_count / max(total_words, 1)

        negative_context = not unique_tokens.isdisjoint(self._NEGATIVE_CONTEXT_WORDS)

        if racial_score > 0.05:
            tone = 'negative' if negative_context else 'discussion'
        else:
            tone = 'neutral'

        return {
            'has_racial_content': match_count > 0,
            'racial_score': round(racial_score, 3),
            'keyword_matches': match_count,
            # Sorted so the output does not depend on set iteration order.
            'matched_keywords': sorted(racial_matches),
            'tone': tone,
            'has_negative_context': negative_context
        }

    def question_answering(self, question: str) -> dict:
        """
        Simple question answering method using keyword matching and sentence extraction

        Each sentence of the cleaned text is scored by the fraction of the
        question's content words it contains; the best sentence is the answer.

        :param question: the question to answer
        :return: a dictionary with the keys 'answer', 'confidence',
                 'relevant_sentences' (up to three sentences with a non-zero
                 score, best first) and 'score'
        """
        # Keep only content words: drop question/stop words and tokens that are
        # pure punctuation, which would otherwise "match" any punctuated sentence.
        question_words = {
            token
            for token in word_tokenize(question.lower())
            if token not in self._QUESTION_STOPWORDS
            and any(ch.isalnum() for ch in token)
        }
        keyword_total = max(len(question_words), 1)

        # Keyed by sentence text, so a repeated sentence is scored only once.
        sentence_scores = {}
        for sentence in sent_tokenize(self.clean()):
            sentence_words = set(word_tokenize(sentence.lower()))
            sentence_scores[sentence] = len(question_words & sentence_words) / keyword_total

        # sorted() is stable: equally scored sentences keep their text order.
        ranked_sentences = sorted(sentence_scores.items(), key=lambda item: item[1], reverse=True)
        top_sentences = [sentence for sentence, score in ranked_sentences[:3] if score > 0]

        if top_sentences:
            answer = top_sentences[0]
            best_score = sentence_scores[answer]
            if best_score > 0.5:
                confidence = 'high'
            elif best_score > 0.2:
                confidence = 'medium'
            else:
                confidence = 'low'
        else:
            answer = "I couldn't find a relevant answer in the text."
            best_score = 0
            confidence = 'low'

        return {
            'answer': answer,
            'confidence': confidence,
            'relevant_sentences': top_sentences,
            'score': round(best_score, 3) if top_sentences else 0
        }

