"""
ChalkML Data Manipulation Engine (v3.0) - PRODUCTION-GRADE ML DATA ENGINEERING
Terminal-based data operations with deterministic execution

Philosophy: Command-line data mastery with FL Studio-level precision
Portable: Works on any machine, any OS
"""
import json
from pathlib import Path
from typing import Optional, List, Tuple, Union, Dict, Any
import hashlib
import shutil
from datetime import datetime
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.impute import SimpleImputer, KNNImputer
import warnings
warnings.filterwarnings('ignore')


class ChalkMLEngine:
    """
    Deterministic data manipulation engine for Chalk IDE.
    
    Features:
    - Column/row operations with intuitive notation (01N, N02)
    - Automatic backups before modifications
    - Undo/redo with checkpoint system
    - SHA-256 checksums for reproducibility
    - Terminal-based workflow
    """
    
    def __init__(self, workspace_path: Optional[str] = None):
        # Use current directory if no workspace specified (portable)
        if workspace_path is None:
            workspace_path = Path.cwd()
        self.workspace_path = Path(workspace_path)
        self.backup_dir = self.workspace_path / ".chalk" / "data_backups"
        self.backup_dir.mkdir(parents=True, exist_ok=True)
        
        # Operation history for undo/redo
        self.history = []
        self.max_history = 50
        
    def _parse_position(self, position: str, total_count: int) -> int:
        """
        Parse position notation to index.
        
        NEW NOTATION (v2):
        - "01N", "02N", "10N" = 1st, 2nd, 10th from left (index 0, 1, 9)
        - "N01", "N02", "N10" = Last, 2nd last, 10th last from right
        - "" or None = last (index -1)
        
        Args:
            position: Position string
            total_count: Total number of items
            
        Returns:
            int: Python index
        """
        if not position:
            return -1  # Last item by default
        
        position = position.upper().strip()
        
        if 'N' not in position:
            raise ValueError(f"Invalid position: {position}. Must contain 'N' (e.g., 01N, N02)")
        
        # Check if N is at start (right indexing) or at end (left indexing)
        if position.startswith('N'):
            # Right indexing: N01, N02, N10
            num_str = position[1:].lstrip('0') or '1'
            num = int(num_str)
            return total_count - num  # N01 = -1, N02 = -2
        elif position.endswith('N'):
            # Left indexing: 01N, 02N, 10N
            num_str = position[:-1].lstrip('0') or '1'
            num = int(num_str)
            return num - 1  # 01N = 0, 02N = 1
        else:
            raise ValueError(f"Invalid position: {position}. N must be at start or end")
    
    def _parse_range(self, range_expr: str, total_count: int) -> Tuple[int, int]:
        """
        Parse range expression to start and end indices.
        
        Examples:
        - "01N:05N" = indices 0 to 4 (columns 1-5 from left)
        - "01N:N03" = indices 0 to -3 (from 1st to 3rd last)
        - "N05:N01" = indices -5 to -1 (last 5 items)
        
        Args:
            range_expr: Range string like "01N:05N"
            total_count: Total number of items
        
        Returns:
            Tuple[int, int]: (start_index, end_index) inclusive
        """
        if ':' not in range_expr:
            raise ValueError(f"Invalid range: {range_expr}. Must contain ':'")
        
        parts = range_expr.split(':')
        if len(parts) != 2:
            raise ValueError(f"Invalid range format: {range_expr}")
        
        start_pos, end_pos = parts
        start_idx = self._parse_position(start_pos.strip(), total_count)
        end_idx = self._parse_position(end_pos.strip(), total_count)
        
        # Convert negative indices to positive for slicing
        if start_idx < 0:
            start_idx = total_count + start_idx
        if end_idx < 0:
            end_idx = total_count + end_idx
        
        return start_idx, end_idx + 1  # +1 for inclusive end
    
    def _create_backup(self, file_path: Path) -> str:
        """
        Create backup of file before modification.
        
        Returns:
            str: Backup file path
        """
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        backup_name = f"{file_path.stem}_{timestamp}{file_path.suffix}"
        backup_path = self.backup_dir / backup_name
        
        shutil.copy2(file_path, backup_path)
        
        # Store in history
        self.history.append({
            'original': str(file_path),
            'backup': str(backup_path),
            'operation': 'backup',
            'timestamp': timestamp
        })
        
        # Limit history size
        if len(self.history) > self.max_history:
            old_backup = self.history.pop(0)
            # Optionally delete old backups
            if Path(old_backup['backup']).exists():
                Path(old_backup['backup']).unlink()
        
        return str(backup_path)
    
    def _compute_checksum(self, file_path: Path) -> str:
        """Compute SHA-256 checksum for verification."""
        sha256 = hashlib.sha256()
        with open(file_path, 'rb') as f:
            for chunk in iter(lambda: f.read(8192), b''):
                sha256.update(chunk)
        return sha256.hexdigest()
    
    def remove_column(
        self, 
        file_path: str, 
        position: Optional[str] = None,
        output_path: Optional[str] = None
    ) -> Tuple[bool, str]:
        """
        Remove column from CSV file.
        
        Args:
            file_path: Path to CSV file
            position: Position notation (01N, 0N2, etc.) or None for last
            output_path: Output file path (None = overwrite)
        
        Returns:
            Tuple[bool, str]: (success, message)
        """
        try:
            file_path = Path(file_path)
            
            if not file_path.exists():
                return False, f"File not found: {file_path}"
            
            # Create backup
            backup_path = self._create_backup(file_path)
            
            # Load data
            df = pd.read_csv(file_path)
            
            if df.shape[1] == 0:
                return False, "CSV has no columns"
            
            # Parse position
            col_index = self._parse_position(position, df.shape[1])
            
            # Get column name
            col_name = df.columns[col_index]
            
            # Remove column
            df_modified = df.drop(columns=[col_name])
            
            # Save
            output_path = Path(output_path) if output_path else file_path
            df_modified.to_csv(output_path, index=False)
            
            # Compute checksums
            new_checksum = self._compute_checksum(output_path)
            
            message = (
                f"✅ Column '{col_name}' removed (position: {position or 'last'})\n"
                f"   Backup: {backup_path}\n"
                f"   Checksum: {new_checksum[:16]}...\n"
                f"   Columns: {df.shape[1]} → {df_modified.shape[1]}"
            )
            
            return True, message
            
        except Exception as e:
            return False, f"Error: {str(e)}"
    
    def remove_row(
        self,
        file_path: str,
        position: Optional[str] = None,
        output_path: Optional[str] = None
    ) -> Tuple[bool, str]:
        """
        Remove row from CSV file.
        
        Args:
            file_path: Path to CSV file
            position: Position notation (01N, 0N2, etc.) or None for last
            output_path: Output file path (None = overwrite)
        
        Returns:
            Tuple[bool, str]: (success, message)
        """
        try:
            file_path = Path(file_path)
            
            if not file_path.exists():
                return False, f"File not found: {file_path}"
            
            # Create backup
            backup_path = self._create_backup(file_path)
            
            # Load data
            df = pd.read_csv(file_path)
            
            if df.shape[0] == 0:
                return False, "CSV has no rows"
            
            # Parse position
            row_index = self._parse_position(position, df.shape[0])
            
            # Remove row
            df_modified = df.drop(df.index[row_index])
            
            # Save
            output_path = Path(output_path) if output_path else file_path
            df_modified.to_csv(output_path, index=False)
            
            # Compute checksums
            new_checksum = self._compute_checksum(output_path)
            
            message = (
                f"✅ Row removed (position: {position or 'last'}, index: {row_index})\n"
                f"   Backup: {backup_path}\n"
                f"   Checksum: {new_checksum[:16]}...\n"
                f"   Rows: {df.shape[0]} → {df_modified.shape[0]}"
            )
            
            return True, message
            
        except Exception as e:
            return False, f"Error: {str(e)}"
    
    def show_info(self, file_path: str) -> Tuple[bool, str]:
        """
        Display dataset information.
        
        Args:
            file_path: Path to dataset file
        
        Returns:
            Tuple[bool, str]: (success, info_string)
        """
        try:
            file_path = Path(file_path)
            
            if not file_path.exists():
                return False, f"File not found: {file_path}"
            
            # Load data
            df = pd.read_csv(file_path)
            
            # Compute statistics
            info = [
                f"📊 Dataset: {file_path.name}",
                f"   Shape: {df.shape[0]} rows × {df.shape[1]} columns",
                f"   Size: {file_path.stat().st_size / 1024:.1f} KB",
                f"   Checksum: {self._compute_checksum(file_path)[:16]}...",
                f"",
                f"   Columns:",
            ]
            
            for i, col in enumerate(df.columns, 1):
                dtype = df[col].dtype
                null_count = df[col].isnull().sum()
                info.append(f"     {i}. {col} ({dtype}) - {null_count} nulls")
            
            return True, "\n".join(info)
            
        except Exception as e:
            return False, f"Error: {str(e)}"
    
    def undo_last(self) -> Tuple[bool, str]:
        """
        Undo last operation by restoring from backup.
        
        Returns:
            Tuple[bool, str]: (success, message)
        """
        if not self.history:
            return False, "No operations to undo"
        
        try:
            last_op = self.history[-1]
            
            # Restore from backup
            shutil.copy2(last_op['backup'], last_op['original'])
            
            message = (
                f"✅ Undone: {last_op['operation']} at {last_op['timestamp']}\n"
                f"   Restored: {last_op['original']}"
            )
            
            self.history.pop()
            
            return True, message
            
        except Exception as e:
            return False, f"Undo failed: {str(e)}"
    
    def move_column(
        self,
        file_path: str,
        from_pos: str,
        to_pos: str,
        output_path: Optional[str] = None
    ) -> Tuple[bool, str]:
        """
        Move column from one position to another.
        
        Args:
            file_path: Path to CSV file
            from_pos: Source position (01N, N02, etc.)
            to_pos: Target position
            output_path: Output file path (None = overwrite)
        
        Returns:
            Tuple[bool, str]: (success, message)
        """
        try:
            file_path = Path(file_path)
            if not file_path.exists():
                return False, f"File not found: {file_path}"
            
            self._create_backup(file_path)
            df = pd.read_csv(file_path)
            
            from_idx = self._parse_position(from_pos, df.shape[1])
            to_idx = self._parse_position(to_pos, df.shape[1])
            
            col_name = df.columns[from_idx]
            col_data = df.iloc[:, from_idx]
            
            df_new = df.drop(columns=[col_name])
            df_new.insert(to_idx if to_idx >= 0 else len(df_new.columns), col_name, col_data)
            
            output_path = Path(output_path) if output_path else file_path
            df_new.to_csv(output_path, index=False)
            
            return True, f"✅ Moved column '{col_name}' from {from_pos} to {to_pos}"
        except Exception as e:
            return False, f"Error: {str(e)}"
    
    def copy_column(
        self,
        file_path: str,
        from_pos: str,
        to_pos: str,
        output_path: Optional[str] = None
    ) -> Tuple[bool, str]:
        """Copy column to another position."""
        try:
            file_path = Path(file_path)
            if not file_path.exists():
                return False, f"File not found: {file_path}"
            
            self._create_backup(file_path)
            df = pd.read_csv(file_path)
            
            from_idx = self._parse_position(from_pos, df.shape[1])
            to_idx = self._parse_position(to_pos, df.shape[1])
            
            col_name = df.columns[from_idx]
            col_data = df.iloc[:, from_idx].copy()
            
            df.insert(to_idx if to_idx >= 0 else len(df.columns), f"{col_name}_copy", col_data)
            
            output_path = Path(output_path) if output_path else file_path
            df.to_csv(output_path, index=False)
            
            return True, f"✅ Copied column '{col_name}' from {from_pos} to {to_pos}"
        except Exception as e:
            return False, f"Error: {str(e)}"
    
    def rename_column(
        self,
        file_path: str,
        position: str,
        new_name: str,
        output_path: Optional[str] = None
    ) -> Tuple[bool, str]:
        """Rename column at position."""
        try:
            file_path = Path(file_path)
            if not file_path.exists():
                return False, f"File not found: {file_path}"
            
            self._create_backup(file_path)
            df = pd.read_csv(file_path)
            
            col_idx = self._parse_position(position, df.shape[1])
            old_name = df.columns[col_idx]
            
            df.rename(columns={old_name: new_name}, inplace=True)
            
            output_path = Path(output_path) if output_path else file_path
            df.to_csv(output_path, index=False)
            
            return True, f"✅ Renamed column '{old_name}' → '{new_name}'"
        except Exception as e:
            return False, f"Error: {str(e)}"
    
    def slice_data(
        self,
        file_path: str,
        target: str,
        range_expr: str,
        output_path: Optional[str] = None
    ) -> Tuple[bool, str]:
        """Slice data to keep only specified range."""
        try:
            file_path = Path(file_path)
            if not file_path.exists():
                return False, f"File not found: {file_path}"
            
            self._create_backup(file_path)
            df = pd.read_csv(file_path)
            
            if target == 'col':
                start, end = self._parse_range(range_expr, df.shape[1])
                df_sliced = df.iloc[:, start:end]
                msg = f"✅ Sliced to columns {start+1}-{end}"
            elif target == 'row':
                start, end = self._parse_range(range_expr, df.shape[0])
                df_sliced = df.iloc[start:end, :]
                msg = f"✅ Sliced to rows {start+1}-{end}"
            else:
                return False, f"Invalid target: {target}"
            
            output_path = Path(output_path) if output_path else file_path
            df_sliced.to_csv(output_path, index=False)
            
            return True, msg
        except Exception as e:
            return False, f"Error: {str(e)}"
    
    def fill_empty(
        self,
        file_path: str,
        target: str,
        position: str,
        fill_value: str,
        output_path: Optional[str] = None
    ) -> Tuple[bool, str]:
        """Fill empty cells with specified value."""
        try:
            file_path = Path(file_path)
            if not file_path.exists():
                return False, f"File not found: {file_path}"
            
            self._create_backup(file_path)
            df = pd.read_csv(file_path)
            
            if target == 'col':
                col_idx = self._parse_position(position, df.shape[1])
                col_name = df.columns[col_idx]
                before = df[col_name].isnull().sum()
                df[col_name].fillna(fill_value, inplace=True)
                msg = f"✅ Filled {before} empty cells in column '{col_name}' with '{fill_value}'"
            elif target == 'row':
                row_idx = self._parse_position(position, df.shape[0])
                before = df.iloc[row_idx].isnull().sum()
                df.iloc[row_idx] = df.iloc[row_idx].fillna(fill_value)
                msg = f"✅ Filled {before} empty cells in row {row_idx+1} with '{fill_value}'"
            else:
                return False, f"Invalid target: {target}"
            
            output_path = Path(output_path) if output_path else file_path
            df.to_csv(output_path, index=False)
            
            return True, msg
        except Exception as e:
            return False, f"Error: {str(e)}"
    
    def merge_columns(
        self,
        file_path: str,
        positions: List[str],
        new_name: str,
        separator: str = " ",
        output_path: Optional[str] = None
    ) -> Tuple[bool, str]:
        """Merge multiple columns into one."""
        try:
            file_path = Path(file_path)
            if not file_path.exists():
                return False, f"File not found: {file_path}"
            
            self._create_backup(file_path)
            df = pd.read_csv(file_path)
            
            indices = [self._parse_position(pos, df.shape[1]) for pos in positions]
            col_names = [df.columns[idx] for idx in indices]
            
            # Merge columns
            df[new_name] = df[col_names].apply(lambda row: separator.join(row.astype(str)), axis=1)
            
            # Remove original columns
            df.drop(columns=col_names, inplace=True)
            
            output_path = Path(output_path) if output_path else file_path
            df.to_csv(output_path, index=False)
            
            return True, f"✅ Merged {len(col_names)} columns into '{new_name}'"
        except Exception as e:
            return False, f"Error: {str(e)}"
    
    def remove_range(
        self,
        file_path: str,
        target: str,
        range_expr: str,
        output_path: Optional[str] = None
    ) -> Tuple[bool, str]:
        """Remove a range of columns or rows."""
        try:
            file_path = Path(file_path)
            if not file_path.exists():
                return False, f"File not found: {file_path}"
            
            self._create_backup(file_path)
            df = pd.read_csv(file_path)
            
            if target == 'col':
                start, end = self._parse_range(range_expr, df.shape[1])
                cols_to_drop = df.columns[start:end]
                df_new = df.drop(columns=cols_to_drop)
                msg = f"✅ Removed columns {start+1}-{end} ({len(cols_to_drop)} columns)"
            elif target == 'row':
                start, end = self._parse_range(range_expr, df.shape[0])
                df_new = df.drop(df.index[start:end])
                msg = f"✅ Removed rows {start+1}-{end} ({end-start} rows)"
            else:
                return False, f"Invalid target: {target}"
            
            output_path = Path(output_path) if output_path else file_path
            df_new.to_csv(output_path, index=False)
            
            return True, msg
        except Exception as e:
            return False, f"Error: {str(e)}"
    
    # ============================================================================
    # SMART RANDOMIZATION - Knowledge Graph Integration
    # ============================================================================
    
    def randomize_column(
        self,
        file_path: str,
        position: str,
        class_path: str,
        distribution: str = 'uniform',
        context_col: Optional[str] = None,
        output_path: Optional[str] = None,
        **dist_kwargs
    ) -> Tuple[bool, str]:
        """
        Fill column with smart random data from knowledge graph.
        
        Args:
            file_path: Path to CSV file
            position: Column position (01N, N02, etc.)
            class_path: Knowledge graph path (e.g., "person.name.first_name")
            distribution: Distribution type ('uniform', 'normal', 'beta')
            context_col: Position of context column for dependent generation
            **dist_kwargs: Distribution parameters (mean, std, etc.)
        
        Returns:
            Tuple[bool, str]: (success, message)
        """
        try:
            from chalkml_knowledge_graph import get_knowledge_graph
            
            file_path = Path(file_path)
            if not file_path.exists():
                return False, f"File not found: {file_path}"
            
            self._create_backup(file_path)
            df = pd.read_csv(file_path)
            
            col_idx = self._parse_position(position, df.shape[1])
            col_name = df.columns[col_idx]
            
            kg = get_knowledge_graph()
            
            # Get context if provided
            context = {}
            if context_col:
                context_idx = self._parse_position(context_col, df.shape[1])
                context_col_name = df.columns[context_idx]
                # Use first non-null value as context (simple approach)
                context_value = df[context_col_name].dropna().iloc[0] if not df[context_col_name].dropna().empty else None
                if context_value:
                    # Extract key from class_path
                    context_key = class_path.split('.')[-1]
                    if context_key == 'city':
                        context['country'] = context_value
                    elif context_key == 'major':
                        context['university'] = context_value
            
            # Generate data
            count = len(df)
            if distribution == 'uniform':
                values = [kg.generate(class_path, context) for _ in range(count)]
            else:
                values = kg.generate_distribution(class_path, count, distribution, **dist_kwargs)
            
            df[col_name] = values
            
            output_path = Path(output_path) if output_path else file_path
            df.to_csv(output_path, index=False)
            
            msg = f"✅ Randomized column '{col_name}' with class '{class_path}' ({distribution} distribution)"
            return True, msg
            
        except Exception as e:
            return False, f"Error: {str(e)}"
    
    # ============================================================================
    # SMART FILL STRATEGIES
    # ============================================================================
    
    def fill_smart(
        self,
        file_path: str,
        position: str,
        strategy: str,
        output_path: Optional[str] = None,
        **kwargs
    ) -> Tuple[bool, str]:
        """
        Smart fill missing values using various strategies.
        
        Strategies:
        - 'forward': Forward fill (carry last value forward)
        - 'backward': Backward fill (carry next value backward)
        - 'interpolate': Linear interpolation (numerical only)
        - 'mean': Fill with column mean
        - 'median': Fill with column median
        - 'mode': Fill with most frequent value
        - 'knn': KNN imputation (requires other columns)
        
        Args:
            file_path: Path to CSV file
            position: Column position
            strategy: Fill strategy
            **kwargs: Strategy-specific parameters
        
        Returns:
            Tuple[bool, str]: (success, message)
        """
        try:
            file_path = Path(file_path)
            if not file_path.exists():
                return False, f"File not found: {file_path}"
            
            self._create_backup(file_path)
            df = pd.read_csv(file_path)
            
            col_idx = self._parse_position(position, df.shape[1])
            col_name = df.columns[col_idx]
            
            before_nulls = df[col_name].isnull().sum()
            
            if strategy == 'forward':
                df[col_name].fillna(method='ffill', inplace=True)
            elif strategy == 'backward':
                df[col_name].fillna(method='bfill', inplace=True)
            elif strategy == 'interpolate':
                df[col_name] = df[col_name].interpolate(method='linear')
            elif strategy == 'mean':
                df[col_name].fillna(df[col_name].mean(), inplace=True)
            elif strategy == 'median':
                df[col_name].fillna(df[col_name].median(), inplace=True)
            elif strategy == 'mode':
                mode_value = df[col_name].mode()[0] if not df[col_name].mode().empty else None
                if mode_value:
                    df[col_name].fillna(mode_value, inplace=True)
            elif strategy == 'knn':
                # KNN imputation using all numerical columns
                numerical_cols = df.select_dtypes(include=[np.number]).columns
                imputer = KNNImputer(n_neighbors=kwargs.get('k', 5))
                df[numerical_cols] = imputer.fit_transform(df[numerical_cols])
            else:
                return False, f"Unknown strategy: {strategy}"
            
            after_nulls = df[col_name].isnull().sum()
            filled = before_nulls - after_nulls
            
            output_path = Path(output_path) if output_path else file_path
            df.to_csv(output_path, index=False)
            
            msg = f"✅ Filled {filled} nulls in '{col_name}' using '{strategy}' strategy"
            return True, msg
            
        except Exception as e:
            return False, f"Error: {str(e)}"
    
    # ============================================================================
    # CONDITIONAL OPERATIONS
    # ============================================================================
    
    def fill_conditional(
        self,
        file_path: str,
        position: str,
        condition: str,
        fill_value: Any,
        else_value: Optional[Any] = None,
        output_path: Optional[str] = None
    ) -> Tuple[bool, str]:
        """
        Fill column conditionally based on another column's value.
        
        Args:
            file_path: Path to CSV file
            position: Target column position
            condition: Condition string (e.g., "col:03N==USA" or "col:05N>18")
            fill_value: Value to fill when condition is True
            else_value: Value to fill when condition is False (optional)
        
        Returns:
            Tuple[bool, str]: (success, message)
        """
        try:
            file_path = Path(file_path)
            if not file_path.exists():
                return False, f"File not found: {file_path}"
            
            self._create_backup(file_path)
            df = pd.read_csv(file_path)
            
            # Parse target column
            col_idx = self._parse_position(position, df.shape[1])
            col_name = df.columns[col_idx]
            
            # Parse condition (e.g., "col:03N==USA")
            if condition.startswith('col:'):
                parts = condition[4:].split('==')
                if len(parts) == 2:
                    cond_pos, cond_value = parts
                    cond_idx = self._parse_position(cond_pos.strip(), df.shape[1])
                    cond_col = df.columns[cond_idx]
                    
                    # Apply conditional fill
                    mask = df[cond_col] == cond_value.strip()
                    df.loc[mask, col_name] = fill_value
                    
                    if else_value is not None:
                        df.loc[~mask, col_name] = else_value
                    
                    filled = mask.sum()
                    msg = f"✅ Filled {filled} cells in '{col_name}' where {cond_col}=={cond_value}"
                    
                    output_path = Path(output_path) if output_path else file_path
                    df.to_csv(output_path, index=False)
                    
                    return True, msg
            
            return False, "Invalid condition format. Use 'col:03N==value'"
            
        except Exception as e:
            return False, f"Error: {str(e)}"
    
    # ============================================================================
    # FEATURE ENGINEERING
    # ============================================================================
    
    def derive_column(
        self,
        file_path: str,
        new_name: str,
        formula: str,
        output_path: Optional[str] = None
    ) -> Tuple[bool, str]:
        """
        Create derived column using formula.
        
        Formula syntax:
        - Use 'col:position' to reference columns
        - Example: "col:05N * 1.15 + col:06N"
        - Example: "col:weight / (col:height ** 2)"
        
        Args:
            file_path: Path to CSV file
            new_name: Name for new derived column
            formula: Derivation formula
        
        Returns:
            Tuple[bool, str]: (success, message)
        """
        try:
            file_path = Path(file_path)
            if not file_path.exists():
                return False, f"File not found: {file_path}"
            
            self._create_backup(file_path)
            df = pd.read_csv(file_path)
            
            # Replace col:position references with actual column names
            import re
            formula_parsed = formula
            
            for match in re.finditer(r'col:(\w+)', formula):
                pos = match.group(1)
                try:
                    idx = self._parse_position(pos, df.shape[1])
                    col = df.columns[idx]
                    formula_parsed = formula_parsed.replace(f'col:{pos}', f'df["{col}"]')
                except:
                    # Try as direct column name
                    formula_parsed = formula_parsed.replace(f'col:{pos}', f'df["{pos}"]')
            
            # Evaluate formula
            df[new_name] = eval(formula_parsed)
            
            output_path = Path(output_path) if output_path else file_path
            df.to_csv(output_path, index=False)
            
            msg = f"✅ Derived column '{new_name}' using formula: {formula}"
            return True, msg
            
        except Exception as e:
            return False, f"Error: {str(e)}"
    
    def one_hot_encode(
        self,
        file_path: str,
        position: str,
        prefix: Optional[str] = None,
        output_path: Optional[str] = None
    ) -> Tuple[bool, str]:
        """One-hot encode categorical column."""
        try:
            file_path = Path(file_path)
            if not file_path.exists():
                return False, f"File not found: {file_path}"
            
            self._create_backup(file_path)
            df = pd.read_csv(file_path)
            
            col_idx = self._parse_position(position, df.shape[1])
            col_name = df.columns[col_idx]
            
            # One-hot encode
            prefix = prefix or col_name
            encoded = pd.get_dummies(df[col_name], prefix=prefix)
            
            # Drop original and add encoded columns
            df = df.drop(columns=[col_name])
            df = pd.concat([df, encoded], axis=1)
            
            output_path = Path(output_path) if output_path else file_path
            df.to_csv(output_path, index=False)
            
            msg = f"✅ One-hot encoded '{col_name}' into {len(encoded.columns)} columns"
            return True, msg
            
        except Exception as e:
            return False, f"Error: {str(e)}"
    
    def bin_column(
        self,
        file_path: str,
        position: str,
        n_bins: int = 5,
        strategy: str = 'quantile',
        labels: Optional[List[str]] = None,
        output_path: Optional[str] = None
    ) -> Tuple[bool, str]:
        """
        Bin continuous column into discrete categories.
        
        Strategies:
        - 'quantile': Equal-sized bins
        - 'uniform': Equal-width bins
        - 'kmeans': K-means clustering bins
        """
        try:
            file_path = Path(file_path)
            if not file_path.exists():
                return False, f"File not found: {file_path}"
            
            self._create_backup(file_path)
            df = pd.read_csv(file_path)
            
            col_idx = self._parse_position(position, df.shape[1])
            col_name = df.columns[col_idx]
            
            if strategy == 'quantile':
                df[f'{col_name}_binned'] = pd.qcut(df[col_name], q=n_bins, labels=labels, duplicates='drop')
            elif strategy == 'uniform':
                df[f'{col_name}_binned'] = pd.cut(df[col_name], bins=n_bins, labels=labels)
            else:
                return False, f"Unknown binning strategy: {strategy}"
            
            output_path = Path(output_path) if output_path else file_path
            df.to_csv(output_path, index=False)
            
            msg = f"✅ Binned '{col_name}' into {n_bins} bins using '{strategy}' strategy"
            return True, msg
            
        except Exception as e:
            return False, f"Error: {str(e)}"
    
    def scale_column(
        self,
        file_path: str,
        position: str,
        method: str = 'standard',
        feature_range: Tuple[float, float] = (0, 1),
        output_path: Optional[str] = None
    ) -> Tuple[bool, str]:
        """
        Rescale one numerical column in place.

        Methods:
        - 'standard': fit a StandardScaler on the column
        - 'minmax': fit a MinMaxScaler using feature_range

        Args:
            file_path: Path to CSV file
            position: Position of the column to scale
            method: 'standard' or 'minmax'
            feature_range: Target range, only used by 'minmax'
            output_path: Where to write the result; the input file is
                overwritten when omitted

        Returns:
            Tuple[bool, str]: (success, message)
        """
        try:
            csv_path = Path(file_path)
            if not csv_path.exists():
                return False, f"File not found: {csv_path}"

            self._create_backup(csv_path)
            frame = pd.read_csv(csv_path)

            column = frame.columns[self._parse_position(position, frame.shape[1])]

            # Pick the scaler first; both variants are then fitted and
            # applied through the same call.
            if method == 'standard':
                scaler = StandardScaler()
            elif method == 'minmax':
                scaler = MinMaxScaler(feature_range=feature_range)
            else:
                return False, f"Unknown scaling method: {method}"

            frame[column] = scaler.fit_transform(frame[[column]])

            destination = Path(output_path) if output_path else csv_path
            frame.to_csv(destination, index=False)

            return True, f"✅ Scaled '{column}' using '{method}' method"

        except Exception as exc:
            return False, f"Error: {str(exc)}"
    
    # ============================================================================
    # VALIDATION & CONSTRAINTS
    # ============================================================================
    
    def validate_data(
        self,
        file_path: str,
        constraints: Optional[Dict[str, Dict]] = None
    ) -> Tuple[bool, str]:
        """
        Validate data against constraints.
        
        Constraint format:
        {
            'column_position': {
                'type': 'age',  # age, email, phone, etc.
                'min': 18,
                'max': 65,
                'required': True,
                'unique': False
            }
        }
        """
        try:
            file_path = Path(file_path)
            if not file_path.exists():
                return False, f"File not found: {file_path}"
            
            df = pd.read_csv(file_path)
            
            violations = []
            
            if constraints:
                for pos, rules in constraints.items():
                    col_idx = self._parse_position(pos, df.shape[1])
                    col_name = df.columns[col_idx]
                    
                    # Check required
                    if rules.get('required') and df[col_name].isnull().any():
                        null_count = df[col_name].isnull().sum()
                        violations.append(f"❌ '{col_name}': {null_count} null values (required field)")
                    
                    # Check min/max
                    if 'min' in rules:
                        below_min = (df[col_name] < rules['min']).sum()
                        if below_min > 0:
                            violations.append(f"❌ '{col_name}': {below_min} values below minimum {rules['min']}")
                    
                    if 'max' in rules:
                        above_max = (df[col_name] > rules['max']).sum()
                        if above_max > 0:
                            violations.append(f"❌ '{col_name}': {above_max} values above maximum {rules['max']}")
                    
                    # Check unique
                    if rules.get('unique') and df[col_name].duplicated().any():
                        dupe_count = df[col_name].duplicated().sum()
                        violations.append(f"❌ '{col_name}': {dupe_count} duplicate values (must be unique)")
            
            if violations:
                msg = "⚠️  VALIDATION FAILED:\n" + "\n".join(violations)
                return False, msg
            else:
                msg = f"✅ VALIDATION PASSED\n   All constraints satisfied for {len(df)} rows"
                return True, msg
            
        except Exception as e:
            return False, f"Error: {str(e)}"
    
    # ============================================================================
    # CONTROL FLOW - IF/ELSE, WHILE
    # ============================================================================
    
    def apply_if_else(
        self,
        file_path: str,
        target_position: str,
        condition: str,
        if_value: Any,
        else_value: Any,
        output_path: Optional[str] = None
    ) -> Tuple[bool, str]:
        """
        Apply if/else logic to column based on condition.
        
        Syntax:
            condition: "col:pos operator value"
            operators: ==, !=, >, <, >=, <=, contains, startswith, endswith
        
        Examples:
            col:03N==USA
            col:05N>100
            col:04N contains "ML"
        
        Args:
            file_path: Path to CSV file
            target_position: Column to modify
            condition: Condition expression
            if_value: Value when condition is True
            else_value: Value when condition is False
        
        Returns:
            Tuple[bool, str]: (success, message)
        """
        try:
            file_path = Path(file_path)
            if not file_path.exists():
                return False, f"File not found: {file_path}"
            
            self._create_backup(file_path)
            df = pd.read_csv(file_path)
            
            # Parse target column
            target_idx = self._parse_position(target_position, df.shape[1])
            target_col = df.columns[target_idx]
            
            # Parse condition
            if condition.startswith('col:'):
                # Extract condition parts
                if '==' in condition:
                    parts = condition[4:].split('==')
                    op = '=='
                elif '!=' in condition:
                    parts = condition[4:].split('!=')
                    op = '!='
                elif '>=' in condition:
                    parts = condition[4:].split('>=')
                    op = '>='
                elif '<=' in condition:
                    parts = condition[4:].split('<=')
                    op = '<='
                elif '>' in condition:
                    parts = condition[4:].split('>')
                    op = '>'
                elif '<' in condition:
                    parts = condition[4:].split('<')
                    op = '<'
                elif ' contains ' in condition:
                    parts = condition[4:].split(' contains ')
                    op = 'contains'
                elif ' startswith ' in condition:
                    parts = condition[4:].split(' startswith ')
                    op = 'startswith'
                elif ' endswith ' in condition:
                    parts = condition[4:].split(' endswith ')
                    op = 'endswith'
                else:
                    return False, f"Invalid condition operator in: {condition}"
                
                cond_pos = parts[0].strip()
                cond_value = parts[1].strip().strip('"\'')
                
                # Get condition column
                cond_idx = self._parse_position(cond_pos, df.shape[1])
                cond_col = df.columns[cond_idx]
                
                # Apply condition
                if op == '==':
                    mask = df[cond_col] == cond_value
                elif op == '!=':
                    mask = df[cond_col] != cond_value
                elif op == '>':
                    mask = df[cond_col] > float(cond_value)
                elif op == '<':
                    mask = df[cond_col] < float(cond_value)
                elif op == '>=':
                    mask = df[cond_col] >= float(cond_value)
                elif op == '<=':
                    mask = df[cond_col] <= float(cond_value)
                elif op == 'contains':
                    mask = df[cond_col].astype(str).str.contains(cond_value)
                elif op == 'startswith':
                    mask = df[cond_col].astype(str).str.startswith(cond_value)
                elif op == 'endswith':
                    mask = df[cond_col].astype(str).str.endswith(cond_value)
                
                # Apply if/else
                df.loc[mask, target_col] = if_value
                df.loc[~mask, target_col] = else_value
                
                true_count = mask.sum()
                false_count = (~mask).sum()
                
                output_path = Path(output_path) if output_path else file_path
                df.to_csv(output_path, index=False)
                
                msg = (
                    f"✅ Applied IF/ELSE on '{target_col}'\n"
                    f"   Condition: {condition}\n"
                    f"   IF({true_count} rows) = {if_value}\n"
                    f"   ELSE({false_count} rows) = {else_value}"
                )
                return True, msg
            
            return False, "Condition must start with 'col:'"
            
        except Exception as e:
            return False, f"Error: {str(e)}"
    
    def apply_while(
        self,
        file_path: str,
        target_position: str,
        condition: str,
        operation: str,
        max_iterations: int = 100,
        output_path: Optional[str] = None
    ) -> Tuple[bool, str]:
        """
        Apply operation repeatedly while condition is true (row-wise).
        
        Examples:
            WHILE col:balance > 0 DO col:balance = col:balance * 0.9
            WHILE col:temp < 100 DO col:temp = col:temp + 5
        
        Args:
            file_path: Path to CSV file
            target_position: Column to modify
            condition: Condition to check (col:pos operator value)
            operation: Operation to perform (Python expression)
            max_iterations: Safety limit
        
        Returns:
            Tuple[bool, str]: (success, message)
        """
        try:
            file_path = Path(file_path)
            if not file_path.exists():
                return False, f"File not found: {file_path}"
            
            self._create_backup(file_path)
            df = pd.read_csv(file_path)
            
            target_idx = self._parse_position(target_position, df.shape[1])
            target_col = df.columns[target_idx]
            
            iterations = 0
            rows_modified = 0
            
            while iterations < max_iterations:
                # Evaluate condition for each row
                if '>' in condition:
                    parts = condition.split('>')
                    col_ref = parts[0].strip()
                    threshold = float(parts[1].strip())
                    
                    if col_ref.startswith('col:'):
                        col_pos = col_ref[4:]
                        col_idx = self._parse_position(col_pos, df.shape[1])
                        col_name = df.columns[col_idx]
                        mask = df[col_name] > threshold
                    else:
                        break
                elif '<' in condition:
                    parts = condition.split('<')
                    col_ref = parts[0].strip()
                    threshold = float(parts[1].strip())
                    
                    if col_ref.startswith('col:'):
                        col_pos = col_ref[4:]
                        col_idx = self._parse_position(col_pos, df.shape[1])
                        col_name = df.columns[col_idx]
                        mask = df[col_name] < threshold
                    else:
                        break
                else:
                    return False, f"Unsupported condition: {condition}"
                
                # If no rows match, exit
                if not mask.any():
                    break
                
                # Apply operation to matching rows
                # Parse operation (e.g., "col:balance * 0.9")
                import re
                operation_parsed = operation
                for match in re.finditer(r'col:(\w+)', operation):
                    pos = match.group(1)
                    idx = self._parse_position(pos, df.shape[1])
                    col = df.columns[idx]
                    operation_parsed = operation_parsed.replace(f'col:{pos}', f'df.loc[mask, "{col}"]')
                
                # Execute operation
                df.loc[mask, target_col] = eval(operation_parsed)
                
                rows_modified += mask.sum()
                iterations += 1
            
            output_path = Path(output_path) if output_path else file_path
            df.to_csv(output_path, index=False)
            
            msg = (
                f"✅ WHILE loop completed\n"
                f"   Iterations: {iterations}\n"
                f"   Rows modified: {rows_modified}\n"
                f"   Condition: {condition}\n"
                f"   Operation: {operation}"
            )
            return True, msg
            
        except Exception as e:
            return False, f"Error: {str(e)}"
    
    # ============================================================================
    # ADVANCED DESIGN PATTERNS
    # ============================================================================
    
    def map_pattern(
        self,
        file_path: str,
        target_position: str,
        function: str,
        output_path: Optional[str] = None
    ) -> Tuple[bool, str]:
        """
        Map pattern: Apply function to each element in column.
        
        f: X → Y
        map(f, [x₁, x₂, ..., xₙ]) = [f(x₁), f(x₂), ..., f(xₙ)]
        
        Examples:
            "x * 2" - double each value
            "x.upper()" - uppercase strings
            "x ** 2" - square each value
        
        Args:
            file_path: Path to CSV file
            target_position: Column to map over
            function: Lambda expression (use 'x' as variable)
        
        Returns:
            Tuple[bool, str]: (success, message)
        """
        try:
            file_path = Path(file_path)
            if not file_path.exists():
                return False, f"File not found: {file_path}"
            
            self._create_backup(file_path)
            df = pd.read_csv(file_path)
            
            col_idx = self._parse_position(target_position, df.shape[1])
            col_name = df.columns[col_idx]
            
            # Apply map function
            df[f'{col_name}_mapped'] = df[col_name].apply(lambda x: eval(function))
            
            output_path = Path(output_path) if output_path else file_path
            df.to_csv(output_path, index=False)
            
            msg = f"✅ MAP pattern applied: {function}\n   New column: '{col_name}_mapped'"
            return True, msg
            
        except Exception as e:
            return False, f"Error: {str(e)}"
    
    def reduce_pattern(
        self,
        file_path: str,
        target_positions: List[str],
        operation: str,
        result_name: str,
        output_path: Optional[str] = None
    ) -> Tuple[bool, str]:
        """
        Reduce pattern: Combine multiple columns into single value per row.
        
        reduce(⊕, [x₁, x₂, ..., xₙ]) = x₁ ⊕ x₂ ⊕ ... ⊕ xₙ
        
        Operations:
            'sum' - Σxᵢ
            'product' - Πxᵢ
            'mean' - (1/n)Σxᵢ
            'max' - max(x₁, ..., xₙ)
            'min' - min(x₁, ..., xₙ)
        
        Args:
            file_path: Path to CSV file
            target_positions: List of column positions to reduce
            operation: Reduction operation
            result_name: Name for result column
        
        Returns:
            Tuple[bool, str]: (success, message)
        """
        try:
            file_path = Path(file_path)
            if not file_path.exists():
                return False, f"File not found: {file_path}"
            
            self._create_backup(file_path)
            df = pd.read_csv(file_path)
            
            # Get columns to reduce
            col_indices = [self._parse_position(pos, df.shape[1]) for pos in target_positions]
            col_names = [df.columns[idx] for idx in col_indices]
            
            # Apply reduction
            if operation == 'sum':
                df[result_name] = df[col_names].sum(axis=1)
            elif operation == 'product':
                df[result_name] = df[col_names].product(axis=1)
            elif operation == 'mean':
                df[result_name] = df[col_names].mean(axis=1)
            elif operation == 'max':
                df[result_name] = df[col_names].max(axis=1)
            elif operation == 'min':
                df[result_name] = df[col_names].min(axis=1)
            else:
                return False, f"Unknown operation: {operation}"
            
            output_path = Path(output_path) if output_path else file_path
            df.to_csv(output_path, index=False)
            
            msg = (
                f"✅ REDUCE pattern applied\n"
                f"   Operation: {operation}\n"
                f"   Columns: {', '.join(col_names)}\n"
                f"   Result: '{result_name}'"
            )
            return True, msg
            
        except Exception as e:
            return False, f"Error: {str(e)}"
    
    def stencil_pattern(
        self,
        file_path: str,
        target_position: str,
        window_size: int,
        operation: str,
        output_path: Optional[str] = None
    ) -> Tuple[bool, str]:
        """
        Stencil pattern: Apply operation to sliding window of neighbors.
        
        S[i] = f(x[i-k], ..., x[i], ..., x[i+k])
        
        Operations:
            'rolling_mean' - Moving average
            'rolling_sum' - Moving sum
            'rolling_max' - Moving maximum
            'rolling_min' - Moving minimum
            'rolling_std' - Moving standard deviation
        
        Args:
            file_path: Path to CSV file
            target_position: Column to apply stencil
            window_size: Size of sliding window
            operation: Stencil operation
        
        Returns:
            Tuple[bool, str]: (success, message)
        """
        try:
            file_path = Path(file_path)
            if not file_path.exists():
                return False, f"File not found: {file_path}"
            
            self._create_backup(file_path)
            df = pd.read_csv(file_path)
            
            col_idx = self._parse_position(target_position, df.shape[1])
            col_name = df.columns[col_idx]
            
            # Apply stencil operation
            if operation == 'rolling_mean':
                df[f'{col_name}_stencil'] = df[col_name].rolling(window=window_size, center=True).mean()
            elif operation == 'rolling_sum':
                df[f'{col_name}_stencil'] = df[col_name].rolling(window=window_size, center=True).sum()
            elif operation == 'rolling_max':
                df[f'{col_name}_stencil'] = df[col_name].rolling(window=window_size, center=True).max()
            elif operation == 'rolling_min':
                df[f'{col_name}_stencil'] = df[col_name].rolling(window=window_size, center=True).min()
            elif operation == 'rolling_std':
                df[f'{col_name}_stencil'] = df[col_name].rolling(window=window_size, center=True).std()
            else:
                return False, f"Unknown stencil operation: {operation}"
            
            output_path = Path(output_path) if output_path else file_path
            df.to_csv(output_path, index=False)
            
            msg = (
                f"✅ STENCIL pattern applied\n"
                f"   Operation: {operation}\n"
                f"   Window size: {window_size}\n"
                f"   New column: '{col_name}_stencil'"
            )
            return True, msg
            
        except Exception as e:
            return False, f"Error: {str(e)}"
    
    def farm_pattern(
        self,
        file_path: str,
        operations: List[Dict[str, Any]],
        output_path: Optional[str] = None
    ) -> Tuple[bool, str]:
        """
        Farm pattern: Apply multiple independent operations in parallel.
        
        farm([f₁, f₂, ..., fₙ], x) = [f₁(x), f₂(x), ..., fₙ(x)]
        
        Each operation is independent and can run concurrently.
        
        Args:
            file_path: Path to CSV file
            operations: List of operations, each with 'type', 'position', 'params'
            
        Example:
            operations = [
                {'type': 'scale', 'position': '01N', 'method': 'standard'},
                {'type': 'fillsmart', 'position': '02N', 'strategy': 'mean'},
                {'type': 'onehot', 'position': '03N'}
            ]
        
        Returns:
            Tuple[bool, str]: (success, message)
        """
        try:
            file_path = Path(file_path)
            if not file_path.exists():
                return False, f"File not found: {file_path}"
            
            self._create_backup(file_path)
            
            results = []
            for op in operations:
                op_type = op.get('type')
                position = op.get('position')
                
                if op_type == 'scale':
                    success, msg = self.scale_column(str(file_path), position, op.get('method', 'standard'))
                elif op_type == 'fillsmart':
                    success, msg = self.fill_smart(str(file_path), position, op.get('strategy', 'mean'))
                elif op_type == 'onehot':
                    success, msg = self.one_hot_encode(str(file_path), position)
                elif op_type == 'bin':
                    success, msg = self.bin_column(str(file_path), position, op.get('n_bins', 5))
                else:
                    results.append(f"Unknown operation: {op_type}")
                    continue
                
                results.append(f"{'✅' if success else '❌'} {op_type}: {msg}")
            
            msg = (
                f"✅ FARM pattern completed\n"
                f"   Operations: {len(operations)}\n"
                + "\n".join(f"   {r}" for r in results)
            )
            return True, msg
            
        except Exception as e:
            return False, f"Error: {str(e)}"
    
    def scan_pattern(
        self,
        file_path: str,
        target_position: str,
        operation: str,
        output_path: Optional[str] = None
    ) -> Tuple[bool, str]:
        """
        Scan pattern: Cumulative/prefix operation.
        
        scan(⊕, [x₁, x₂, ..., xₙ]) = [x₁, x₁⊕x₂, x₁⊕x₂⊕x₃, ..., x₁⊕...⊕xₙ]
        
        Operations:
            'cumsum' - Cumulative sum
            'cumprod' - Cumulative product
            'cummax' - Cumulative maximum
            'cummin' - Cumulative minimum
        
        Args:
            file_path: Path to CSV file
            target_position: Column to scan
            operation: Scan operation
        
        Returns:
            Tuple[bool, str]: (success, message)
        """
        try:
            file_path = Path(file_path)
            if not file_path.exists():
                return False, f"File not found: {file_path}"
            
            self._create_backup(file_path)
            df = pd.read_csv(file_path)
            
            col_idx = self._parse_position(target_position, df.shape[1])
            col_name = df.columns[col_idx]
            
            # Apply scan operation
            if operation == 'cumsum':
                df[f'{col_name}_scan'] = df[col_name].cumsum()
            elif operation == 'cumprod':
                df[f'{col_name}_scan'] = df[col_name].cumprod()
            elif operation == 'cummax':
                df[f'{col_name}_scan'] = df[col_name].cummax()
            elif operation == 'cummin':
                df[f'{col_name}_scan'] = df[col_name].cummin()
            else:
                return False, f"Unknown scan operation: {operation}"
            
            output_path = Path(output_path) if output_path else file_path
            df.to_csv(output_path, index=False)
            
            msg = (
                f"✅ SCAN pattern applied\n"
                f"   Operation: {operation}\n"
                f"   New column: '{col_name}_scan'"
            )
            return True, msg
            
        except Exception as e:
            return False, f"Error: {str(e)}"


# Module-level slot for the shared ChalkMLEngine instance; it stays None
# until get_chalkml_engine() creates the engine on first use.
_chalkml_engine: Optional[ChalkMLEngine] = None

def get_chalkml_engine(workspace_path: Optional[str] = None) -> ChalkMLEngine:
    """
    Get or create the shared ChalkML engine instance. Portable across machines.

    Args:
        workspace_path: Workspace directory for the engine. When omitted,
            the existing engine is returned as is (or a new one is created
            with the engine's own default workspace).

    Returns:
        ChalkMLEngine: The shared engine. If an engine already exists but
        is bound to a different workspace than the one requested, it is
        replaced by a new engine for the requested workspace, so callers
        never get an engine writing backups to another directory.
    """
    global _chalkml_engine
    if _chalkml_engine is None:
        _chalkml_engine = ChalkMLEngine(workspace_path)
    elif workspace_path is not None:
        # Compare resolved paths so "./data" and its absolute form are
        # treated as the same workspace.
        requested = Path(workspace_path).resolve()
        if requested != _chalkml_engine.workspace_path.resolve():
            _chalkml_engine = ChalkMLEngine(workspace_path)
    return _chalkml_engine
