import os
import sys
import json
import re
import mysql.connector
from datetime import datetime, timedelta
import cv2
import numpy as np
from PIL import Image
import pytesseract
import argparse
import logging
from typing import Dict, List, Optional, Tuple

# Configure logging.
# The log file path is hard-coded; if that location is missing or not
# writable (e.g. on a development machine) fall back to console-only logging
# instead of crashing at import time.
LOG_FILE_PATH = '/var/www/html/logs/ocr_processor.log'

_log_handlers = [logging.StreamHandler()]
_file_handler_error = None
try:
    # Keep the file handler first, matching the original handler order.
    _log_handlers.insert(0, logging.FileHandler(LOG_FILE_PATH))
except OSError as exc:
    _file_handler_error = exc

logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=_log_handlers,
)
logger = logging.getLogger(__name__)

if _file_handler_error is not None:
    logger.warning(
        "File logging disabled, cannot open %s: %s",
        LOG_FILE_PATH,
        _file_handler_error,
    )

class TesseractReceiptOCRProcessor:
    def __init__(self, db_config: Dict[str, str]):
        """Store the database settings and check that Tesseract is usable.

        Args:
            db_config: Keyword arguments that ``get_db_connection`` later
                unpacks into ``mysql.connector.connect``.

        Raises:
            Exception: Re-raised from ``init_tesseract`` when the Tesseract
                version probe fails.
        """
        self.db_config = db_config
        self.init_tesseract()
        
    def init_tesseract(self):
        """Probe the Tesseract installation through pytesseract.

        Logs the detected version on success.

        Raises:
            Exception: Whatever ``pytesseract.get_tesseract_version`` raised;
                the error is logged and then re-raised unchanged.
        """
        try:
            tesseract_version = pytesseract.get_tesseract_version()
        except Exception as e:
            logger.error(f"Failed to initialize Tesseract: {e}")
            raise
        else:
            logger.info(f"Tesseract OCR initialized successfully - Version: {tesseract_version}")
    
    def get_db_connection(self):
        """Open and return a new MySQL connection built from ``self.db_config``.

        The caller owns the returned connection and is responsible for
        closing it.

        Raises:
            mysql.connector.Error: If the connection attempt fails; the error
                is logged before being re-raised.
        """
        try:
            return mysql.connector.connect(**self.db_config)
        except mysql.connector.Error as e:
            logger.error(f"Database connection failed: {e}")
            raise
    
    def enhanced_preprocess_image(self, image_path: str) -> np.ndarray:
        """Run the multi-stage OpenCV clean-up pipeline on a receipt image.

        Stages, in order: grayscale conversion, upscaling to a height of 1200
        pixels when the image is shorter than that, Gaussian blur, CLAHE,
        bilateral filter, adaptive Gaussian threshold, morphological close
        and a single dilation pass.

        Args:
            image_path: Path of the image file to load with ``cv2.imread``.

        Returns:
            The processed single-channel image. If any stage raises, the
            error is logged and the result of ``basic_preprocess_image`` is
            returned instead.
        """
        try:
            source = cv2.imread(image_path)
            if source is None:
                raise ValueError(f"Could not read image from {image_path}")

            working = cv2.cvtColor(source, cv2.COLOR_BGR2GRAY)

            rows, cols = working.shape
            logger.info(f"Original image size: {cols}x{rows}")

            # Upscale short images only; images already >= 1200px tall are
            # left at their original size.
            if rows < 1200:
                scaled_cols = int(cols * (1200 / rows))
                working = cv2.resize(working, (scaled_cols, 1200), interpolation=cv2.INTER_CUBIC)
                logger.info(f"Resized image to: {scaled_cols}x1200")

            # Noise reduction and local contrast enhancement.
            working = cv2.GaussianBlur(working, (3, 3), 0)
            equalizer = cv2.createCLAHE(clipLimit=3.0, tileGridSize=(8, 8))
            working = equalizer.apply(working)

            # Edge-preserving smoothing before binarisation.
            working = cv2.bilateralFilter(working, 9, 75, 75)

            binary = cv2.adaptiveThreshold(
                working,
                255,
                cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
                cv2.THRESH_BINARY,
                11,
                2,
            )

            # NOTE(review): a 1x1 structuring element presumably makes the
            # close and the dilation below pass-through operations - confirm
            # whether a larger kernel was intended.
            unit_kernel = np.ones((1, 1), np.uint8)
            binary = cv2.morphologyEx(binary, cv2.MORPH_CLOSE, unit_kernel)
            result = cv2.dilate(binary, unit_kernel, iterations=1)

            logger.info("Enhanced image preprocessing completed successfully")
            return result

        except Exception as e:
            logger.error(f"Enhanced image preprocessing failed: {e}")
            # Degrade to the simpler Otsu-threshold path rather than failing.
            return self.basic_preprocess_image(image_path)
    
    def basic_preprocess_image(self, image_path: str) -> np.ndarray:
        """Fallback preprocessing: grayscale load followed by Otsu threshold.

        Args:
            image_path: Path of the image file to load with ``cv2.imread``.

        Returns:
            The thresholded image. If loading or thresholding raises, the
            error is logged and the image is re-read in grayscale and
            returned as-is; that last-resort value is whatever ``cv2.imread``
            yields, which is ``None`` for an unreadable file.
        """
        try:
            grayscale = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)
            if grayscale is None:
                raise ValueError(f"Could not read image from {image_path}")

            # With THRESH_OTSU the threshold argument (0) is ignored and the
            # level is chosen automatically; only the image is needed here.
            otsu_mode = cv2.THRESH_BINARY + cv2.THRESH_OTSU
            binary = cv2.threshold(grayscale, 0, 255, otsu_mode)[1]
            return binary

        except Exception as e:
            logger.error(f"Basic preprocessing failed: {e}")
            return cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)

    def preprocess_image(self, image_path: str) -> np.ndarray:
        """Preprocess an image for OCR.

        Single entry point used by ``extract_text_from_image``; it currently
        delegates straight to ``enhanced_preprocess_image``.

        Args:
            image_path: Path of the image file to preprocess.

        Returns:
            Whatever ``enhanced_preprocess_image`` returns for the path.
        """
        return self.enhanced_preprocess_image(image_path)
    
    def extract_text_from_image(self, image_path: str) -> Tuple[str, float]:
        """Run Tesseract on a preprocessed image and return text and confidence.

        Args:
            image_path: Path of the receipt image.

        Returns:
            A ``(text, confidence)`` tuple. ``text`` is the OCR output after
            ``clean_extracted_text``; ``confidence`` is the mean of all
            positive per-word confidences reported by Tesseract, or ``0.0``
            when there are none. On any failure the error is logged and
            ``("", 0.0)`` is returned (deliberate best-effort behaviour).
        """
        try:
            processed_image = self.preprocess_image(image_path)

            # pytesseract expects a PIL image (or a path), not a raw array.
            pil_image = Image.fromarray(processed_image)

            # OEM 3 = default engine, PSM 6 = assume a single uniform block
            # of text; the whitelist restricts the recognised character set.
            custom_config = r'--oem 3 --psm 6 -c tessedit_char_whitelist=0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz.,:/\-+*@#$%&()[]{}| '

            extracted_text = pytesseract.image_to_string(pil_image, config=custom_config)

            data = pytesseract.image_to_data(
                pil_image, output_type=pytesseract.Output.DICT, config=custom_config
            )

            # Depending on the pytesseract/Tesseract versions, 'conf' entries
            # may be ints, floats or numeric strings such as '96.5'. Parsing
            # through float() accepts all of them, whereas int('96.5') raises
            # and would discard an otherwise successful OCR run.
            confidences = []
            for raw_conf in data['conf']:
                try:
                    conf_value = float(raw_conf)
                except (TypeError, ValueError):
                    continue
                # Non-positive values (e.g. -1) mark non-word boxes.
                if conf_value > 0:
                    confidences.append(conf_value)
            avg_confidence = sum(confidences) / len(confidences) if confidences else 0.0

            cleaned_text = self.clean_extracted_text(extracted_text)

            logger.info(f"Enhanced OCR completed. Extracted {len(cleaned_text.split())} words with avg confidence {avg_confidence:.2f}")

            # Log the first few lines to help diagnose bad scans.
            lines = cleaned_text.split('\n')[:5]
            logger.info(f"First few lines extracted: {lines}")

            return cleaned_text, avg_confidence

        except Exception as e:
            logger.error(f"Enhanced OCR text extraction failed: {e}")
            return "", 0.0
    
    def clean_extracted_text(self, text: str) -> str:
        """Strip OCR output line by line and drop lines that look like noise.

        A line is discarded when, after stripping surrounding whitespace, it
        is shorter than two characters or more than 70% of its characters are
        neither alphanumeric nor a space.

        Args:
            text: Raw OCR output; lines are separated by ``'\\n'``.

        Returns:
            The surviving stripped lines joined with ``'\\n'``, or ``""`` when
            ``text`` is empty or ``None``.
        """
        if not text:
            return ""

        def is_noise(candidate: str) -> bool:
            """Return True for lines too short or too symbol-heavy to keep."""
            if len(candidate) < 2:
                return True
            symbol_count = sum(
                1 for ch in candidate if not (ch.isalnum() or ch == ' ')
            )
            return symbol_count / len(candidate) > 0.7

        # Split on '\n' only (not splitlines) so other control characters
        # are handled by strip() exactly as before.
        stripped_lines = (raw_line.strip() for raw_line in text.split('\n'))
        return '\n'.join(entry for entry in stripped_lines if not is_noise(entry))
    
    def parse_receipt_data(self, ocr_text: str) -> Dict:
        """Extract structured receipt fields from OCR text.

        Each field is located independently by a private helper that scans
        the text line by line and returns the first acceptable hit.

        Args:
            ocr_text: OCR output with lines separated by ``'\\n'``.

        Returns:
            A dict with the keys ``store_name``, ``store_address``,
            ``receipt_date`` (``'YYYY-MM-DD'``), ``receipt_time``
            (24-hour ``'HH:MM:SS'``), ``total_amount`` (float), ``currency``
            (always ``'ZAR'``), ``items_count`` (at least 1) and
            ``receipt_number``. Fields that cannot be found are ``None``;
            ``store_address`` is never populated. If parsing raises, the
            error is logged and an empty dict is returned.
        """
        try:
            lines = ocr_text.split('\n')
            data = {
                'store_name': self._find_store_name(lines),
                'store_address': None,
                'receipt_date': self._find_receipt_date(lines),
                'receipt_time': self._find_receipt_time(lines),
                'total_amount': self._find_total_amount(lines),
                'currency': 'ZAR',
                'items_count': self._count_items(lines),
                'receipt_number': self._find_receipt_number(lines),
            }
            logger.info(f"Receipt data parsed: {data}")
            return data

        except Exception as e:
            logger.error(f"Receipt data parsing failed: {e}")
            return {}

    @staticmethod
    def _find_store_name(lines: List[str]) -> Optional[str]:
        """Return the first known retailer named in the first ten lines."""
        retailers = (
            'PICK N PAY', 'SHOPRITE', 'CHECKERS', 'SPAR', 'WOOLWORTHS',
            'GAME', 'MAKRO', 'DISCHEM', 'CLICKS', 'PEP', 'ACKERMANS',
            'MR PRICE', 'EDGARS', 'FOSCHINI', 'TRUWORTHS', 'CAPITEC',
            'ABSA', 'FNB', 'STANDARD BANK', 'NEDBANK',
        )
        for line in lines[:10]:
            line_upper = line.upper()
            # Substring match on purpose: it also catches compound brand
            # names such as "SUPERSPAR".
            for retailer in retailers:
                if retailer in line_upper:
                    return retailer
        return None

    @staticmethod
    def _find_receipt_date(lines: List[str]) -> Optional[str]:
        """Return the first parseable date as 'YYYY-MM-DD', or None."""
        # Each pattern is paired with the strptime formats that can parse it.
        # The digit lookarounds stop a pattern from matching a fragment of a
        # longer date (e.g. "31-02-20" inside "31-02-2023").
        date_specs = (
            (r'(?<!\d)(\d{4}[-/]\d{2}[-/]\d{2})(?!\d)',
             ('%Y-%m-%d', '%Y/%m/%d')),
            (r'(?<!\d)(\d{2}[-/]\d{2}[-/]\d{4})(?!\d)',
             ('%d-%m-%Y', '%d/%m/%Y')),
            # Two-digit years: day-first is tried first; year-first is kept
            # only as a fallback for values that are not valid day-first.
            (r'(?<!\d)(\d{2}[-/]\d{2}[-/]\d{2})(?!\d)',
             ('%d-%m-%y', '%d/%m/%y', '%y-%m-%d', '%y/%m/%d')),
            (r'(?<!\d)(\d{1,2}\s+[A-Za-z]{3,9}\s+\d{4})(?!\d)',
             ('%d %b %Y', '%d %B %Y')),
        )
        for line in lines:
            for pattern, formats in date_specs:
                match = re.search(pattern, line)
                if not match:
                    continue
                for fmt in formats:
                    try:
                        parsed = datetime.strptime(match.group(1), fmt)
                    except ValueError:
                        continue
                    return parsed.strftime('%Y-%m-%d')
        return None

    @staticmethod
    def _find_receipt_time(lines: List[str]) -> Optional[str]:
        """Return the first valid time as 24-hour 'HH:MM:SS', or None."""
        # The trailing \b keeps "12:30 AMOUNT" from being read as "12:30 AM".
        twelve_hour = r'(?<!\d)(\d{1,2}):(\d{2})(?::(\d{2}))?\s*([AP])M\b'
        twenty_four_hour = r'(?<!\d)(\d{2}):(\d{2})(?::(\d{2}))?(?!\d)'

        for line in lines:
            # AM/PM times are checked first; otherwise the plain HH:MM
            # pattern would match them and silently drop the meridiem.
            for match in re.finditer(twelve_hour, line, re.IGNORECASE):
                hour = int(match.group(1))
                minute = int(match.group(2))
                second = int(match.group(3) or 0)
                if 1 <= hour <= 12 and minute < 60 and second < 60:
                    hour %= 12  # 12 AM -> 00, 12 PM -> 12 after the offset
                    if match.group(4).upper() == 'P':
                        hour += 12
                    return f"{hour:02d}:{minute:02d}:{second:02d}"

            for match in re.finditer(twenty_four_hour, line):
                hour = int(match.group(1))
                minute = int(match.group(2))
                second = int(match.group(3) or 0)
                if hour < 24 and minute < 60 and second < 60:
                    return f"{hour:02d}:{minute:02d}:{second:02d}"
        return None

    @staticmethod
    def _find_total_amount(lines: List[str]) -> Optional[float]:
        """Return the first plausible total (R1 to R10000), or None."""
        amount_patterns = (
            r'TOTAL[:\s]*R?[\s]*(\d+[.,]\d{2})',
            r'AMOUNT[:\s]*R?[\s]*(\d+[.,]\d{2})',
            r'GRAND\s*TOTAL[:\s]*R?[\s]*(\d+[.,]\d{2})',
            r'BALANCE[:\s]*R?[\s]*(\d+[.,]\d{2})',
            r'R[\s]*(\d+[.,]\d{2})(?:\s*$|\s*TOTAL)',
            r'(\d+[.,]\d{2})[\s]*R?[\s]*$',
        )
        keywords = ('TOTAL', 'AMOUNT', 'BALANCE', 'DUE')

        candidates = [
            upper for upper in (line.upper() for line in lines)
            if any(keyword in upper for keyword in keywords)
        ]
        # "SUBTOTAL" contains "TOTAL" and usually appears before the real
        # total, so subtotal lines are only used when nothing else matches.
        subtotal = re.compile(r'SUB[\s-]*TOTAL')
        preferred = [entry for entry in candidates if not subtotal.search(entry)]
        fallback = [entry for entry in candidates if subtotal.search(entry)]

        for line_upper in preferred + fallback:
            for pattern in amount_patterns:
                match = re.search(pattern, line_upper)
                if not match:
                    continue
                # Receipts may use a decimal comma; normalise before parsing.
                amount = float(match.group(1).replace(',', '.'))
                if 1.0 <= amount <= 10000.0:
                    return amount
        return None

    @staticmethod
    def _count_items(lines: List[str]) -> int:
        """Estimate the number of item lines; never returns less than 1."""
        summary_keywords = ('TOTAL', 'SUBTOTAL', 'TAX', 'CHANGE', 'TENDERED')
        item_count = 0
        for line in lines:
            # An item line is taken to be any line longer than five
            # characters that contains a price-like number.
            if len(line.strip()) <= 5 or not re.search(r'\d+[.,]\d{2}', line):
                continue
            line_upper = line.upper()
            if any(keyword in line_upper for keyword in summary_keywords):
                continue
            item_count += 1
        return max(1, item_count)

    @staticmethod
    def _find_receipt_number(lines: List[str]) -> Optional[str]:
        """Return the digits following the first receipt/reference label."""
        receipt_patterns = (
            r'RECEIPT[:\s#]*(\d+)',
            r'REF[:\s#]*(\d+)',
            r'NO[:\s#]*(\d+)',
            r'TRANSACTION[:\s#]*(\d+)',
            r'TXN[:\s#]*(\d+)',
        )
        for line in lines:
            line_upper = line.upper()
            for pattern in receipt_patterns:
                match = re.search(pattern, line_upper)
                if match:
                    return match.group(1)
        return None
    
    def save_ocr_data(self, receipt_id: int, raw_text: str, parsed_data: Dict, confidence: float):
        """Insert one OCR result row into ``receipt_ocr_data``.

        Args:
            receipt_id: ID of the receipt the OCR result belongs to.
            raw_text: OCR text stored in ``raw_ocr_text``.
            parsed_data: Parsed fields; stored both as JSON in
                ``extracted_data`` and in individual columns. Missing keys
                are stored as NULL (``currency`` defaults to ``'ZAR'`` and
                ``items_count`` to 0).
            confidence: OCR confidence stored in ``ocr_confidence``.

        Returns:
            The ``lastrowid`` reported by the cursor for the inserted row.

        Raises:
            Exception: Any connection, serialisation or query error is
                logged and re-raised. The cursor and connection are closed
                in every case.
        """
        connection = None
        cursor = None
        try:
            connection = self.get_db_connection()
            cursor = connection.cursor()

            extracted_data_json = json.dumps(parsed_data)

            insert_query = """
                INSERT INTO receipt_ocr_data (
                    receipt_id, raw_ocr_text, extracted_data, store_name, store_address,
                    receipt_date, receipt_time, total_amount, currency, items_count,
                    receipt_number, ocr_confidence, processing_status, created_at
                ) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
            """

            values = (
                receipt_id,
                raw_text,
                extracted_data_json,
                parsed_data.get('store_name'),
                parsed_data.get('store_address'),
                parsed_data.get('receipt_date'),
                parsed_data.get('receipt_time'),
                parsed_data.get('total_amount'),
                parsed_data.get('currency', 'ZAR'),
                parsed_data.get('items_count', 0),
                parsed_data.get('receipt_number'),
                confidence,
                'completed',
                datetime.now(),
            )

            cursor.execute(insert_query, values)
            connection.commit()

            ocr_data_id = cursor.lastrowid
            logger.info(f"OCR data saved successfully with ID: {ocr_data_id}")
            return ocr_data_id

        except Exception as e:
            logger.error(f"Failed to save OCR data: {e}")
            raise

        finally:
            # Release DB resources on both success and failure. A failing
            # close() is only logged so it cannot mask the original error.
            for resource in (cursor, connection):
                if resource is None:
                    continue
                try:
                    resource.close()
                except Exception as close_error:
                    logger.warning(f"Failed to close database resource: {close_error}")
    
    def process_receipt(self, receipt_id: int) -> bool:
        """Run the full OCR pipeline for one uploaded receipt.

        Looks up the receipt in ``receipt_uploads`` (only rows whose
        ``upload_status`` is ``'uploaded'``), runs OCR on its file, parses
        the text and stores the result via ``save_ocr_data``.

        Args:
            receipt_id: Primary key of the row in ``receipt_uploads``.

        Returns:
            ``True`` when the OCR result was saved. ``False`` when the
            receipt row or file is missing, no text could be extracted, or
            any step raised (the error is logged, never propagated).
        """
        try:
            connection = self.get_db_connection()
            try:
                cursor = connection.cursor(dictionary=True)
                try:
                    cursor.execute("""
                        SELECT id, user_id, file_path, file_name, upload_status
                        FROM receipt_uploads
                        WHERE id = %s AND upload_status = 'uploaded'
                    """, (receipt_id,))
                    receipt = cursor.fetchone()
                finally:
                    cursor.close()
            finally:
                # Close the connection even if the query raised, and before
                # the slow OCR step so it is not held open while idle.
                connection.close()

            if not receipt:
                logger.error(f"Receipt {receipt_id} not found or not in uploaded status")
                return False

            # file_path is stored relative to the uploads directory.
            file_path = f"/var/www/html/uploads/{receipt['file_path']}"

            if not os.path.exists(file_path):
                logger.error(f"Receipt file not found: {file_path}")
                return False

            logger.info(f"Processing receipt {receipt_id}: {receipt['file_name']}")

            raw_text, confidence = self.extract_text_from_image(file_path)

            if not raw_text:
                logger.error(f"No text extracted from receipt {receipt_id}")
                return False

            parsed_data = self.parse_receipt_data(raw_text)

            self.save_ocr_data(receipt_id, raw_text, parsed_data, confidence)

            logger.info(f"Receipt {receipt_id} processed successfully")
            return True

        except Exception as e:
            logger.error(f"Receipt processing failed for ID {receipt_id}: {e}")
            return False

def main():
    """Command-line entry point: OCR-process the receipt given by --receipt-id.

    Database settings can be overridden with the environment variables
    ``OCR_DB_HOST``, ``OCR_DB_USER``, ``OCR_DB_PASSWORD`` and ``OCR_DB_NAME``;
    when a variable is unset the previous built-in value is used.

    Exits with status 0 when the receipt was processed and 1 otherwise.
    """
    parser = argparse.ArgumentParser(description='Process receipt with Tesseract OCR')
    parser.add_argument('--receipt-id', type=int, required=True, help='Receipt ID to process')

    args = parser.parse_args()

    # NOTE(review): the fallback credentials below are kept only so existing
    # deployments keep working. They should be supplied through the
    # environment, rotated, and removed from source control.
    db_config = {
        'host': os.environ.get('OCR_DB_HOST', 'localhost'),
        'user': os.environ.get('OCR_DB_USER', 'prime_usr'),
        'password': os.environ.get('OCR_DB_PASSWORD', '!1945@Tata!'),
        'database': os.environ.get('OCR_DB_NAME', 'prime_dbs_loyalty'),
        'charset': 'utf8mb4',
        'collation': 'utf8mb4_unicode_ci',
    }

    try:
        processor = TesseractReceiptOCRProcessor(db_config)
        success = processor.process_receipt(args.receipt_id)
    except Exception as e:
        logger.error(f"OCR processor failed: {e}")
        sys.exit(1)

    if not success:
        logger.error(f"Failed to process receipt {args.receipt_id}")
        sys.exit(1)

    logger.info(f"Receipt {args.receipt_id} processed successfully")
    sys.exit(0)

# Run the command-line entry point only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == "__main__":
    main()


