#!/usr/bin/env python3
"""
PDF to Text Converter with OCR Fallback
Processes all PDFs in the documents directory and extracts text.
Handles both native PDFs and scanned/image-based PDFs using OCR.
"""

import os
import subprocess
import sys
from pathlib import Path
from typing import Tuple

# Configuration
# Every path is anchored at the directory that contains this script.
BASE_DIR = Path(__file__).resolve().parent
PDF_DIR = BASE_DIR.joinpath("documents")  # input PDFs are read from here
TEXT_DIR = BASE_DIR.joinpath("text")  # extracted .txt files are written here
# A PDF counts as text-based only when its sampled text is longer than this
# many characters.
MIN_TEXT_LENGTH = 100


def setup_directories():
    """Create the text output directory if it does not exist yet.

    Missing parent directories are created as well, so TEXT_DIR can be
    pointed at a nested location without this call failing. The resulting
    path is printed so the user can see where output will be written.
    """
    TEXT_DIR.mkdir(parents=True, exist_ok=True)
    print(f"Output directory: {TEXT_DIR}")


def check_pdf_has_text(pdf_path: Path) -> Tuple[bool, str]:
    """
    Check if PDF has extractable text using pdftotext.

    Only the first three pages are sampled (``-l 3``). The PDF is treated as
    text-based when the whitespace-stripped sample is longer than
    MIN_TEXT_LENGTH characters.

    Returns (has_text, extracted_text). extracted_text is the stripped
    sample when has_text is True and "" otherwise. A timeout or any other
    failure to run pdftotext is reported on stdout and yields (False, "").
    """
    try:
        result = subprocess.run(
            ["pdftotext", "-l", "3", str(pdf_path), "-"],
            capture_output=True,
            text=True,
            # pdftotext writes UTF-8 by default. Decoding with the locale
            # codec could raise on non-UTF-8 systems, which would wrongly
            # classify a text PDF as scanned.
            encoding="utf-8",
            errors="replace",
            timeout=30
        )
    except subprocess.TimeoutExpired:
        print(f"  Warning: Timeout checking {pdf_path.name}")
        return False, ""
    except Exception as e:
        # Best-effort probe: e.g. pdftotext not installed.
        print(f"  Error checking {pdf_path.name}: {e}")
        return False, ""

    text = result.stdout.strip()
    has_text = len(text) > MIN_TEXT_LENGTH
    return has_text, text if has_text else ""


def extract_text_from_pdf(pdf_path: Path, output_path: Path) -> bool:
    """
    Extract text from PDF using ``pdftotext -layout``.

    pdftotext is run first and output_path is only created once it has
    succeeded. A failed or timed-out run therefore never leaves an empty
    output file behind that a later "output already exists" check would
    mistake for a finished conversion.

    Returns True if the text was extracted and written, False otherwise.
    Failures are reported on stdout rather than raised.
    """
    try:
        result = subprocess.run(
            ["pdftotext", "-layout", str(pdf_path), "-"],
            capture_output=True,
            text=True,
            # pdftotext writes UTF-8 by default; do not rely on the locale.
            encoding="utf-8",
            errors="replace",
            timeout=120
        )
    except subprocess.TimeoutExpired:
        print(f"  Timeout extracting text from {pdf_path.name}")
        return False
    except Exception as e:
        # e.g. pdftotext is not installed
        print(f"  Error: {e}")
        return False

    if result.returncode != 0:
        print(f"  pdftotext error: {result.stderr[:200]}")
        return False

    try:
        with open(output_path, 'w', encoding='utf-8') as f:
            f.write(result.stdout)
    except OSError as e:
        print(f"  Error: {e}")
        # Drop a partially written file so the PDF is retried next run.
        try:
            os.remove(output_path)
        except OSError:
            pass  # nothing was created, or it cannot be removed
        return False
    return True


def extract_with_ocr(pdf_path: Path, output_path: Path) -> bool:
    """
    Extract text using OCR (for scanned PDFs).

    The PDF is rendered to PPM images at 300 DPI with pdftoppm, tesseract is
    run on each image, and the recognized text of all pages is written to
    output_path, each page preceded by a ``--- Page N ---`` marker.

    Images are rendered into a private temporary directory that is removed
    when the function returns, including on timeouts and errors. This keeps
    images from different PDFs apart and works regardless of the characters
    in the PDF's file name.

    A page whose OCR fails is reported and skipped. Returns True if text
    for at least one page was written, False otherwise. Failures are
    reported on stdout rather than raised.
    """
    import tempfile

    try:
        all_text = []
        with tempfile.TemporaryDirectory(prefix="pdf_ocr_") as tmp_dir:
            # pdftoppm appends the page number and ".ppm" to this prefix.
            image_prefix = os.path.join(tmp_dir, "page")
            subprocess.run(
                ["pdftoppm", "-r", "300", str(pdf_path), image_prefix],
                capture_output=True,
                timeout=300
            )

            # Sort by file name so pages follow pdftoppm's numbering.
            images = sorted(Path(tmp_dir).glob("*.ppm"))
            if not images:
                print("  No images generated for OCR")
                return False

            # Run OCR on each image and collect the results in page order.
            for i, img in enumerate(images, 1):
                try:
                    result = subprocess.run(
                        ["tesseract", str(img), "stdout"],
                        capture_output=True,
                        text=True,
                        # Decode explicitly instead of via the locale codec.
                        encoding="utf-8",
                        errors="replace",
                        timeout=60
                    )
                except Exception as e:
                    # Best effort: one bad page must not abort the document.
                    print(f"  OCR error on page {i}: {e}")
                    continue
                if result.returncode == 0:
                    all_text.append(f"\n--- Page {i} ---\n")
                    all_text.append(result.stdout)

        if not all_text:
            return False

        # Write combined text
        with open(output_path, 'w', encoding='utf-8') as f:
            f.write(''.join(all_text))
        return True

    except subprocess.TimeoutExpired:
        print("  Timeout during OCR processing")
        return False
    except Exception as e:
        print(f"  OCR error: {e}")
        return False


def process_pdf(pdf_path: Path) -> str:
    """
    Convert one PDF into a text file inside TEXT_DIR.

    The output file is named after the PDF's stem with a ``.txt`` suffix.
    Returns one of the status strings:
      "skip"  - the output file already exists, nothing was done
      "text"  - text was extracted directly with pdftotext
      "ocr"   - text was obtained through OCR
      "error" - the chosen extraction method failed
    """
    target = TEXT_DIR / (pdf_path.stem + ".txt")

    # An existing output file means this PDF was handled on an earlier run.
    if target.exists():
        return "skip"

    text_layer_found = check_pdf_has_text(pdf_path)[0]

    if not text_layer_found:
        # No usable text layer: treat the PDF as scanned and run OCR.
        print("  Using OCR (scanned/image-based PDF)")
        succeeded = extract_with_ocr(pdf_path, target)
        return "ocr" if succeeded else "error"

    # A text layer is present: extract it directly.
    succeeded = extract_text_from_pdf(pdf_path, target)
    return "text" if succeeded else "error"


def main():
    """Convert every ``*.pdf`` in PDF_DIR and print a per-file log and summary."""
    setup_directories()

    # Collect the work list in a stable, sorted order.
    pdf_files = sorted(PDF_DIR.glob("*.pdf"))
    total = len(pdf_files)

    print(f"\nFound {total} PDF files to process\n")

    # Message shown for each status that process_pdf can return.
    outcome_messages = {
        "skip": "  Skipped (already processed)",
        "text": "  ✓ Extracted text",
        "ocr": "  ✓ Extracted with OCR",
        "error": "  ✗ Failed to process",
    }
    # One counter per status, all starting at zero.
    stats = dict.fromkeys(outcome_messages, 0)

    bytes_per_mb = 1024 * 1024
    for index, pdf_path in enumerate(pdf_files, start=1):
        size_mb = pdf_path.stat().st_size / bytes_per_mb
        print(f"[{index}/{total}] {pdf_path.name} ({size_mb:.1f} MB)")

        status = process_pdf(pdf_path)
        stats[status] += 1
        print(outcome_messages[status])

        print()

    # Summary table
    rule = "=" * 50
    summary_rows = (
        ("Total PDFs:        ", total),
        ("Already processed: ", stats["skip"]),
        ("Text extracted:    ", stats["text"]),
        ("OCR extracted:     ", stats["ocr"]),
        ("Errors:            ", stats["error"]),
    )

    print("\n" + rule)
    print("PROCESSING COMPLETE")
    print(rule)
    for label, value in summary_rows:
        print(f"{label}{value}")
    print(f"\nText files saved to: {TEXT_DIR}")
    print(rule)


# Run the conversion only when this file is executed as a script, not when
# it is imported as a module.
if __name__ == "__main__":
    main()
