#!/usr/bin/env python3
"""
Document Ingestion Pipeline for Research Development Framework.

This script processes documents from import folders through the full pipeline:
1. Extract metadata and content from source files
2. Convert to markdown format
3. Assess text quality (with optional OCR auto-repair)
4. AUTO-CLASSIFY using LLM/Taxonomist (NEW in v2.0)
5. Register in database with classification
6. Move to LOGICAL folder organization (by category/author)

IMPORT FOLDERS (checked in order):
    1. NEW_DOCS/           - Primary import folder (root directory, easy access)
    2. library/NEW_DOCS/incoming/  - Legacy import folder (still supported)

Usage:
    python ingest_documents.py                    # Process all import folders
    python ingest_documents.py --file path.pdf    # Process single file
    python ingest_documents.py --dry-run          # Preview without processing
    python ingest_documents.py --no-classify      # Skip auto-classification
    python ingest_documents.py --no-logical-org   # Use file-type folders
    python ingest_documents.py --source NEW_DOCS  # Process specific folder only
    python ingest_documents.py --auto-repair      # OCR fallback for poor quality PDFs
"""

import os
import sys
import hashlib
import shutil
import re
import logging
from pathlib import Path
from datetime import datetime
from typing import Optional, Dict, Any, Tuple
import argparse

# Document processing libraries
try:
    import pypdf
    HAS_PYPDF = True
except ImportError:
    HAS_PYPDF = False

try:
    from docx import Document as DocxDocument
    HAS_DOCX = True
except ImportError:
    HAS_DOCX = False

try:
    import tiktoken
    HAS_TIKTOKEN = True
except ImportError:
    HAS_TIKTOKEN = False

# Legacy DOC support - check for multiple extraction tools
import subprocess

def _check_command_available(cmd: str) -> bool:
    """
    Check if a command-line tool is available.

    The command is resolved against the executable search path rather than
    executed. Launching ``<cmd> --help`` just to see whether it exists is slow
    for heavyweight tools, can hit the timeout, and raises ``PermissionError``
    (not a ``SubprocessError``) when the file exists but is not executable.

    Args:
        cmd: Name (or path) of the executable to look for.

    Returns:
        True if an executable with that name can be found, False otherwise.
    """
    return shutil.which(cmd) is not None

# Check available DOC extraction tools (in order of preference)
HAS_PANDOC = _check_command_available('pandoc')
HAS_LIBREOFFICE = _check_command_available('libreoffice')
HAS_ANTIWORD = _check_command_available('antiword')

# At least one DOC extractor available?
HAS_DOC_EXTRACTOR = HAS_PANDOC or HAS_LIBREOFFICE or HAS_ANTIWORD

# RTF support
try:
    from striprtf.striprtf import rtf_to_text
    HAS_RTF = True
except ImportError:
    HAS_RTF = False

# ODT support (OpenDocument Text)
try:
    from odf import text as odf_text
    from odf.opendocument import load as odf_load
    HAS_ODT = True
except ImportError:
    HAS_ODT = False

# CSV support (built-in)
import csv
HAS_CSV = True

# JSON support (built-in)
import json
HAS_JSON = True

# XML/HTML support
try:
    from bs4 import BeautifulSoup
    HAS_BS4 = True
except ImportError:
    HAS_BS4 = False

from config import PATHS, PROCESSING_CONFIG, QUALITY_THRESHOLDS, LOGGING_CONFIG
from db_utils import insert_document, get_db_connection, execute_query

# Try to import Taxonomist for auto-classification
try:
    from taxonomist import Taxonomist
    HAS_TAXONOMIST = True
except ImportError:
    HAS_TAXONOMIST = False

# OCR support for auto-repair (optional)
try:
    import pytesseract
    from PIL import Image, ImageEnhance, ImageFilter
    HAS_TESSERACT = True
except ImportError:
    HAS_TESSERACT = False

try:
    from pdf2image import convert_from_path
    HAS_PDF2IMAGE = True
except ImportError:
    HAS_PDF2IMAGE = False

# Combined OCR availability
HAS_OCR = HAS_TESSERACT and HAS_PDF2IMAGE

# Setup logging
logging.basicConfig(
    level=getattr(logging, LOGGING_CONFIG['level']),
    format=LOGGING_CONFIG['format']
)
logger = logging.getLogger(__name__)


class DocumentProcessor:
    """Handles document ingestion and processing."""

    def __init__(
        self,
        dry_run: bool = False,
        enable_classification: bool = True,
        logical_organization: bool = True,
        enable_auto_repair: bool = False
    ):
        """
        Initialize the document processor.

        Args:
            dry_run: Preview without making changes
            enable_classification: Use Taxonomist for auto-classification
            logical_organization: Organize files by category/author instead of file type
            enable_auto_repair: Auto-OCR PDFs with poor text quality
        """
        self.dry_run = dry_run
        self.enable_classification = enable_classification and HAS_TAXONOMIST
        self.logical_organization = logical_organization
        self.enable_auto_repair = enable_auto_repair and HAS_OCR
        self.stats = {
            'processed': 0,
            'failed': 0,
            'skipped': 0,
            'low_quality': 0,
            'classified': 0,
            'ocr_repaired': 0
        }

        # Initialize taxonomist if classification is enabled
        self.taxonomist = None
        if self.enable_classification:
            try:
                self.taxonomist = Taxonomist()
                logger.info("Taxonomist initialized for auto-classification")
            except Exception as e:
                logger.warning(f"Failed to initialize Taxonomist: {e}")
                self.enable_classification = False

    def calculate_file_hash(self, file_path: Path) -> str:
        """Return the hexadecimal SHA-256 digest of the file's bytes."""
        digest = hashlib.sha256()
        with open(file_path, "rb") as stream:
            # Feed the hash in fixed-size chunks so large files are never
            # held in memory in full.
            while True:
                chunk = stream.read(4096)
                if not chunk:
                    break
                digest.update(chunk)
        return digest.hexdigest()

    def extract_metadata_from_filename(self, filename: str) -> Dict[str, Any]:
        """
        Extract metadata from filename patterns.

        Supported patterns:
        - Author_Name_Title_Year.pdf
        - Title_(Author)_Year.pdf
        - Simple_Title.pdf

        Args:
            filename: File name (with or without directory/extension).

        Returns:
            Dict with keys 'title' (str), 'author' (str or None) and
            'year' (int or None). The title is whatever remains of the stem
            after the year and a parenthesised author have been removed, with
            underscores turned into spaces and runs of whitespace collapsed.
        """
        metadata = {
            'title': None,
            'author': None,
            'year': None
        }

        # Remove extension
        name = Path(filename).stem

        # A year is a *standalone* run of exactly four digits. Longer digit
        # runs (IDs, date stamps) are left alone so they are not cut in half,
        # and every candidate is tried so a leading "0012" does not hide a
        # later "1999".
        latest_year = datetime.now().year + 1
        for year_match in re.finditer(r'(?<!\d)\d{4}(?!\d)', name):
            year = int(year_match.group())
            if 1400 <= year <= latest_year:
                metadata['year'] = year
                # Remove only the matched occurrence, not every copy of the
                # same digits elsewhere in the name.
                name = name[:year_match.start()] + name[year_match.end():]
                # "(1999)" would otherwise leave an empty "()" in the title.
                name = re.sub(r'\([\s_]*\)', '', name).strip('_- ')
                break

        # Try to extract author in parentheses
        author_match = re.search(r'\(([^)]+)\)', name)
        if author_match:
            metadata['author'] = author_match.group(1).replace('_', ' ').strip()
            name = re.sub(r'\([^)]+\)', '', name).strip('_- ')

        # Clean up title; removing a token from the middle of the name leaves
        # doubled separators, so collapse them.
        metadata['title'] = re.sub(r'\s+', ' ', name.replace('_', ' ')).strip()

        return metadata

    def extract_text_from_pdf(self, file_path: Path) -> Tuple[str, Dict]:
        """
        Read a PDF's embedded text layer page by page, tracking page offsets.

        Returns:
            Tuple of (text_content, metadata). ``text_content`` is the text of
            every page that yielded any, joined by a blank line. ``metadata``
            holds:
            - title / author / subject: from the PDF document info, when the
              file has any
            - page_count: Total number of pages
            - page_map: List of dicts, one per page that yielded text, with
              keys 'page' (1-based), 'start_char', 'end_char' and
              'char_count' locating that page inside ``text_content``
            - page_texts: Text of every page in order ('' for empty pages)

        Raises:
            ImportError: If pypdf is not installed.
        """
        if not HAS_PYPDF:
            raise ImportError("pypdf not installed. Run: pip install pypdf")

        separator = '\n\n'
        info = {}
        pages_with_text = []
        all_pages = []
        spans = []
        offset = 0  # where the next non-empty page starts in the joined text

        with open(file_path, 'rb') as handle:
            pdf = pypdf.PdfReader(handle)

            # Document-level properties, if the PDF carries any.
            doc_info = pdf.metadata
            if doc_info:
                for key, pdf_key in (
                    ('title', '/Title'),
                    ('author', '/Author'),
                    ('subject', '/Subject'),
                ):
                    info[key] = doc_info.get(pdf_key, '')

            info['page_count'] = len(pdf.pages)

            for number, page in enumerate(pdf.pages, start=1):
                page_text = page.extract_text()
                if not page_text:
                    # Keep a placeholder so page_texts stays aligned with
                    # physical page numbers.
                    all_pages.append('')
                    continue

                pages_with_text.append(page_text)
                all_pages.append(page_text)

                length = len(page_text)
                spans.append({
                    'page': number,
                    'start_char': offset,
                    'end_char': offset + length,
                    'char_count': length
                })
                # The next page begins after this text plus the separator.
                offset += length + len(separator)

        info['page_map'] = spans
        info['page_texts'] = all_pages

        return separator.join(pages_with_text), info

    def extract_text_via_ocr(self, file_path: Path, dpi: int = 300) -> Tuple[str, Dict]:
        """
        Extract text from PDF using OCR (for scanned documents).

        This is a fallback when pypdf extraction produces poor quality text.
        Requires pytesseract and pdf2image.

        Page images are rendered into a temporary directory and opened one at
        a time, so memory use does not grow with the number of pages.

        Args:
            file_path: Path to the PDF file
            dpi: Resolution for image conversion (higher = better quality but slower)

        Returns:
            Tuple of (text_content, metadata). ``text_content`` is the text of
            every page that produced any, joined by a blank line. ``metadata``
            holds:
            - ocr_method: 'tesseract'
            - page_count: Total number of rendered pages
            - page_map: List of dicts ('page', 'start_char', 'end_char',
              'char_count') locating each non-empty page in ``text_content``
            - page_texts: One entry per page ('' when the page was empty or
              OCR failed for it)

        Raises:
            ImportError: If the OCR libraries are not installed.
            Exception: Any error raised while rendering the PDF to images is
                logged and re-raised. Per-page OCR errors are logged and the
                page is recorded as empty.
        """
        if not HAS_OCR:
            raise ImportError("OCR libraries not installed. Run: pip install pytesseract pdf2image")

        import tempfile

        metadata = {'ocr_method': 'tesseract'}
        text_content = []
        page_texts = []
        page_map = []
        current_pos = 0

        # Create temp directory for images
        with tempfile.TemporaryDirectory(prefix='ocr_') as temp_dir:
            # Convert PDF to images on disk; asking for paths only avoids
            # holding every page (or an open file handle per page) at once.
            logger.info(f"  Converting PDF to images at {dpi} DPI for OCR...")
            try:
                image_paths = convert_from_path(
                    str(file_path),
                    dpi=dpi,
                    fmt='png',
                    output_folder=temp_dir,
                    paths_only=True,
                )
            except Exception as e:
                logger.error(f"  Failed to convert PDF to images: {e}")
                raise

            total_pages = len(image_paths)
            metadata['page_count'] = total_pages
            logger.info(f"  Processing {total_pages} pages with Tesseract OCR...")

            for page_num, image_path in enumerate(image_paths, start=1):
                try:
                    with Image.open(image_path) as page_image:
                        # Preprocess image for better OCR: greyscale,
                        # stronger contrast, then sharpen.
                        img = page_image if page_image.mode == 'L' else page_image.convert('L')
                        img = ImageEnhance.Contrast(img).enhance(1.5)
                        img = img.filter(ImageFilter.SHARPEN)

                        # OCR the image
                        text = pytesseract.image_to_string(img, config='--oem 1 --psm 3')
                except Exception as e:
                    # One bad page must not abort the whole document.
                    logger.warning(f"  OCR failed for page {page_num}: {e}")
                    page_texts.append('')
                    continue

                if text:
                    text_content.append(text)
                    page_texts.append(text)

                    # Track character positions; pages are joined with a
                    # two-character separator.
                    text_len = len(text) + 2
                    page_map.append({
                        'page': page_num,
                        'start_char': current_pos,
                        'end_char': current_pos + text_len - 2,
                        'char_count': len(text)
                    })
                    current_pos += text_len
                else:
                    page_texts.append('')

                if page_num % 10 == 0:
                    logger.info(f"    Processed page {page_num}/{total_pages}")

        metadata['page_map'] = page_map
        metadata['page_texts'] = page_texts

        return '\n\n'.join(text_content), metadata

    def get_page_for_position(self, char_pos: int, page_map: list) -> int:
        """
        Look up which page a character offset of the joined text falls on.

        Args:
            char_pos: Character position in the joined text
            page_map: Page mapping dicts with 'page', 'start_char' and
                'end_char' keys, as built by the PDF extractors

        Returns:
            The 1-indexed page number of the first entry whose inclusive
            [start_char, end_char] range contains ``char_pos``, or 0 when no
            entry does.
        """
        matching_pages = (
            entry['page']
            for entry in page_map
            if entry['start_char'] <= char_pos <= entry['end_char']
        )
        return next(matching_pages, 0)

    def extract_text_from_docx(self, file_path: Path) -> Tuple[str, Dict]:
        """
        Read paragraph text and core properties from a DOCX file.

        Returns:
            Tuple of (text_content, metadata). ``text_content`` is every
            non-blank paragraph joined by a blank line; ``metadata`` carries
            'title', 'author' and 'subject' ('' when a property is unset).

        Raises:
            ImportError: If python-docx is not installed.
        """
        if not HAS_DOCX:
            raise ImportError("python-docx not installed. Run: pip install python-docx")

        document = DocxDocument(file_path)
        info = {}

        # Core properties; unset values are normalised to empty strings.
        props = document.core_properties
        if props:
            info['title'] = props.title or ''
            info['author'] = props.author or ''
            info['subject'] = props.subject or ''

        # Keep only paragraphs that contain something other than whitespace.
        paragraphs = [
            paragraph.text
            for paragraph in document.paragraphs
            if paragraph.text.strip()
        ]

        return '\n\n'.join(paragraphs), info

    def extract_text_from_txt(self, file_path: Path) -> Tuple[str, Dict]:
        """
        Read a plain text file as UTF-8, dropping undecodable bytes.

        Returns:
            Tuple of (file contents, empty metadata dict).
        """
        text = Path(file_path).read_text(encoding='utf-8', errors='ignore')
        return text, {}

    def extract_text_from_markdown(self, file_path: Path) -> Tuple[str, Dict]:
        """
        Extract text content from markdown file.

        If the file begins with a ``---`` delimited front matter block, its
        simple ``key: value`` lines are parsed into metadata and the block is
        removed from the returned content. This is a minimal line-based
        parser, not a YAML implementation: nested structures and multi-line
        values are not interpreted.

        Returns:
            Tuple of (content, metadata). All metadata values are strings;
            surrounding single or double quotes are removed so that quoted
            values (such as ``title: "My Doc"``) round-trip cleanly.
        """
        with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
            content = f.read()

        metadata = {}

        # Try to extract YAML front matter
        if content.startswith('---'):
            # The closing delimiter may be followed by a newline or may be
            # the very last line of the file.
            end_match = re.search(r'\n---[ \t]*(?:\n|\Z)', content[3:])
            if end_match:
                # Offsets are relative to content[3:], hence the +3.
                front_matter = content[3:end_match.start() + 3]
                content = content[end_match.end() + 3:]

                # Parse basic YAML
                for line in front_matter.split('\n'):
                    if ':' not in line:
                        continue
                    key, _, value = line.partition(':')
                    value = value.strip()
                    if len(value) >= 2 and value[0] == value[-1]:
                        if value[0] == '"':
                            # Double-quoted scalar: undo \" and \\ escapes.
                            value = re.sub(r'\\(["\\])', r'\1', value[1:-1])
                        elif value[0] == "'":
                            # Single-quoted scalar: '' stands for one quote.
                            value = value[1:-1].replace("''", "'")
                    metadata[key.strip()] = value

        return content, metadata

    def extract_text_from_doc(self, file_path: Path) -> Tuple[str, Dict]:
        """
        Extract text content from legacy DOC file using a fallback chain.

        Tries extraction methods in order of preference and returns the first
        non-empty result:
        1. pandoc
        2. libreoffice (headless conversion to a text file)
        3. antiword (legacy fallback)

        Every failed attempt, including a tool that ran but produced no text
        or could not be launched, is recorded so the final error explains
        what was tried.

        Installation:
            # Recommended (best results)
            sudo apt install pandoc

            # Alternative (handles complex formatting)
            sudo apt install libreoffice

            # Legacy fallback (may fail on complex files)
            sudo apt install antiword

        Returns:
            Tuple of (text, metadata) where metadata['extraction_method']
            names the tool that succeeded.

        Raises:
            ImportError: If none of the external tools was detected.
            ValueError: If every available tool failed or produced no text.
        """
        if not HAS_DOC_EXTRACTOR:
            raise ImportError(
                "No DOC extraction tool available. Install ONE of:\n"
                "  Recommended: sudo apt install pandoc\n"
                "  Alternative: sudo apt install libreoffice\n"
                "  Legacy:      sudo apt install antiword"
            )

        import tempfile

        errors = []
        file_path_str = str(file_path)

        # Method 1: pandoc
        # NOTE(review): pandoc's documented input formats do not appear to
        # include legacy binary "doc"; if so this attempt always fails and
        # falls through to the next tool - confirm against the installed
        # pandoc version.
        if HAS_PANDOC:
            try:
                result = subprocess.run(
                    ['pandoc', '-f', 'doc', '-t', 'plain', file_path_str],
                    capture_output=True,
                    text=True,
                    errors='replace',  # odd bytes must not abort the chain
                    check=True,
                    timeout=60
                )
            except subprocess.CalledProcessError as e:
                errors.append(f"pandoc: {e.stderr[:200] if e.stderr else 'failed'}")
            except subprocess.TimeoutExpired:
                errors.append("pandoc: timeout")
            except OSError as e:
                # Tool disappeared or is not executable since detection.
                errors.append(f"pandoc: {e}")
            else:
                if result.stdout.strip():
                    logger.info(f"Extracted DOC using pandoc: {file_path.name}")
                    return result.stdout, {'extraction_method': 'pandoc'}
                errors.append("pandoc: empty output")

        # Method 2: LibreOffice (good for complex documents)
        if HAS_LIBREOFFICE:
            try:
                with tempfile.TemporaryDirectory() as tmpdir:
                    subprocess.run(
                        [
                            'libreoffice', '--headless', '--convert-to',
                            'txt:Text', '--outdir', tmpdir, file_path_str
                        ],
                        capture_output=True,
                        text=True,  # so stderr is str in the error summary
                        errors='replace',
                        check=True,
                        timeout=120
                    )

                    # The converter writes <stem>.txt into the output dir.
                    txt_file = Path(tmpdir) / (file_path.stem + '.txt')
                    if txt_file.exists():
                        content = txt_file.read_text(encoding='utf-8', errors='ignore')
                        if content.strip():
                            logger.info(f"Extracted DOC using libreoffice: {file_path.name}")
                            return content, {'extraction_method': 'libreoffice'}
                        errors.append("libreoffice: empty output")
                    else:
                        errors.append("libreoffice: no output file generated")
            except subprocess.CalledProcessError as e:
                errors.append(f"libreoffice: {e.stderr[:200] if e.stderr else 'failed'}")
            except subprocess.TimeoutExpired:
                errors.append("libreoffice: timeout")
            except OSError as e:
                errors.append(f"libreoffice: {e}")

        # Method 3: antiword (legacy fallback)
        if HAS_ANTIWORD:
            try:
                result = subprocess.run(
                    ['antiword', file_path_str],
                    capture_output=True,
                    text=True,
                    errors='replace',
                    check=True,
                    timeout=30
                )
            except subprocess.CalledProcessError as e:
                errors.append(f"antiword: {e.stderr[:200] if e.stderr else 'failed'}")
            except subprocess.TimeoutExpired:
                errors.append("antiword: timeout")
            except OSError as e:
                errors.append(f"antiword: {e}")
            else:
                if result.stdout.strip():
                    logger.info(f"Extracted DOC using antiword: {file_path.name}")
                    return result.stdout, {'extraction_method': 'antiword'}
                errors.append("antiword: empty output")

        # All methods failed
        error_summary = "; ".join(errors) if errors else "No extraction methods available"
        raise ValueError(
            f"Could not extract text from {file_path.name}. "
            f"Errors: {error_summary}\n"
            f"Available tools: pandoc={HAS_PANDOC}, libreoffice={HAS_LIBREOFFICE}, antiword={HAS_ANTIWORD}"
        )

    def extract_text_from_rtf(self, file_path: Path) -> Tuple[str, Dict]:
        """
        Convert an RTF file to plain text with striprtf.

        Returns:
            Tuple of (plain text, empty metadata dict).

        Raises:
            ImportError: If striprtf is not installed.
        """
        if not HAS_RTF:
            raise ImportError("striprtf not installed. Run: pip install striprtf")

        # The raw markup is read as UTF-8 with undecodable bytes dropped.
        raw_rtf = Path(file_path).read_text(encoding='utf-8', errors='ignore')
        return rtf_to_text(raw_rtf), {}

    def extract_text_from_odt(self, file_path: Path) -> Tuple[str, Dict]:
        """
        Read paragraph text and Dublin Core metadata from an ODT file.

        Only text held directly in a paragraph, or in a direct child element
        of a paragraph, is collected; deeper nesting is not descended into.

        Returns:
            Tuple of (text_content, metadata). ``text_content`` is every
            non-blank paragraph (stripped) joined by a blank line;
            ``metadata`` may carry 'title', 'author' and 'subject'.

        Raises:
            ImportError: If odfpy is not installed.
        """
        if not HAS_ODT:
            raise ImportError("odfpy not installed. Run: pip install odfpy")

        document = odf_load(file_path)
        info = {}

        # Map the metadata element names of interest onto our own keys.
        tag_to_key = {
            'dc:title': 'title',
            'dc:creator': 'author',
            'dc:subject': 'subject',
        }
        meta = document.meta
        if meta:
            for entry in meta.childNodes:
                key = tag_to_key.get(entry.tagName)
                if key is not None:
                    info[key] = str(entry)

        paragraphs = []
        for paragraph in document.getElementsByType(odf_text.P):
            pieces = []
            for node in paragraph.childNodes:
                if hasattr(node, 'data'):
                    # Text sitting directly inside the paragraph.
                    pieces.append(node.data)
                elif hasattr(node, 'childNodes'):
                    # One level down: text inside a child element.
                    pieces.extend(
                        inner.data
                        for inner in node.childNodes
                        if hasattr(inner, 'data')
                    )
            paragraph_text = ''.join(pieces).strip()
            if paragraph_text:
                paragraphs.append(paragraph_text)

        return '\n\n'.join(paragraphs), info

    def extract_text_from_html(self, file_path: Path) -> Tuple[str, Dict]:
        """
        Extract text content from HTML file.

        Uses BeautifulSoup when it is installed; otherwise falls back to
        regular-expression tag stripping. Both paths drop script/style
        content, decode character entities and report the document title.

        Returns:
            Tuple of (content, metadata). ``content`` is the non-blank lines
            of visible text joined by blank lines. ``metadata`` may carry
            'title' and, on the BeautifulSoup path only, 'author'.
        """
        with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
            html_content = f.read()

        metadata = {}

        if HAS_BS4:
            soup = BeautifulSoup(html_content, 'html.parser')

            # Extract metadata from head
            title_tag = soup.find('title')
            if title_tag:
                metadata['title'] = title_tag.get_text().strip()

            meta_author = soup.find('meta', attrs={'name': 'author'})
            if meta_author:
                metadata['author'] = meta_author.get('content', '')

            # Remove non-content elements (scripts, styles, page chrome)
            for script in soup(['script', 'style', 'nav', 'footer', 'header']):
                script.decompose()

            # Get text
            content = soup.get_text(separator='\n\n')
        else:
            # Basic HTML stripping without BeautifulSoup
            import html as html_lib

            # Tag names are case-insensitive in HTML, so <SCRIPT> and
            # <Style> must be matched too.
            flags = re.DOTALL | re.IGNORECASE

            title_match = re.search(r'<title[^>]*>(.*?)</title>', html_content, flags=flags)
            if title_match:
                raw_title = re.sub(r'\s+', ' ', title_match.group(1))
                metadata['title'] = html_lib.unescape(raw_title).strip()

            content = re.sub(r'<script[^>]*>.*?</script>', '', html_content, flags=flags)
            content = re.sub(r'<style[^>]*>.*?</style>', '', content, flags=flags)
            content = re.sub(r'<[^>]+>', ' ', content)
            # Decode entities only after the tags are gone, so an escaped
            # "&lt;b&gt;" stays as literal text instead of being stripped.
            content = html_lib.unescape(content)
            content = re.sub(r'\s+', ' ', content)

        # Clean up whitespace
        lines = [line.strip() for line in content.split('\n') if line.strip()]
        content = '\n\n'.join(lines)

        return content, metadata

    def extract_text_from_csv(self, file_path: Path) -> Tuple[str, Dict]:
        """
        Extract text content from CSV file.

        Converts CSV to readable text format with headers and rows. The first
        row is treated as the header; each following row is rendered as
        ``header: value`` lines, skipping blank cells. At most 1000 data rows
        are rendered, with a trailing note counting the rest.

        Returns:
            Tuple of (text, metadata). metadata always has 'format'; for a
            non-empty file it also has 'columns' (header count) and 'rows'
            (number of data rows in the file, not just those rendered).
        """
        text_content = []
        metadata = {'format': 'csv'}

        # 'utf-8-sig' drops a leading byte-order mark (common in spreadsheet
        # exports) that would otherwise become part of the first header name.
        with open(file_path, 'r', encoding='utf-8-sig', errors='ignore', newline='') as f:
            # Detect dialect from the start of the file
            sample = f.read(4096)
            f.seek(0)

            try:
                dialect = csv.Sniffer().sniff(sample)
            except csv.Error:
                # Sniffing failed; assume the standard comma-separated form.
                dialect = csv.excel

            reader = csv.reader(f, dialect)
            rows = list(reader)

        if not rows:
            return '', metadata

        # First row as headers
        headers = rows[0]
        data_row_count = len(rows) - 1
        metadata['columns'] = len(headers)
        metadata['rows'] = data_row_count

        # Format as readable text
        text_content.append(f"# CSV Data: {file_path.stem}")
        text_content.append(f"Columns: {', '.join(headers)}")
        text_content.append(f"Total Rows: {data_row_count}")
        text_content.append("")

        # Add data rows (limit to prevent huge files)
        max_rows = 1000
        for i, row in enumerate(rows[1:max_rows + 1], 1):
            # zip stops at the shorter of header/row, so ragged rows are
            # rendered as far as they line up with a header.
            row_text = [
                f"{header}: {value}"
                for header, value in zip(headers, row)
                if value.strip()
            ]
            if row_text:
                text_content.append(f"Row {i}:")
                text_content.append('\n'.join(row_text))
                text_content.append("")

        if data_row_count > max_rows:
            text_content.append(f"... and {data_row_count - max_rows} more rows")

        return '\n'.join(text_content), metadata

    def extract_text_from_json(self, file_path: Path) -> Tuple[str, Dict]:
        """
        Extract text content from JSON file.

        Converts JSON to readable text format by collecting string values
        longer than 10 characters from nested dicts and lists. Dict values
        are rendered as ``key: value``; list items as the bare string. Only
        the first 100 items of any list are visited, nesting deeper than 10
        levels is ignored, and at most 500 collected entries are emitted.

        Returns:
            Tuple of (text, metadata) where metadata is {'format': 'json'}.

        Raises:
            json.JSONDecodeError: If the file is not valid JSON.
        """
        # 'utf-8-sig' drops a leading byte-order mark, which the JSON parser
        # otherwise rejects.
        with open(file_path, 'r', encoding='utf-8-sig', errors='ignore') as f:
            data = json.load(f)

        metadata = {'format': 'json'}

        def extract_text_values(obj, depth=0, max_depth=10):
            """Recursively extract text values from JSON."""
            if depth > max_depth:
                return []

            texts = []
            if isinstance(obj, dict):
                for key, value in obj.items():
                    if isinstance(value, str) and len(value) > 10:
                        texts.append(f"{key}: {value}")
                    elif isinstance(value, (dict, list)):
                        texts.extend(extract_text_values(value, depth + 1, max_depth))
            elif isinstance(obj, list):
                for item in obj[:100]:  # Limit list items
                    if isinstance(item, str) and len(item) > 10:
                        texts.append(item)
                    elif isinstance(item, (dict, list)):
                        texts.extend(extract_text_values(item, depth + 1, max_depth))
            return texts

        text_values = extract_text_values(data)

        content = [f"# JSON Data: {file_path.stem}", ""]
        content.extend(text_values[:500])  # Limit total entries

        return '\n\n'.join(content), metadata

    def extract_text_from_xml(self, file_path: Path) -> Tuple[str, Dict]:
        """
        Extract text content from XML file.

        Uses BeautifulSoup's XML parser when it is usable; if BeautifulSoup
        is missing, or is installed without an XML-capable parser backend,
        falls back to regular-expression tag stripping instead of failing.

        Returns:
            Tuple of (content, metadata). ``content`` is the non-blank lines
            of text joined by blank lines; metadata is {'format': 'xml'}.
        """
        with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
            xml_content = f.read()

        metadata = {'format': 'xml'}

        content = None
        if HAS_BS4:
            from bs4 import FeatureNotFound
            try:
                soup = BeautifulSoup(xml_content, 'xml')
                content = soup.get_text(separator='\n\n')
            except FeatureNotFound:
                # The 'xml' feature needs an extra parser package; without
                # it we can still do a basic extraction below.
                logger.warning(
                    f"No XML parser available for BeautifulSoup; "
                    f"using basic extraction for {file_path.name}"
                )

        if content is None:
            # Basic XML text extraction
            import html as html_lib

            content = re.sub(r'<[^>]+>', ' ', xml_content)
            # Decode entities (&amp;, &lt;, numeric references) after the
            # tags are gone so escaped markup stays as literal text.
            content = html_lib.unescape(content)
            content = re.sub(r'\s+', ' ', content)

        # Clean up
        lines = [line.strip() for line in content.split('\n') if line.strip()]
        content = '\n\n'.join(lines)

        return content, metadata

    def assess_text_quality(self, text: str) -> Dict[str, Any]:
        """
        Assess the quality of extracted text.

        Text shorter than 100 characters or 50 words is graded 'very_poor'
        outright. Otherwise the grade is derived from three measurements
        compared against QUALITY_THRESHOLDS:
        - gibberish_ratio: share of characters that are neither word
          characters, whitespace nor common punctuation
        - avg_word_length: mean length of whitespace-separated words
        - sentence quality: share of '.', '!' or '?' delimited segments that
          contain at least three words

        Returns:
            Dict with the same keys on every path: 'quality_grade'
            ('very_poor', 'poor', 'fair', 'good' or 'excellent'),
            'gibberish_ratio', 'avg_word_length', 'sentence_quality_score',
            'do_not_process' (True for 'poor' and 'very_poor') and 'notes'
            (a reason string when do_not_process is set, else None).
        """
        if not text or len(text) < 100:
            return {
                'quality_grade': 'very_poor',
                'gibberish_ratio': 1.0,
                'avg_word_length': 0,
                # Present on every return path so callers can read it
                # without checking the grade first.
                'sentence_quality_score': 0,
                'do_not_process': True,
                'notes': 'Insufficient text content'
            }

        words = text.split()
        total_words = len(words)

        if total_words < 50:
            return {
                'quality_grade': 'very_poor',
                'gibberish_ratio': 1.0,
                'avg_word_length': 0,
                'sentence_quality_score': 0,
                'do_not_process': True,
                'notes': 'Too few words extracted'
            }

        # Calculate metrics
        avg_word_length = sum(len(w) for w in words) / total_words

        # Count potential gibberish (unusual character patterns)
        gibberish_pattern = re.compile(r'[^\w\s.,;:!?\'"()-]')
        gibberish_chars = len(gibberish_pattern.findall(text))
        gibberish_ratio = gibberish_chars / len(text)

        # Count proper sentences
        sentences = re.split(r'[.!?]+', text)
        valid_sentences = sum(1 for s in sentences if len(s.split()) >= 3)
        sentence_quality = valid_sentences / max(len(sentences), 1)

        # Determine grade; checks run from most to least severe and the
        # first one that applies wins.
        if gibberish_ratio > QUALITY_THRESHOLDS['gibberish_ratio_max']:
            grade = 'poor'
        elif avg_word_length < QUALITY_THRESHOLDS['min_avg_word_length']:
            grade = 'poor'
        elif avg_word_length > QUALITY_THRESHOLDS['max_avg_word_length']:
            grade = 'fair'
        elif sentence_quality < QUALITY_THRESHOLDS['min_sentence_quality']:
            grade = 'fair'
        elif gibberish_ratio < 0.02 and sentence_quality > 0.8:
            grade = 'excellent'
        else:
            grade = 'good'

        do_not_process = grade in ('poor', 'very_poor')

        return {
            'quality_grade': grade,
            'gibberish_ratio': round(gibberish_ratio, 4),
            'avg_word_length': round(avg_word_length, 2),
            'sentence_quality_score': round(sentence_quality, 2),
            'do_not_process': do_not_process,
            'notes': None if not do_not_process else f'Quality grade: {grade}'
        }

    def generate_document_id(self, title: str, sequence: Optional[int] = None) -> str:
        """
        Generate a unique document ID.

        The title is lower-cased, stripped of punctuation, its whitespace
        replaced by underscores, cut to 30 characters and upper-cased to form
        the ID suffix.

        Args:
            title: Document title to derive the suffix from.
            sequence: Optional sequence number. When given (including 0) the
                ID is ``DOC_<4-digit sequence>_<SUFFIX>``.

        Returns:
            ``DOC_<sequence>_<SUFFIX>`` when a sequence is supplied, otherwise
            ``DOC_<YYYYmmddHHMMSS>_<SUFFIX>`` using the current local time.
        """
        # Clean title for ID
        clean = re.sub(r'[^\w\s]', '', title.lower())
        clean = re.sub(r'\s+', '_', clean.strip())[:30]
        suffix = clean.upper()

        # Compare against None: 0 is a valid sequence number and must not
        # silently fall through to the timestamp form.
        if sequence is not None:
            return f"DOC_{sequence:04d}_{suffix}"

        # Use timestamp if no sequence
        timestamp = datetime.now().strftime('%Y%m%d%H%M%S')
        return f"DOC_{timestamp}_{suffix}"

    def create_markdown_output(
        self,
        text: str,
        metadata: Dict[str, Any],
        quality: Dict[str, Any]
    ) -> str:
        """
        Create markdown file content with YAML front matter.

        Args:
            text: Extracted document body.
            metadata: Document metadata; reads 'title', 'author', 'year',
                'language' and 'source_file'. Missing, None or empty values
                fall back to 'Untitled', 'Unknown', (omitted), 'en' and ''
                respectively.
            quality: Quality assessment; reads 'quality_grade'.

        Returns:
            The front matter block, a level-1 heading with the title, an
            optional bold author line, and then ``text``.
        """
        # Use `or` rather than a .get() default: the key is often present
        # but holds None or '', which a .get() default would not replace.
        title = metadata.get('title') or 'Untitled'
        author = metadata.get('author') or 'Unknown'

        def yaml_quote(value) -> str:
            """Render value as a double-quoted YAML scalar."""
            # Backslashes first, then quotes, so the escapes we add are not
            # themselves escaped again.
            escaped = str(value).replace('\\', '\\\\').replace('"', '\\"')
            return f'"{escaped}"'

        front_matter = [
            '---',
            f"title: {yaml_quote(title)}",
            f"author: {yaml_quote(author)}",
        ]

        if metadata.get('year'):
            front_matter.append(f"publication_year: {metadata['year']}")

        front_matter.extend([
            f"language: {metadata.get('language') or 'en'}",
            f"source_file: {yaml_quote(metadata.get('source_file') or '')}",
            f"quality_grade: {quality.get('quality_grade', 'unknown')}",
            f"word_count: {len(text.split())}",
            f"processed_date: \"{datetime.now().isoformat()}\"",
            '---',
            '',
            f"# {title}",
            '',
        ])

        # The visible author line is only written for a real author, never
        # for the 'Unknown' placeholder.
        if metadata.get('author'):
            front_matter.append(f"**Author:** {metadata['author']}")
            front_matter.append('')

        front_matter.append(text)

        return '\n'.join(front_matter)

    def process_file(self, file_path: Path) -> bool:
        """
        Process a single file through the ingestion pipeline.

        Steps, in order: validate extension and size, reject duplicates by
        content hash, extract text and metadata, assess text quality,
        optionally classify, then (outside dry-run) gate on quality with an
        optional OCR repair attempt for PDFs, write the markdown rendition,
        register the document in the database, sync the classification,
        copy the original into the organized library and finally move it to
        the completed folder.

        Args:
            file_path: Path of the source document to ingest.

        Returns:
            True if the file was ingested (or, in dry-run mode, would be
            processed); False if it was skipped, quarantined as low quality,
            or failed. Exceptions raised along the way are caught here: the
            file is moved to the failed folder (outside dry-run) and
            ``stats['failed']`` is incremented.
        """
        logger.info(f"Processing: {file_path.name}")

        try:
            # Check file extension
            ext = file_path.suffix.lower()
            if ext not in PROCESSING_CONFIG['supported_extensions']:
                logger.warning(f"Unsupported file type: {ext}")
                self.stats['skipped'] += 1
                return False

            # Check file size
            file_size = file_path.stat().st_size / (1024 * 1024)  # MB
            if file_size > PROCESSING_CONFIG['max_file_size_mb']:
                logger.warning(f"File too large: {file_size:.2f} MB")
                self.stats['skipped'] += 1
                return False

            # Calculate file hash
            file_hash = self.calculate_file_hash(file_path)

            # Check for duplicates
            existing = execute_query(
                "SELECT document_id FROM documents WHERE content_hash = %s",
                (file_hash,),
                fetch='one'
            )
            if existing:
                logger.info(f"Duplicate detected: {existing['document_id']}")
                self.stats['skipped'] += 1
                return False

            # Extract text based on file type
            extractors = {
                '.pdf': self.extract_text_from_pdf,
                '.docx': self.extract_text_from_docx,
                '.doc': self.extract_text_from_doc,
                '.txt': self.extract_text_from_txt,
                '.md': self.extract_text_from_markdown,
                '.rtf': self.extract_text_from_rtf,
                '.odt': self.extract_text_from_odt,
                '.html': self.extract_text_from_html,
                '.htm': self.extract_text_from_html,
                '.csv': self.extract_text_from_csv,
                '.json': self.extract_text_from_json,
                '.xml': self.extract_text_from_xml,
            }

            extractor = extractors.get(ext)
            if not extractor:
                logger.warning(f"No extractor for: {ext}")
                self.stats['skipped'] += 1
                return False

            text, file_metadata = extractor(file_path)

            # Extract metadata from filename as fallback
            filename_metadata = self.extract_metadata_from_filename(file_path.name)

            # Merge metadata (file metadata takes precedence)
            metadata = {**filename_metadata, **file_metadata}
            metadata['source_file'] = file_path.name

            # Ensure we have a title
            if not metadata.get('title'):
                metadata['title'] = file_path.stem.replace('_', ' ')

            # Assess quality
            quality = self.assess_text_quality(text)

            # Auto-classify document if enabled
            classification = None
            if self.enable_classification and self.taxonomist:
                try:
                    classification = self.taxonomist.classify_document(
                        text,
                        existing_metadata=metadata
                    )
                    # A confidence of None must not reach the ':.2f' format
                    # spec: the resulting TypeError would land in the except
                    # below and throw away an otherwise valid classification.
                    confidence = classification.get('confidence') or 0
                    logger.info(
                        f"Classified as: {classification.get('primary_category')} "
                        f"(confidence: {confidence:.2f})"
                    )
                except Exception as e:
                    logger.warning(f"Classification failed: {e}")
                    classification = None

            if self.dry_run:
                logger.info(f"[DRY RUN] Would process: {metadata['title']}")
                logger.info(f"  Quality: {quality['quality_grade']}")
                logger.info(f"  Words: {len(text.split())}")
                if classification:
                    logger.info(f"  Category: {classification.get('primary_category')}")
                    logger.info(f"  Topics: {classification.get('specific_topics', [])[:3]}")
                    logger.info(f"  Folder: {self.taxonomist.suggest_folder_path(classification, metadata.get('author'))}")
                return True

            # Generate document ID
            doc_id = self.generate_document_id(metadata['title'])

            # Handle low quality documents
            if quality['do_not_process']:
                # Try OCR fallback for PDFs if auto-repair is enabled
                if self.enable_auto_repair and ext == '.pdf':
                    logger.info(f"  Poor quality ({quality['quality_grade']}), attempting OCR repair...")
                    try:
                        ocr_text, ocr_metadata = self.extract_text_via_ocr(file_path)
                        ocr_quality = self.assess_text_quality(ocr_text)

                        # Check if OCR improved quality
                        if not ocr_quality['do_not_process']:
                            logger.info(f"  OCR improved quality: {quality['quality_grade']} -> {ocr_quality['quality_grade']}")
                            # NOTE(review): the classification above was
                            # computed from the pre-OCR text and is not
                            # refreshed here - consider re-classifying.
                            text = ocr_text
                            metadata.update(ocr_metadata)
                            quality = ocr_quality
                            self.stats['ocr_repaired'] += 1
                        else:
                            logger.warning(f"  OCR did not improve quality: {ocr_quality['quality_grade']}")
                            return self._quarantine_low_quality(file_path)
                    except Exception as e:
                        logger.error(f"  OCR repair failed: {e}")
                        return self._quarantine_low_quality(file_path)
                else:
                    logger.warning(f"Low quality document: {quality['quality_grade']}")
                    return self._quarantine_low_quality(file_path)

            # Create markdown output
            markdown_content = self.create_markdown_output(text, metadata, quality)

            # Save markdown file (name = sanitized title, capped at 80 chars)
            md_filename = re.sub(r'[^\w\s-]', '', metadata['title'])
            md_filename = re.sub(r'\s+', '_', md_filename.strip())[:80] + '.md'
            md_path = PATHS['markdown_library'] / md_filename

            with open(md_path, 'w', encoding='utf-8') as f:
                f.write(markdown_content)

            # Register in database
            doc_data = {
                'document_id': doc_id,
                'title': metadata['title'],
                'author_id': None,  # Would need author lookup/creation
                'publication_year': metadata.get('year'),
                'language_code': metadata.get('language', 'en'),
                'source_file': file_path.name,
                'file_path': str(md_path),
                'content_hash': file_hash,
                'word_count': len(text.split()),
                'page_count': file_metadata.get('page_count', 0),
                'processing_status': 'ingested',
                'pipeline_version': PROCESSING_CONFIG['pipeline_version']
            }

            # Add classification fields if available
            if classification:
                doc_data.update({
                    'primary_category': classification.get('primary_category'),
                    'content_type': classification.get('content_type'),
                    'difficulty_level': classification.get('difficulty_level'),
                    'metadata_source': classification.get('classification_source', 'unknown'),
                    'classification_confidence': classification.get('confidence'),
                    # Missing or None confidence counts as 0, i.e. needs review
                    'needs_review': (classification.get('confidence') or 0) < 0.6
                })

            insert_document(doc_data)

            # Sync classification to topics/concepts tables
            if classification and self.taxonomist:
                try:
                    sync_stats = self.taxonomist.sync_to_database(classification, doc_id)
                    logger.info(f"Synced {sync_stats['topics_linked']} topics, {sync_stats['concepts_linked']} concepts")
                    self.stats['classified'] += 1
                except Exception as e:
                    logger.warning(f"Failed to sync classification: {e}")

            # Move original to organized folder
            # Use logical organization (category/author) if enabled and classification available
            if self.logical_organization and classification and self.taxonomist:
                folder_path = self.taxonomist.suggest_folder_path(
                    classification,
                    author=metadata.get('author')
                )
                organized_dest = PATHS['organized'] / folder_path / file_path.name
            else:
                # Fall back to file-type organization
                organized_dest = PATHS['organized'] / ext[1:].upper() / file_path.name

            organized_dest.parent.mkdir(parents=True, exist_ok=True)
            shutil.copy2(str(file_path), str(organized_dest))

            # Move to completed
            completed_dest = PATHS['completed'] / file_path.name
            shutil.move(str(file_path), str(completed_dest))

            logger.info(f"Successfully ingested: {doc_id}")
            self.stats['processed'] += 1
            return True

        except Exception as e:
            logger.error(f"Error processing {file_path.name}: {e}")

            # Move to failed folder (best effort; never mask the original error)
            if not self.dry_run:
                failed_dest = PATHS['failed'] / file_path.name
                try:
                    shutil.move(str(file_path), str(failed_dest))
                except Exception as move_error:
                    # Still best-effort, but report why the file stayed put
                    # and let KeyboardInterrupt/SystemExit propagate.
                    logger.warning(
                        f"Could not move {file_path.name} to failed folder: {move_error}"
                    )

            self.stats['failed'] += 1
            return False

    def _quarantine_low_quality(self, file_path: Path) -> bool:
        """Move the file to the low-quality folder, count it, and return False."""
        dest = PATHS['low_quality'] / file_path.name
        shutil.move(str(file_path), str(dest))
        self.stats['low_quality'] += 1
        return False

    def process_incoming(self, source_folder: Optional[str] = None) -> Dict[str, int]:
        """
        Process all files from import folders.

        Import folders are checked in order:
        1. NEW_DOCS/           - Primary (root directory, easy access)
        2. library/NEW_DOCS/incoming/  - Legacy (still supported)

        Args:
            source_folder: Optional. Process only this folder: 'new_docs' /
                'primary', 'incoming' / 'legacy' (case-insensitive), or a
                custom directory path. A custom path that is not an existing
                directory is reported with a warning and yields no files.

        Returns:
            The processor's ``stats`` dictionary after all files have been
            passed to ``process_file``.
        """
        # Determine which folders to check
        folders_to_check = []

        if source_folder:
            source_key = source_folder.lower()
            if source_key in ('new_docs', 'primary'):
                folders_to_check = [('NEW_DOCS', PATHS['new_docs'])]
            elif source_key in ('incoming', 'legacy'):
                folders_to_check = [('incoming', PATHS['incoming'])]
            else:
                # Treat as custom path
                custom_path = Path(source_folder)
                if custom_path.is_dir():
                    folders_to_check = [(source_folder, custom_path)]
                else:
                    # Say so explicitly; otherwise the caller only sees the
                    # generic "No files to process" message below.
                    logger.warning(
                        f"Source folder not found or not a directory: {custom_path}"
                    )
        else:
            # Check both folders
            folders_to_check = [
                ('NEW_DOCS', PATHS['new_docs']),
                ('incoming', PATHS['incoming'])
            ]

        all_files = []

        for folder_name, folder_path in folders_to_check:
            if not folder_path.exists():
                logger.debug(f"{folder_name} folder not found: {folder_path}")
                continue

            # Get files (recursively for NEW_DOCS, flat for incoming)
            if folder_name == 'NEW_DOCS':
                # Recursively find all files in NEW_DOCS, ignoring dotfiles
                # and the README.txt that lives in the drop folder
                files = [f for f in folder_path.rglob('*')
                        if f.is_file() and not f.name.startswith('.') and f.name != 'README.txt']
            else:
                # Flat listing for legacy incoming folder (and custom paths)
                files = [f for f in folder_path.glob('*') if f.is_file()]

            if files:
                logger.info(f"Found {len(files)} files in {folder_name}/")
                all_files.extend(files)

        if not all_files:
            logger.info("No files to process in any import folder")
            logger.info("  Drop files in: NEW_DOCS/ (primary) or library/NEW_DOCS/incoming/ (legacy)")
            return self.stats

        logger.info(f"Total files to process: {len(all_files)}")

        for file_path in all_files:
            self.process_file(file_path)

        return self.stats


def main():
    """Command-line entry point: parse options, run the ingestion, print a report."""
    cli = argparse.ArgumentParser(
        description='Ingest documents into the Research Development Framework'
    )
    cli.add_argument(
        '--file',
        type=Path,
        help='Process a single file instead of import folders'
    )
    cli.add_argument(
        '--source',
        type=str,
        help='Process specific folder: "new_docs" (primary), "incoming" (legacy), or custom path'
    )

    # The remaining options are plain on/off switches, registered in order.
    switches = (
        ('--dry-run', 'Preview processing without making changes'),
        ('--no-classify', 'Disable auto-classification (skip Taxonomist)'),
        ('--no-logical-org', 'Disable logical folder organization (use file-type folders)'),
        ('--auto-repair', 'Auto-OCR PDFs with poor text quality (requires pytesseract, pdf2image)'),
    )
    for flag, help_text in switches:
        cli.add_argument(flag, action='store_true', help=help_text)

    opts = cli.parse_args()

    processor = DocumentProcessor(
        dry_run=opts.dry_run,
        enable_classification=not opts.no_classify,
        logical_organization=not opts.no_logical_org,
        enable_auto_repair=opts.auto_repair
    )

    heavy_rule = "=" * 60
    light_rule = "-" * 60

    def on_off(enabled):
        # Render a feature switch the way the banner expects it.
        return 'ENABLED' if enabled else 'DISABLED'

    # Configuration banner (reads the processor's effective settings)
    print("\n" + heavy_rule)
    print("DOCUMENT INGESTION PIPELINE v2.0")
    print(heavy_rule)
    print(f"Classification: {on_off(processor.enable_classification)}")
    print(f"Logical Org:    {on_off(processor.logical_organization)}")
    print(f"Auto-Repair:    {on_off(processor.enable_auto_repair)}")
    print(f"Dry Run:        {'YES' if opts.dry_run else 'NO'}")
    print(light_rule)
    print("IMPORT FOLDERS:")
    print("  Primary:  NEW_DOCS/                    (root directory)")
    print("  Legacy:   library/NEW_DOCS/incoming/   (still supported)")
    print(heavy_rule + "\n")

    # Single-file mode wins over folder scanning when --file is given.
    target = opts.file
    if not target:
        processor.process_incoming(source_folder=opts.source)
    elif target.exists():
        processor.process_file(target)
    else:
        logger.error(f"File not found: {target}")
        sys.exit(1)

    # Summary report: one line per counter, labels pre-padded for alignment
    print("\n" + heavy_rule)
    print("INGESTION SUMMARY")
    print(heavy_rule)
    summary_rows = (
        ("Processed:    ", 'processed'),
        ("Classified:   ", 'classified'),
        ("OCR Repaired: ", 'ocr_repaired'),
        ("Failed:       ", 'failed'),
        ("Skipped:      ", 'skipped'),
        ("Low Quality:  ", 'low_quality'),
    )
    for label, stat_key in summary_rows:
        print(f"{label}{processor.stats[stat_key]}")
    print(heavy_rule)


# Run the command-line interface only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    main()
