#!/usr/bin/env python3
"""
Academic PDF Parser

Specialized PDF text extraction for academic texts with footnote/endnote handling.
Standard PDF extraction often mangles footnotes by mixing them into the main text
or severing them from their reference markers.

This module:
1. Detects page footer boundaries
2. Identifies footnote markers (superscripts, bracketed numbers)
3. Links footnotes to their call locations in the main text
4. Optionally extracts footnotes as separate metadata
5. Preserves footnote context for chunking

Usage:
    from academic_pdf_parser import AcademicPDFParser

    parser = AcademicPDFParser()
    result = parser.extract(file_path)
    print(result.main_text)
    print(result.footnotes)

CLI:
    python academic_pdf_parser.py document.pdf --output-dir ./extracted
    python academic_pdf_parser.py document.pdf --inline  # Embed footnotes in text
    python academic_pdf_parser.py document.pdf --format json
"""

import argparse
import json
import logging
import re
import sys
from dataclasses import dataclass, field, asdict
from pathlib import Path
from typing import List, Dict, Any, Optional, Tuple

# Put this file's own directory at the front of the import path so the
# `config` import below can be resolved from there
# (presumably config.py sits next to this script — confirm).
sys.path.insert(0, str(Path(__file__).parent))

from config import LOGGING_CONFIG

# Configure logging from the project config. A missing 'level' key defaults
# to 'INFO', and a level name that is not an attribute of the logging module
# falls back to logging.INFO.
LOG_LEVEL = LOGGING_CONFIG.get('level', 'INFO')
logging.basicConfig(
    level=getattr(logging, LOG_LEVEL, logging.INFO),
    format='%(asctime)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)

# Both PDF backends are optional. Record which ones imported successfully;
# AcademicPDFParser reads these flags to choose a backend and raises
# ImportError when neither is available.
try:
    import pypdf
    HAS_PYPDF = True
except ImportError:
    HAS_PYPDF = False

try:
    import pdfplumber
    HAS_PDFPLUMBER = True
except ImportError:
    HAS_PDFPLUMBER = False


# =============================================================================
# DATA STRUCTURES
# =============================================================================

@dataclass
class Footnote:
    """A single footnote definition extracted from the document.

    Instances are created by the parser from the footer text of a page and
    carry the footnote's label, its text and the page it was found on.
    """
    number: str           # Footnote label as text: "1", "2", "a", etc.
    text: str             # Footnote content
    page: int             # Page where footnote appears (the parser numbers pages from 1)
    # Character positions of the footnote's call markers in the main text.
    # Defaults to an empty list. NOTE(review): no code visible in this file
    # populates it — confirm whether callers are expected to fill it in.
    call_locations: List[int] = field(default_factory=list)


@dataclass
class PageContent:
    """Content extracted from a single page, split into body and footer.

    The parser fills `main_text` with the cleaned body text and keeps the raw
    footer text alongside the footnotes parsed out of it.
    """
    page_number: int      # Page number (the parser numbers pages from 1)
    main_text: str        # Main body text
    footer_text: str      # Footer area (footnotes, page numbers)
    footnotes: List[Footnote] = field(default_factory=list)  # Footnotes parsed from footer_text
    has_footnotes: bool = False  # The parser sets this to True when `footnotes` is non-empty


@dataclass
class ExtractionResult:
    """Everything the parser produced for one PDF document."""
    # --- Source document ---
    file_path: str                    # Path of the PDF, as a string
    page_count: int                   # Number of pages reported for the document
    # --- Extracted content ---
    main_text: str                    # Cleaned body text of all pages
    footnotes: List[Footnote]         # Footnotes collected from every page
    footnote_text: str                # Footnotes rendered as one text section
    pages: List[PageContent]          # Per-page breakdown
    # --- Ancillary information ---
    metadata: Dict[str, Any] = field(default_factory=dict)
    extraction_notes: List[str] = field(default_factory=list)

    def to_dict(self) -> Dict[str, Any]:
        """Return a summary dictionary of this result.

        The summary carries the length of the main text and the number of
        footnotes rather than the main text itself, and omits `pages` and
        `footnote_text`. Each footnote is converted with
        `dataclasses.asdict`.
        """
        serialized_footnotes = list(map(asdict, self.footnotes))
        return dict(
            file_path=self.file_path,
            page_count=self.page_count,
            main_text_length=len(self.main_text),
            footnote_count=len(self.footnotes),
            metadata=self.metadata,
            extraction_notes=self.extraction_notes,
            footnotes=serialized_footnotes,
        )


# =============================================================================
# FOOTNOTE DETECTION PATTERNS
# =============================================================================

# Footnote call markers as they appear in the main text.
# Every pattern uses a lookbehind requiring a letter, '.', ',' or a quote
# character before the marker (the first three also allow whitespace in
# between) and captures the marker itself in group 1.
# NOTE(review): the last pattern looks redundant — the first one already
# matches a superscript run directly after the same characters — confirm
# before relying on it.
FOOTNOTE_CALL_PATTERNS = [
    r'(?<=[a-zA-Z\.\,\"\'])\s*([¹²³⁴⁵⁶⁷⁸⁹⁰]+)',  # Unicode superscripts
    r'(?<=[a-zA-Z\.\,\"\'])\s*\[(\d{1,3})\]',       # [1], [23]
    r'(?<=[a-zA-Z\.\,\"\'])\s*\((\d{1,3})\)',       # (1), (23)
    r'(?<=[a-zA-Z\.\,\"\'])([¹²³⁴⁵⁶⁷⁸⁹⁰]+)\s',     # Superscript after punctuation
]

# Footnote definitions in footer text.
# Group 1 is the footnote label and group 2 its text, which extends lazily up
# to the next line that starts a definition of the same style, or to the end
# of the input. The '^' anchors and the multi-line '.' only behave that way
# when the caller passes re.MULTILINE | re.DOTALL, as _parse_footnotes does.
FOOTNOTE_DEF_PATTERNS = [
    r'^(\d{1,3})[\.\)\:]\s*(.+?)(?=^\d{1,3}[\.\)\:]|\Z)',  # "1. footnote text"
    r'^([¹²³⁴⁵⁶⁷⁸⁹⁰]+)\s*(.+?)(?=^[¹²³⁴⁵⁶⁷⁸⁹⁰]+|\Z)',  # "¹ footnote text"
    r'^\[(\d{1,3})\]\s*(.+?)(?=^\[\d{1,3}\]|\Z)',          # "[1] footnote text"
]

# Common footer boundary indicators.
# Each pattern begins with '\n', so it describes the start of a line;
# _process_pypdf_page prefixes every candidate line with '\n' before
# searching.
FOOTER_INDICATORS = [
    r'\n_{3,}',           # Horizontal rule (underscores)
    r'\n-{3,}',           # Horizontal rule (dashes)
    r'\n\d{1,3}\s*$',     # Just a page number at the end
    r'\n[¹²³⁴⁵⁶⁷⁸⁹⁰]\s',  # Starts with superscript number
    r'\n1[\.\)\:]\s',     # Starts with "1." "1)" or "1:"
]

# Lookup table from each Unicode superscript digit to its ASCII digit.
SUPERSCRIPT_MAP = dict(zip('¹²³⁴⁵⁶⁷⁸⁹⁰', '1234567890'))


def superscript_to_number(s: str) -> str:
    """Return *s* with Unicode superscript digits replaced by ASCII digits.

    Characters without an entry in SUPERSCRIPT_MAP are passed through
    unchanged, so ordinary digits and letters survive as they are.
    """
    lookup = SUPERSCRIPT_MAP.get
    return ''.join([lookup(char, char) for char in s])


# =============================================================================
# ACADEMIC PDF PARSER
# =============================================================================

class AcademicPDFParser:
    """
    Specialized PDF parser for academic texts with footnote handling.

    Key features:
    - Detects page footer boundaries
    - Extracts footnotes separately
    - Links footnote markers to their definitions
    - Produces clean main text without footer clutter
    """

    def __init__(
        self,
        footer_detection_ratio: float = 0.15,
        min_footnote_chars: int = 20,
        use_pdfplumber: bool = True
    ):
        """
        Initialize parser.

        Args:
            footer_detection_ratio: Bottom portion of page to check for footer (0.0-0.5)
            min_footnote_chars: Minimum characters for valid footnote text
            use_pdfplumber: Use pdfplumber if available (better layout detection)

        Raises:
            ImportError: If neither pypdf nor pdfplumber could be imported.
        """
        # Fail fast when there is no backend at all.
        if not HAS_PYPDF and not HAS_PDFPLUMBER:
            raise ImportError("No PDF library available. Install: pip install pypdf pdfplumber")

        self.footer_ratio = footer_detection_ratio
        self.min_footnote_chars = min_footnote_chars

        if use_pdfplumber and HAS_PDFPLUMBER:
            self.use_pdfplumber = True
        elif HAS_PYPDF:
            # Either pypdf was requested or pdfplumber is not installed.
            self.use_pdfplumber = False
        else:
            # pypdf was requested but only pdfplumber is installed. Selecting
            # the pypdf path here would fail later with a NameError, so use
            # the backend that actually exists.
            logger.warning("pypdf is not installed; falling back to pdfplumber")
            self.use_pdfplumber = True

    def extract(self, file_path: Path) -> ExtractionResult:
        """
        Extract text from PDF with footnote handling.

        The configured backend (pdfplumber or pypdf) produces one PageContent
        per successfully processed page. The main texts of those pages are
        joined with blank lines, and their footnotes are gathered into one
        list and rendered as a footnote section.

        Args:
            file_path: Path to PDF file

        Returns:
            ExtractionResult with main text, footnotes, and metadata
        """
        file_path = Path(file_path)
        logger.info(f"Extracting academic PDF: {file_path.name}")

        if self.use_pdfplumber:
            pages, metadata, extraction_notes = self._extract_with_pdfplumber(file_path)
        else:
            pages, metadata, extraction_notes = self._extract_with_pypdf(file_path)

        # Pages whose main text is empty are skipped so they do not add
        # stray blank lines to the joined text.
        main_text = '\n\n'.join(page.main_text for page in pages if page.main_text)
        all_footnotes = [fn for page in pages for fn in page.footnotes]

        footnote_text = self._format_footnotes(all_footnotes)

        return ExtractionResult(
            file_path=str(file_path),
            # Pages that fail to process are dropped from `pages`, so
            # len(pages) can under-report. Prefer the document page count
            # recorded by the backend.
            page_count=metadata.get('page_count', len(pages)),
            main_text=main_text,
            footnotes=all_footnotes,
            footnote_text=footnote_text,
            pages=pages,
            metadata=metadata,
            extraction_notes=extraction_notes
        )

    def _extract_with_pdfplumber(
        self,
        file_path: Path
    ) -> Tuple[List[PageContent], Dict[str, Any], List[str]]:
        """Read the PDF with pdfplumber and process it page by page.

        Returns a tuple of (processed pages, document metadata, notes).
        The metadata always holds 'page_count'; 'title' and 'author' are
        added when the PDF carries metadata. A page that raises while being
        processed is logged, recorded in the notes and left out of the
        page list.
        """
        processed = []
        problems = []
        doc_info = {}

        with pdfplumber.open(file_path) as document:
            doc_info['page_count'] = len(document.pages)

            pdf_meta = document.metadata
            if pdf_meta:
                for target_key, source_key in (('title', 'Title'), ('author', 'Author')):
                    doc_info[target_key] = pdf_meta.get(source_key, '')

            for page_num, pdf_page in enumerate(document.pages, start=1):
                try:
                    processed.append(self._process_pdfplumber_page(pdf_page, page_num))
                except Exception as e:
                    # Best effort: one bad page must not abort the document.
                    logger.warning(f"Error processing page {page_num}: {e}")
                    problems.append(f"Page {page_num}: extraction error - {str(e)[:50]}")

        return processed, doc_info, problems

    def _process_pdfplumber_page(self, page, page_num: int) -> PageContent:
        """Split one pdfplumber page into body and footer and parse footnotes.

        Every word whose 'top' coordinate lies beyond the footer cut-off
        (page height scaled by 1 - footer_ratio) is assigned to the footer;
        all other words form the body. Footnotes are parsed from the footer
        text and the body text is cleaned before being returned.
        """
        cutoff = page.height * (1 - self.footer_ratio)

        body_words, foot_words = [], []
        for entry in page.extract_words(keep_blank_chars=True):
            # A word without a 'top' value is treated as being at position 0.
            bucket = foot_words if entry.get('top', 0) > cutoff else body_words
            bucket.append(entry)

        body_text = self._words_to_text(body_words)
        footer_text = self._words_to_text(foot_words)
        page_footnotes = self._parse_footnotes(footer_text, page_num)

        return PageContent(
            page_number=page_num,
            main_text=self._clean_main_text(body_text),
            footer_text=footer_text,
            footnotes=page_footnotes,
            has_footnotes=bool(page_footnotes)
        )

    def _words_to_text(self, words: List[Dict]) -> str:
        """Convert word list back to text, preserving layout roughly."""
        if not words:
            return ""

        # Sort by position (top, then left)
        sorted_words = sorted(words, key=lambda w: (w.get('top', 0), w.get('x0', 0)))

        lines = []
        current_line = []
        last_top = None

        for word in sorted_words:
            top = word.get('top', 0)
            text = word.get('text', '')

            # New line if vertical position changed significantly
            if last_top is not None and abs(top - last_top) > 5:
                lines.append(' '.join(current_line))
                current_line = []

            current_line.append(text)
            last_top = top

        if current_line:
            lines.append(' '.join(current_line))

        return '\n'.join(lines)

    def _extract_with_pypdf(
        self,
        file_path: Path
    ) -> Tuple[List[PageContent], Dict[str, Any], List[str]]:
        """Read the PDF with pypdf (fallback backend) and process each page.

        Returns a tuple of (processed pages, document metadata, notes).
        'title' and 'author' are recorded when the reader exposes metadata,
        and 'page_count' is always recorded. A page that raises is logged,
        noted and skipped. The notes always end with a hint that pypdf was
        used.
        """
        processed = []
        problems = []
        doc_info = {}

        with open(file_path, 'rb') as handle:
            reader = pypdf.PdfReader(handle)

            info = reader.metadata
            if info:
                doc_info['title'] = info.get('/Title', '')
                doc_info['author'] = info.get('/Author', '')

            doc_info['page_count'] = len(reader.pages)

            for page_num, pdf_page in enumerate(reader.pages, start=1):
                try:
                    # extract_text() may yield nothing; treat that as "".
                    raw_text = pdf_page.extract_text() or ""
                    processed.append(self._process_pypdf_page(raw_text, page_num))
                except Exception as e:
                    # Best effort: one bad page must not abort the document.
                    logger.warning(f"Error processing page {page_num}: {e}")
                    problems.append(f"Page {page_num}: extraction error")

        problems.append("Used pypdf (limited layout detection). Consider installing pdfplumber.")
        return processed, doc_info, problems

    def _process_pypdf_page(self, text: str, page_num: int) -> PageContent:
        """Process a single pypdf page (text-only, no layout info).

        The footer is assumed to start within the last `footer_ratio`
        fraction of the page's lines. Inside that region the first line
        matching any FOOTER_INDICATORS pattern becomes the footer start;
        when no line matches, the region's first line is used.

        Args:
            text: Plain text of the page.
            page_num: Page number stored on the result and its footnotes.

        Returns:
            PageContent with the cleaned body text, the raw footer text and
            the footnotes parsed from the footer.
        """
        # str.split always returns at least one element, so test the text
        # itself to detect an empty page.
        if not text:
            return PageContent(page_number=page_num, main_text="", footer_text="")

        lines = text.split('\n')
        footer_start_line = int(len(lines) * (1 - self.footer_ratio))

        # Stop at the FIRST indicator line. Continuing the scan would move
        # the boundary down to the last match (typically a trailing page
        # number) and leave the footnotes above it in the main text.
        for index in range(footer_start_line, len(lines)):
            candidate = '\n' + lines[index]  # the patterns expect a leading newline
            if any(re.search(pattern, candidate) for pattern in FOOTER_INDICATORS):
                footer_start_line = index
                break

        main_text = '\n'.join(lines[:footer_start_line])
        footer_text = '\n'.join(lines[footer_start_line:])

        footnotes = self._parse_footnotes(footer_text, page_num)
        main_text = self._clean_main_text(main_text)

        return PageContent(
            page_number=page_num,
            main_text=main_text,
            footer_text=footer_text,
            footnotes=footnotes,
            has_footnotes=len(footnotes) > 0
        )

    def _parse_footnotes(self, footer_text: str, page_num: int) -> List[Footnote]:
        """Turn footer text into a list of Footnote objects.

        The definition patterns are tried in order and only the first one
        that matches anything is used. Labels are normalised from
        superscripts to ASCII digits, texts are stripped, and entries whose
        stripped text is shorter than `min_footnote_chars` are discarded.
        A footer shorter than that threshold yields no footnotes at all.
        """
        threshold = self.min_footnote_chars
        if not footer_text or len(footer_text.strip()) < threshold:
            return []

        regex_flags = re.MULTILINE | re.DOTALL
        for definition_pattern in FOOTNOTE_DEF_PATTERNS:
            found = re.findall(definition_pattern, footer_text, regex_flags)
            if not found:
                continue
            # First pattern with matches wins, even if every match is
            # then filtered out for being too short.
            return [
                Footnote(
                    number=superscript_to_number(str(label)),
                    text=body.strip(),
                    page=page_num
                )
                for label, body in found
                if len(body.strip()) >= threshold
            ]

        return []

    def _clean_main_text(self, text: str) -> str:
        """Clean main text, preserving footnote call markers."""
        if not text:
            return ""

        # Remove page numbers at start/end of lines
        text = re.sub(r'^\s*\d{1,4}\s*$', '', text, flags=re.MULTILINE)

        # Remove excessive whitespace
        text = re.sub(r'\n{3,}', '\n\n', text)
        text = re.sub(r' {2,}', ' ', text)

        return text.strip()

    def _format_footnotes(self, footnotes: List[Footnote]) -> str:
        """Format all footnotes as a markdown section."""
        if not footnotes:
            return ""

        lines = ["", "---", "## Footnotes", ""]

        for fn in sorted(footnotes, key=lambda f: (f.page, int(f.number) if f.number.isdigit() else 0)):
            lines.append(f"[^{fn.number}]: {fn.text} (p.{fn.page})")
            lines.append("")

        return '\n'.join(lines)

    def extract_with_inline_footnotes(self, file_path: Path) -> str:
        """
        Extract text with footnotes embedded inline.

        Format: Main text with footnotes inserted as [^N: footnote text],
        where the footnote text is cut to 100 characters and followed by
        "..." when it was longer. A marker with no matching footnote is left
        as it is.

        All call patterns are applied in one pass over the original text.
        Applying them one after another would rescan footnote text that was
        just inserted, expanding any "(2)" or "[3]" it happens to contain.

        Footnotes are looked up by label only, so when the same label occurs
        on several pages the last footnote with that label is used.

        Args:
            file_path: Path to PDF file

        Returns:
            The main text with footnote calls expanded in place.
        """
        result = self.extract(file_path)
        text = result.main_text

        # Build footnote lookup
        fn_lookup = {fn.number: fn.text for fn in result.footnotes}
        if not fn_lookup:
            return text

        # One alternation over every call pattern; each alternative keeps
        # its own capture group for the marker.
        combined = re.compile('|'.join(f'(?:{p})' for p in FOOTNOTE_CALL_PATTERNS))

        def replace_call(match):
            # Exactly one alternative matched; its group holds the marker.
            raw = next((g for g in match.groups() if g is not None), None)
            if raw is None:
                return match.group(0)
            num = superscript_to_number(raw)
            fn_text = fn_lookup.get(num, "")
            if fn_text:
                return f" [^{num}: {fn_text[:100]}{'...' if len(fn_text) > 100 else ''}] "
            return match.group(0)

        return combined.sub(replace_call, text)


# =============================================================================
# CLI INTERFACE
# =============================================================================

def create_parser() -> argparse.ArgumentParser:
    """Build the argparse parser for the command-line interface.

    The parser takes one positional PDF path plus options for the output
    directory, inline footnotes, output format, footer ratio and backend
    selection. Usage examples are shown verbatim in the help epilog.
    """
    usage_examples = """
Examples:
  # Basic extraction
  python academic_pdf_parser.py document.pdf

  # Save to files
  python academic_pdf_parser.py document.pdf --output-dir ./extracted

  # Inline footnotes
  python academic_pdf_parser.py document.pdf --inline

  # JSON metadata output
  python academic_pdf_parser.py document.pdf --format json
        """

    cli = argparse.ArgumentParser(
        prog='academic_pdf_parser',
        description='Extract text from academic PDFs with proper footnote handling',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=usage_examples,
    )

    # (names, keyword options) for every argument, in registration order.
    argument_specs = (
        (('pdf_file',), dict(
            type=str,
            help='Path to PDF file to extract',
        )),
        (('--output-dir', '-o'), dict(
            type=str,
            help='Directory to save extracted text files',
        )),
        (('--inline',), dict(
            action='store_true',
            help='Embed footnotes inline in the text',
        )),
        (('--format', '-f'), dict(
            choices=['text', 'json', 'markdown'],
            default='text',
            help='Output format (default: text)',
        )),
        (('--footer-ratio',), dict(
            type=float,
            default=0.15,
            help='Portion of page to check for footer (0.0-0.5, default: 0.15)',
        )),
        (('--no-pdfplumber',), dict(
            action='store_true',
            help='Use pypdf instead of pdfplumber',
        )),
    )
    for names, options in argument_specs:
        cli.add_argument(*names, **options)

    return cli


def main():
    """Run the command-line interface.

    Parses the arguments, extracts the PDF and prints the result in the
    requested form. With --output-dir the extracted data is also written to
    files named after the PDF's stem: "<stem>_inline.txt" in --inline mode,
    otherwise "<stem>_main.txt", "<stem>_metadata.json" and, when footnotes
    were found, "<stem>_footnotes.json".

    Returns:
        Exit status: 0 on success, 1 when the input file does not exist or
        no PDF library can be imported.
    """
    parser = create_parser()
    args = parser.parse_args()

    pdf_path = Path(args.pdf_file)
    if not pdf_path.exists():
        # Diagnostics go to stderr so stdout carries only extracted content.
        print(f"Error: File not found: {pdf_path}", file=sys.stderr)
        return 1

    # Create parser; a missing backend is reported instead of a traceback.
    try:
        academic_parser = AcademicPDFParser(
            footer_detection_ratio=args.footer_ratio,
            use_pdfplumber=not args.no_pdfplumber
        )
    except ImportError as exc:
        print(f"Error: {exc}", file=sys.stderr)
        return 1

    output_dir = Path(args.output_dir) if args.output_dir else None
    stem = pdf_path.stem

    # Extract
    if args.inline:
        text = academic_parser.extract_with_inline_footnotes(pdf_path)
        print(text)

        # Honour --output-dir in inline mode as well rather than silently
        # ignoring it.
        if output_dir is not None:
            output_dir.mkdir(parents=True, exist_ok=True)
            with open(output_dir / f"{stem}_inline.txt", 'w', encoding='utf-8') as f:
                f.write(text)
            print(f"\nSaved to: {output_dir}/")

        return 0

    result = academic_parser.extract(pdf_path)

    if args.format == 'json':
        print(json.dumps(result.to_dict(), indent=2))
    elif args.format == 'markdown':
        print(f"# {pdf_path.stem}")
        print(f"\n**Pages:** {result.page_count}")
        print(f"**Footnotes:** {len(result.footnotes)}")
        print("\n---\n")
        print(result.main_text)
        if result.footnotes:
            print(result.footnote_text)
    else:
        print(result.main_text)
        if result.footnotes:
            print("\n\n" + "="*60)
            print(f"FOOTNOTES ({len(result.footnotes)} found):")
            print("="*60)
            for fn in result.footnotes:
                print(f"\n[{fn.number}] (p.{fn.page}): {fn.text}")

    # Save to files if output dir specified
    if output_dir is not None:
        output_dir.mkdir(parents=True, exist_ok=True)

        # Save main text
        with open(output_dir / f"{stem}_main.txt", 'w', encoding='utf-8') as f:
            f.write(result.main_text)

        # Save footnotes
        if result.footnotes:
            with open(output_dir / f"{stem}_footnotes.json", 'w', encoding='utf-8') as f:
                json.dump([asdict(fn) for fn in result.footnotes], f, indent=2)

        # Save metadata
        with open(output_dir / f"{stem}_metadata.json", 'w', encoding='utf-8') as f:
            json.dump(result.to_dict(), f, indent=2)

        print(f"\nSaved to: {output_dir}/")

    return 0


if __name__ == '__main__':
    sys.exit(main())
