#!/usr/bin/env python3
"""
Quote Extraction Tool

Extract and format quotes from the research library with proper citations.
Designed for Claude Code to quickly build quote banks for chapters.

Usage:
    # Basic search for quotes
    python extract_quotes.py "etheric body definition"

    # With citation style
    python extract_quotes.py "Egyptian mysteries" --style chicago --max 10

    # For a specific project
    python extract_quotes.py "alchemy" --project BOOK_xxx

    # Output as JSON for Claude Code
    python extract_quotes.py "reincarnation" --format json

    # Filter by document
    python extract_quotes.py "karma" --document DOC_023

Output:
    Formatted quotes with proper citations, ready for manuscript insertion.
"""

import argparse
import json
import logging
import re
import sys
from pathlib import Path
from datetime import datetime
from typing import List, Dict, Any, Optional, Tuple
from dataclasses import dataclass, asdict

# Setup path for imports
sys.path.insert(0, str(Path(__file__).parent))

from config import LOGGING_CONFIG, BASE_DIR, BOOK_WORKFLOW_CONFIG
from db_utils import (
    execute_query,
    hybrid_search_with_rerank,
    generate_query_embedding,
)

# Setup logging
# The configured level may be a name ('INFO', 'debug', ...) or a numeric level.
# Names are upper-cased before lookup, and anything that does not resolve to an
# int falls back to INFO. Without this, a lowercase name such as 'info' would
# resolve to the *function* logging.info and make basicConfig() raise.
LOG_LEVEL = LOGGING_CONFIG.get('level', 'INFO')
if isinstance(LOG_LEVEL, int):
    _resolved_level = LOG_LEVEL
else:
    _resolved_level = getattr(logging, str(LOG_LEVEL).upper(), None)
    if not isinstance(_resolved_level, int):
        _resolved_level = logging.INFO
logging.basicConfig(
    level=_resolved_level,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)


# =============================================================================
# DATA STRUCTURES
# =============================================================================

@dataclass
class Quote:
    """A single quotable passage plus the metadata needed to cite it.

    Instances are created by ``search_for_quotes`` and converted to plain
    dicts with ``dataclasses.asdict`` by ``format_quotes_json``.
    """
    # Quote text as produced by extract_quotable_text (may end in '...').
    text: str
    # Identifier of the source document, taken from the search result.
    document_id: str
    # Document title from metadata ('Unknown Title' when metadata is missing).
    title: str
    # Author name from metadata ('Unknown' when metadata is missing).
    author: str
    # Publication year from metadata, if any.
    year: Optional[int]
    # Identifier of the chunk the quote was taken from.
    chunk_id: str
    # Position of that chunk within its document.
    chunk_sequence: int
    # Search score: rerank_score, else rrf_score, else relevance, else 0.
    relevance_score: float
    citation_short: str  # Output of format_citation_short
    citation_full: str   # Output of format_citation_full (bibliography entry)
    page_hint: Optional[str] = None  # Estimated page from chunk sequence, e.g. "~p.3"


@dataclass
class QuoteBank:
    """Collection of quotes gathered for one search query."""
    # The search query the quotes were extracted for.
    query: str
    # Number of search results (chunks) returned, not the number of quotes.
    total_found: int
    # Extracted quotes, in the order the search results were processed.
    quotes: List[Quote]
    # Creation timestamp as produced by datetime.now().isoformat().
    generated_at: str
    # Book project ID; main() fills this in when --project is given.
    project_id: Optional[str] = None


# =============================================================================
# CITATION FORMATTING
# =============================================================================

def format_author_short(author: str) -> str:
    """Return the author's last name for use in short citations.

    Handles both "First Last" and "Last, First" name orders, and ignores a
    trailing generational/academic suffix after a comma ("King, Jr.").

    Args:
        author: Author name; may be empty, whitespace-only or None.

    Returns:
        The last name, or 'Unknown' when no usable name is available.
    """
    if not author:
        return 'Unknown'

    cleaned = author.strip()
    if not cleaned or cleaned == 'Unknown':
        return 'Unknown'

    if ',' in cleaned:
        head, _, tail = cleaned.partition(',')
        head = head.strip()
        tail_token = tail.strip().rstrip('.').lower()
        suffixes = ('jr', 'sr', 'ii', 'iii', 'iv', 'phd', 'md')
        if head and tail_token not in suffixes:
            # "Last, First": everything before the comma is the surname
            # (kept whole so multi-word surnames like "van Gogh" survive).
            return head
        # "First Last, Jr." (or a stray leading comma): fall through and
        # take the last word of whichever side actually holds the name.
        cleaned = head or tail.strip()

    parts = cleaned.split()
    return parts[-1] if parts else 'Unknown'


def format_citation_short(title: str, author: str, year: Optional[int], page: Optional[str] = None) -> str:
    """Build a compact, comma-separated citation.

    The result is ``<last name>, *<title>*`` followed by ``(<year>)`` and the
    page hint when those are provided. Titles longer than 40 characters are
    cut to 37 characters plus '...'.
    """
    surname = format_author_short(author)

    # Keep long titles from dominating the citation line
    display_title = title if len(title) <= 40 else title[:37] + '...'

    segments = [surname, f"*{display_title}*"]
    if year:
        segments.append(f"({year})")
    if page:
        segments.append(page)

    return ', '.join(segments)


def format_citation_full(title: str, author: str, year: Optional[int],
                         publisher: Optional[str] = None, style: str = 'chicago') -> str:
    """Format a full bibliography citation.

    Args:
        title: Work title (rendered in markdown italics for 'chicago').
        author: Author name in "First Last" or "Last, First" order; empty or
            'Unknown' authors are omitted from 'chicago' citations.
        year: Publication year, if known.
        publisher: Publisher name, if known (used by 'chicago' only).
        style: 'chicago' or 'simple'. Any other value produces the simple
            layout with 'n.d.' standing in for a missing year.

    Returns:
        The formatted citation string.
    """
    if style == 'chicago':
        parts = []

        # Author (Last, First)
        name = author.strip() if author else ''
        if name and name != 'Unknown':
            if ',' in name:
                # Already in "Last, First" order; inverting again would
                # scramble it.
                inverted = name
            else:
                name_parts = name.split()
                if len(name_parts) >= 2:
                    inverted = f"{name_parts[-1]}, {' '.join(name_parts[:-1])}"
                else:
                    inverted = name
            # Strip a trailing period first so initials ("R.") do not end up
            # as "R..".
            parts.append(inverted.rstrip('.') + '.')

        # Title (italicized)
        if title:
            parts.append(f"*{title}*.")

        # Publisher and year; either may be missing independently
        if publisher and year:
            parts.append(f"{publisher}, {year}.")
        elif publisher:
            parts.append(publisher.rstrip('.') + '.')
        elif year:
            parts.append(f"{year}.")

        return ' '.join(parts)

    if style == 'simple':
        if year:
            return f"{author}. {title}. {year}."
        return f"{author}. {title}."

    # Unrecognized style: simple layout, marking a missing year as 'n.d.'
    # ('n.d.' already ends in a period, so none is appended).
    if year:
        return f"{author}. {title}. {year}."
    return f"{author}. {title}. n.d."


def estimate_page_from_chunk(chunk_sequence: int, avg_chars_per_page: int = 2000,
                             chars_per_chunk: int = 500) -> str:
    """Estimate a page number from a chunk's position (rough approximation).

    Args:
        chunk_sequence: Index of the chunk within its document; None is
            treated as 0.
        avg_chars_per_page: Assumed number of characters on one page.
        chars_per_chunk: Assumed number of characters in one chunk.

    Returns:
        A hint of the form "~p.N", with N never below 1.
    """
    sequence = chunk_sequence or 0
    estimated_page = max(1, (sequence * chars_per_chunk) // avg_chars_per_page + 1)
    return f"~p.{estimated_page}"


# =============================================================================
# QUOTE EXTRACTION
# =============================================================================

def _truncate_at_word(text: str, max_length: int) -> str:
    """Cut text at a word boundary so that text plus '...' fits in max_length."""
    budget = max(0, max_length - 3)  # reserve room for the ellipsis
    return text[:budget].rsplit(' ', 1)[0] + '...'


def extract_quotable_text(chunk_text: str, query: str, min_length: int = 50, max_length: int = 500) -> List[str]:
    """
    Extract quotable sentences from a chunk that are relevant to the query.

    A sentence qualifies when it is at least ``min_length`` characters long
    and contains at least one query term (case-insensitive substring match).
    Sentences longer than ``max_length`` are cut at a word boundary and end in
    '...'; the result never exceeds ``max_length`` characters. If no sentence
    qualifies, the beginning of the chunk is used as a single fallback quote.

    Args:
        chunk_text: Text of the chunk; None or empty yields no quotes.
        query: Search query; split on whitespace into terms, with punctuation
            stripped from the ends of each term.
        min_length: Minimum characters for a quote.
        max_length: Maximum characters for a quote.

    Returns:
        List of quote texts in chunk order (possibly empty).
    """
    if not chunk_text:
        return []

    # Strip punctuation around each term so "karma," still matches "karma";
    # terms that were nothing but punctuation are dropped.
    query_terms = set()
    for raw_term in (query or '').lower().split():
        term = re.sub(r'^\W+|\W+$', '', raw_term)
        if term:
            query_terms.add(term)

    quotes = []

    # Split into sentences (simple sentence boundary detection)
    for sent in re.split(r'(?<=[.!?])\s+', chunk_text):
        sent = sent.strip()
        if len(sent) < min_length:
            continue
        if len(sent) > max_length:
            sent = _truncate_at_word(sent, max_length)

        # Terms are matched against the (possibly truncated) text that would
        # actually be quoted.
        sent_lower = sent.lower()
        if any(term in sent_lower for term in query_terms):
            quotes.append(sent)

    # If no sentence matched, fall back to the beginning of the chunk
    if not quotes:
        fallback = chunk_text.strip()
        if len(fallback) >= min_length:
            if len(fallback) > max_length:
                fallback = _truncate_at_word(fallback, max_length)
            quotes.append(fallback)

    return quotes


def _result_score(result: Dict[str, Any]) -> float:
    """Return the first non-null of rerank_score, rrf_score, relevance (else 0.0)."""
    for key in ('rerank_score', 'rrf_score', 'relevance'):
        value = result.get(key)
        if value is not None:
            return float(value)
    return 0.0


def search_for_quotes(
    query: str,
    max_quotes: int = 10,
    style: str = 'chicago',
    document_id: Optional[str] = None,
    min_quote_length: int = 50,
    max_quote_length: int = 500,
    use_rerank: bool = True
) -> QuoteBank:
    """
    Search the library for quotable passages on a topic.

    Runs a hybrid search for three times ``max_quotes`` chunks, looks up the
    metadata of the documents involved, extracts quotes from each chunk in
    result order, and stops once ``max_quotes`` distinct quotes are collected.

    Args:
        query: Topic to search for quotes about
        max_quotes: Maximum number of quotes to return
        style: Citation style (chicago, simple)
        document_id: Optional filter to specific document
        min_quote_length: Minimum characters for a quote
        max_quote_length: Maximum characters for a quote
        use_rerank: Whether to use cross-encoder reranking

    Returns:
        QuoteBank with extracted quotes and citations. ``total_found`` is the
        number of search results, not the number of quotes.
    """
    logger.info(f"Searching for quotes: '{query}'")

    # Search with more results to ensure enough after filtering
    search_limit = max_quotes * 3

    # Use hybrid search with reranking
    results = hybrid_search_with_rerank(
        query_text=query,
        limit=search_limit,
        document_id=document_id,
        use_rerank=use_rerank
    )

    if not results:
        logger.info("No search results found")
        return QuoteBank(
            query=query,
            total_found=0,
            quotes=[],
            generated_at=datetime.now().isoformat()
        )

    logger.info(f"Found {len(results)} search results")

    # Get document metadata for all results
    doc_ids = list({r['document_id'] for r in results})
    doc_metadata = _get_documents_metadata(doc_ids)

    # Extract quotes from results
    quotes = []
    seen_texts = set()  # Avoid duplicate quotes

    for result in results:
        if len(quotes) >= max_quotes:
            break

        doc_id = result['document_id']
        # `or` rather than a .get() default: a key that is present but NULL
        # would otherwise leak None into the formatting helpers.
        chunk_text = result.get('chunk_text') or ''
        chunk_id = result.get('chunk_id') or ''
        chunk_seq = result.get('chunk_sequence') or 0
        score = _result_score(result)

        # Get document info (same NULL handling: the LEFT JOIN on authors can
        # yield a NULL author, and title may be NULL too)
        doc_info = doc_metadata.get(doc_id, {})
        title = doc_info.get('title') or 'Unknown Title'
        author = doc_info.get('author') or 'Unknown'
        year = doc_info.get('publication_year')
        publisher = doc_info.get('publisher')

        # Extract quotable text from chunk
        quote_texts = extract_quotable_text(
            chunk_text, query,
            min_length=min_quote_length,
            max_length=max_quote_length
        )
        if not quote_texts:
            continue

        # Page hint and citations depend only on the chunk and its document,
        # so compute them once per chunk rather than once per quote.
        page_hint = estimate_page_from_chunk(chunk_seq)
        citation_short = format_citation_short(title, author, year, page_hint)
        citation_full = format_citation_full(title, author, year, publisher, style)

        for qtext in quote_texts:
            # Skip exact repeats (compared case- and whitespace-insensitively)
            text_normalized = ' '.join(qtext.lower().split())
            if text_normalized in seen_texts:
                continue
            seen_texts.add(text_normalized)

            quotes.append(Quote(
                text=qtext,
                document_id=doc_id,
                title=title,
                author=author,
                year=year,
                chunk_id=chunk_id,
                chunk_sequence=chunk_seq,
                relevance_score=score,
                citation_short=citation_short,
                citation_full=citation_full,
                page_hint=page_hint
            ))

            if len(quotes) >= max_quotes:
                break

    logger.info(f"Extracted {len(quotes)} quotes")

    return QuoteBank(
        query=query,
        total_found=len(results),
        quotes=quotes,
        generated_at=datetime.now().isoformat()
    )


def _get_documents_metadata(document_ids: List[str]) -> Dict[str, Dict]:
    """Look up title, author, year and publisher for the given document IDs.

    Returns a dict keyed by document_id. An empty ID list, or any exception
    raised while querying or reading rows, yields an empty dict (the
    exception is logged).
    """
    if not document_ids:
        return {}

    # One %s placeholder per ID; the values themselves are passed separately
    in_clause = ', '.join('%s' for _ in document_ids)
    sql = f"""
        SELECT
            d.document_id,
            d.title,
            a.name as author,
            d.publication_year,
            d.publisher
        FROM documents d
        LEFT JOIN authors a ON d.author_id = a.author_id
        WHERE d.document_id IN ({in_clause})
    """

    by_id = {}
    try:
        for row in execute_query(sql, tuple(document_ids), fetch='all'):
            by_id[row['document_id']] = dict(row)
    except Exception as e:
        logger.error(f"Database error: {e}")
        return {}
    return by_id


# =============================================================================
# OUTPUT FORMATTERS
# =============================================================================

def format_quotes_markdown(quote_bank: QuoteBank) -> str:
    """Render a quote bank as a markdown document.

    Produces a header with the query, counts and timestamp, followed by one
    section per quote: the quote as a blockquote, its short citation, source
    details, and the full citation.
    """
    out = [
        f"# Quotes: {quote_bank.query}",
        "",
        f"**Found:** {quote_bank.total_found} chunks | **Extracted:** {len(quote_bank.quotes)} quotes",
        f"**Generated:** {quote_bank.generated_at}",
        "",
        "---",
        "",
    ]

    for number, entry in enumerate(quote_bank.quotes, 1):
        out += [
            f"## Quote {number}",
            "",
            f"> \"{entry.text}\"",
            "",
            f"— {entry.citation_short}",
            "",
            f"- **Source:** {entry.title}",
            f"- **Author:** {entry.author}",
        ]
        # The year line is omitted entirely when no year is known
        if entry.year:
            out.append(f"- **Year:** {entry.year}")
        out += [
            f"- **Document ID:** `{entry.document_id}`",
            f"- **Relevance:** {entry.relevance_score:.3f}",
            "",
            "**Full Citation:**",
            f"> {entry.citation_full}",
            "",
            "---",
            "",
        ]

    return '\n'.join(out)


def format_quotes_json(quote_bank: QuoteBank) -> str:
    """Serialize a quote bank to an indented JSON string.

    Each quote is converted to a plain dict with ``dataclasses.asdict``; the
    top-level object also carries the query, counts, timestamp and project ID.
    """
    serialized_quotes = []
    for entry in quote_bank.quotes:
        serialized_quotes.append(asdict(entry))

    payload = dict(
        query=quote_bank.query,
        total_found=quote_bank.total_found,
        quote_count=len(quote_bank.quotes),
        generated_at=quote_bank.generated_at,
        project_id=quote_bank.project_id,
        quotes=serialized_quotes,
    )
    return json.dumps(payload, indent=2)


def format_quotes_simple(quote_bank: QuoteBank) -> str:
    """Render a quote bank as a plain-text numbered list for quick review."""
    header = [
        f"QUOTES: {quote_bank.query}",
        f"Found: {len(quote_bank.quotes)} quotes from {quote_bank.total_found} sources",
        "=" * 60,
        "",
    ]

    # Four lines per quote: text, short citation, document ID, blank separator
    body = []
    for position, entry in enumerate(quote_bank.quotes, 1):
        body.extend((
            f"{position}. \"{entry.text}\"",
            f"   — {entry.citation_short}",
            f"   [{entry.document_id}]",
            "",
        ))

    return '\n'.join(header + body)


# =============================================================================
# PROJECT INTEGRATION
# =============================================================================

def save_quotes_to_project(quote_bank: QuoteBank, project_id: str) -> Optional[Path]:
    """Save a quote bank as markdown inside a book project directory.

    The file is written to ``<projects_dir>/<project_id>/quotes/`` with a
    name built from the sanitized query and a timestamp. The ``quotes``
    subdirectory is created when missing; the project directory itself is not.

    Args:
        quote_bank: Quotes to save (always rendered as markdown).
        project_id: Name of the project directory under the projects dir.

    Returns:
        Path of the written file, or None when the project directory does not
        exist (a warning is logged in that case).
    """
    # Path() wrapping lets the configured location be a plain string
    configured_dir = BOOK_WORKFLOW_CONFIG.get('projects_dir')
    projects_dir = Path(configured_dir) if configured_dir else Path(BASE_DIR) / 'book_projects'
    project_dir = projects_dir / project_id

    if not project_dir.is_dir():
        logger.warning(f"Project directory not found: {project_dir}")
        return None

    # Create quotes subdirectory
    quotes_dir = project_dir / 'quotes'
    quotes_dir.mkdir(exist_ok=True)

    # Generate filename from query: drop punctuation, cap at 30 characters,
    # then collapse whitespace runs into underscores
    safe_query = re.sub(r'[^\w\s-]', '', quote_bank.query)[:30]
    safe_query = re.sub(r'\s+', '_', safe_query).lower()
    timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
    filename = f"quotes_{safe_query}_{timestamp}.md"

    output_path = quotes_dir / filename

    # Explicit UTF-8: the markdown contains non-ASCII characters (e.g. the
    # em dash before each citation) and the platform default encoding may not
    # be able to represent the quoted text.
    with open(output_path, 'w', encoding='utf-8') as f:
        f.write(format_quotes_markdown(quote_bank))

    logger.info(f"Saved quotes to: {output_path}")
    return output_path


# =============================================================================
# CLI INTERFACE
# =============================================================================

def _positive_int(value: str) -> int:
    """argparse type: parse value as an integer that must be at least 1."""
    try:
        number = int(value)
    except ValueError:
        raise argparse.ArgumentTypeError(f"invalid int value: {value!r}") from None
    if number < 1:
        raise argparse.ArgumentTypeError(f"must be a positive integer, got {number}")
    return number


def create_parser() -> argparse.ArgumentParser:
    """Create the CLI argument parser.

    Defines the positional ``query`` plus three option groups: search options
    (--max, --document, --min-length, --max-length, --no-rerank), output
    options (--style, --format, --output) and project integration (--project,
    --save). ``--max`` and ``--max-length`` reject values below 1, since
    they would produce a meaningless search limit or truncation length.

    Returns:
        The configured ``argparse.ArgumentParser``.
    """
    parser = argparse.ArgumentParser(
        prog='extract_quotes',
        description='Extract and format quotes from the research library',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  # Search for quotes on a topic
  python extract_quotes.py "etheric body definition"

  # Get more quotes with Chicago citations
  python extract_quotes.py "Egyptian mysteries" --max 20 --style chicago

  # Output as JSON for Claude Code
  python extract_quotes.py "reincarnation karma" --format json

  # Filter to specific document
  python extract_quotes.py "spiritual science" --document DOC_001

  # Save to project
  python extract_quotes.py "alchemy" --project BOOK_xxx --save
        """
    )

    # Required arguments
    parser.add_argument(
        'query',
        type=str,
        help='Topic or query to search for quotes about'
    )

    # Search options
    search_group = parser.add_argument_group('Search Options')
    search_group.add_argument(
        '--max', '-m',
        type=_positive_int,
        default=10,
        help='Maximum number of quotes to extract (default: 10)'
    )
    search_group.add_argument(
        '--document', '-d',
        type=str,
        help='Filter to specific document ID'
    )
    search_group.add_argument(
        '--min-length',
        type=int,
        default=50,
        help='Minimum quote length in characters (default: 50)'
    )
    search_group.add_argument(
        '--max-length',
        type=_positive_int,
        default=500,
        help='Maximum quote length in characters (default: 500)'
    )
    search_group.add_argument(
        '--no-rerank',
        action='store_true',
        help='Disable cross-encoder reranking'
    )

    # Output options
    output_group = parser.add_argument_group('Output Options')
    output_group.add_argument(
        '--style', '-s',
        type=str,
        choices=['chicago', 'simple'],
        default='chicago',
        help='Citation style (default: chicago)'
    )
    output_group.add_argument(
        '--format', '-f',
        type=str,
        choices=['markdown', 'json', 'simple'],
        default='markdown',
        help='Output format (default: markdown)'
    )
    output_group.add_argument(
        '--output', '-o',
        type=str,
        help='Output file path (prints to stdout if not specified)'
    )

    # Project integration
    project_group = parser.add_argument_group('Project Integration')
    project_group.add_argument(
        '--project', '-p',
        type=str,
        help='Book project ID to associate quotes with'
    )
    project_group.add_argument(
        '--save',
        action='store_true',
        help='Save quotes to project directory (requires --project)'
    )

    return parser


def main():
    """Main entry point.

    Parses the command line, searches for quotes, formats them, and then
    writes the result: to the ``--output`` file and/or the project directory
    (``--save``) when requested, otherwise to stdout.

    Returns:
        Process exit status: 1 when saving to the project failed, or when
        ``--format json`` was requested and no quotes were found; 0 otherwise.
        Invalid argument combinations exit via ``parser.error`` (status 2).
    """
    parser = create_parser()
    args = parser.parse_args()

    # --save is documented as requiring --project; fail loudly instead of
    # silently falling back to printing.
    if args.save and not args.project:
        parser.error('--save requires --project')

    # Search for quotes
    quote_bank = search_for_quotes(
        query=args.query,
        max_quotes=args.max,
        style=args.style,
        document_id=args.document,
        min_quote_length=args.min_length,
        max_quote_length=args.max_length,
        use_rerank=not args.no_rerank
    )

    # Set project ID if provided
    if args.project:
        quote_bank.project_id = args.project

    # Format output
    if args.format == 'json':
        output = format_quotes_json(quote_bank)
    elif args.format == 'simple':
        output = format_quotes_simple(quote_bank)
    else:
        output = format_quotes_markdown(quote_bank)

    exit_code = 0
    written = False

    # --output and --save are independent: honor both when both are given
    if args.output:
        output_path = Path(args.output)
        # Explicit UTF-8 so non-ASCII quote text does not depend on the
        # platform default encoding.
        with open(output_path, 'w', encoding='utf-8') as f:
            f.write(output)
        print(f"Saved to: {output_path}")
        written = True

    if args.save:
        saved_path = save_quotes_to_project(quote_bank, args.project)
        if saved_path:
            print(f"Saved to: {saved_path}")
        else:
            # Report the failure through the exit status and stderr rather
            # than exiting 0 with no output at all.
            print(f"Could not save quotes to project: {args.project}", file=sys.stderr)
            exit_code = 1
        written = True

    if not written:
        print(output)

    # Return status for Claude Code: JSON consumers get a non-zero status
    # when nothing was found
    if args.format == 'json' and not quote_bank.quotes:
        return 1

    return exit_code


# Run the CLI only when this file is executed as a script; the value returned
# by main() becomes the process exit status.
if __name__ == '__main__':
    sys.exit(main())
