#!/usr/bin/env python3
"""
RDF Bibliography Export

Export project bibliographies in various academic formats:
- BibTeX (.bib) - For LaTeX and reference managers
- RIS (.ris) - For EndNote, Zotero, Mendeley
- CSL-JSON (.json) - For Pandoc and Zotero

Usage:
    rdf export bibliography --project BOOK_xxx --format bibtex --output refs.bib
    rdf export bibliography --format ris --output refs.ris
    rdf export bibliography --validate
"""

import argparse
import json
import re
import sys
from datetime import datetime
from pathlib import Path
from typing import Dict, Any, List, Optional

# Put this script's directory and its parent on sys.path so the imports below resolve
sys.path.insert(0, str(Path(__file__).parent.parent))
sys.path.insert(0, str(Path(__file__).parent))

from pipeline.cli_utils import success_response, error_response, ErrorCodes


def _read_sources_file(path: Path, dict_keys: List[str]) -> List[Dict[str, Any]]:
    """Load the source entries stored in one ``sources.json`` file.

    Args:
        path: Location of the JSON file.
        dict_keys: When the file's top-level value is an object, the keys
            whose list values are concatenated, in this order.

    Returns:
        The entries found. An empty list is returned when the file does not
        exist or its top-level value is neither a list nor an object.

    Raises:
        json.JSONDecodeError: If the file exists but is not valid JSON.
    """
    try:
        # JSON is UTF-8 by specification; do not depend on the locale default.
        with open(path, 'r', encoding='utf-8') as f:
            data = json.load(f)
    except FileNotFoundError:
        return []

    if isinstance(data, list):
        return list(data)

    if isinstance(data, dict):
        entries: List[Dict[str, Any]] = []
        for key in dict_keys:
            # ``or []`` tolerates a key that is present but null.
            entries.extend(data.get(key) or [])
        return entries

    return []


def get_project_sources(project_id: str) -> List[Dict[str, Any]]:
    """Get sources from a project's sources.json file.

    Both project layouts are consulted, relative to the current working
    directory, and their entries are concatenated:

    - ``projects/books/<project_id>/sources.json`` -- a list, or an object
      with ``library_sources`` and ``web_sources`` lists.
    - ``essays/<project_id>/sources.json`` -- a list, or an object with a
      ``sources`` list.

    Args:
        project_id: Directory name of the project.

    Returns:
        All source entries found; empty when neither file exists.

    Raises:
        json.JSONDecodeError: If an existing sources file is not valid JSON.
    """
    sources: List[Dict[str, Any]] = []

    # Try book projects
    sources.extend(_read_sources_file(
        Path(f"projects/books/{project_id}/sources.json"),
        ['library_sources', 'web_sources'],
    ))

    # Try essay projects
    sources.extend(_read_sources_file(
        Path(f"essays/{project_id}/sources.json"),
        ['sources'],
    ))

    return sources


def _metadata_fields(doc_id: Any, metadata: Any) -> Dict[str, Any]:
    """Return the extra fields carried by one row's ``metadata`` column.

    Args:
        doc_id: Identifier of the row, used only in warning messages.
        metadata: The raw column value: empty, a JSON string, or an
            already-decoded value.

    Returns:
        The decoded mapping, or an empty dict when the value is empty,
        is not valid JSON, or does not decode to an object. Problems are
        reported on stderr rather than raised so that one bad row does not
        abort loading of the remaining rows.
    """
    if not metadata:
        return {}

    if isinstance(metadata, str):
        try:
            metadata = json.loads(metadata)
        except ValueError as e:  # json.JSONDecodeError is a ValueError
            print(
                f"Warning: Ignoring malformed metadata for document {doc_id}: {e}",
                file=sys.stderr,
            )
            return {}

    if not isinstance(metadata, dict):
        print(
            f"Warning: Ignoring non-object metadata for document {doc_id}",
            file=sys.stderr,
        )
        return {}

    return metadata


def get_library_sources() -> List[Dict[str, Any]]:
    """Get all sources from the library database.

    Reads every row of the ``documents`` table (ordered by author, year,
    title) through ``db_utils.get_db_connection`` and turns each row into a
    dict with ``document_id``, ``title``, ``author``, ``year`` and
    ``source_path``. Fields found in the row's ``metadata`` column are merged
    on top and therefore override the column values of the same name.

    This is best-effort: if the database module cannot be imported or the
    query fails, a warning is printed to stderr and the sources collected so
    far (possibly none) are returned.

    Returns:
        One dict per document row.
    """
    sources: List[Dict[str, Any]] = []
    try:
        # Imported lazily so the module stays usable without the database layer.
        from db_utils import get_db_connection
        with get_db_connection() as conn:
            with conn.cursor() as cur:
                cur.execute("""
                    SELECT document_id, title, author, year, source_path, metadata
                    FROM documents
                    ORDER BY author, year, title
                """)
                for row in cur.fetchall():
                    doc_id, title, author, year, path, metadata = row
                    source = {
                        'document_id': doc_id,
                        'title': title,
                        'author': author,
                        'year': year,
                        'source_path': path,
                    }
                    source.update(_metadata_fields(doc_id, metadata))
                    sources.append(source)
    except Exception as e:
        print(f"Warning: Could not load library sources: {e}", file=sys.stderr)

    return sources


def normalize_author(author: str) -> str:
    """Normalize a single author name to 'Last, First' format.

    Args:
        author: One author's name, either "First [Middle] Last" or already
            "Last, First". Surrounding whitespace is ignored.

    Returns:
        - "Unknown" when the name is empty, ``None`` or only whitespace.
        - The stripped name unchanged when it already contains a comma.
        - "Last, First Middle" when the name has two or more words; the
          final word is taken as the family name.
        - The stripped name itself when it is a single word.
    """
    if not author:
        return "Unknown"

    name = str(author).strip()
    if not name:
        # Whitespace-only input carries no name at all.
        return "Unknown"

    # Already in "Last, First" format
    if ',' in name:
        return name

    # Handle "First Last" format
    parts = name.split()
    if len(parts) >= 2:
        return f"{parts[-1]}, {' '.join(parts[:-1])}"

    return name


def generate_bibtex_key(source: Dict[str, Any]) -> str:
    """Generate a BibTeX citation key.

    The key is ``<lastname><year><first title word>``, lower-cased where
    alphabetic, e.g. ``doe1999the``.

    Args:
        source: Source record; the ``author``, ``year`` and ``title`` entries
            are read. Missing, ``None`` or empty values are tolerated.

    Returns:
        The key, or "unknown" when no part contributes any character.
        Uniqueness across sources is NOT guaranteed here.
    """
    # ``or`` rather than a .get() default: the key may be present with None.
    author = str(source.get('author') or 'unknown')
    title = str(source.get('title') or '')
    year = source.get('year')

    # Extract last name
    if ',' in author:
        last_name = author.split(',')[0].strip()
    else:
        parts = author.split()
        last_name = parts[-1] if parts else 'unknown'

    # Clean last name
    last_name = re.sub(r'[^a-zA-Z]', '', last_name).lower()

    # Keep only key-safe characters of the year ("c. 1850" -> "c1850").
    year_part = '' if year is None else re.sub(r'[^0-9A-Za-z]', '', str(year))

    # Extract first word of title
    words = re.findall(r'[a-zA-Z]+', title)
    title_word = words[0].lower() if words else ''

    key = f"{last_name}{year_part}{title_word}"
    return key or "unknown"


# Single-pass translation table: every special character is replaced exactly
# once, so the braces inside a replacement (e.g. ``\textbackslash{}``) are
# never escaped a second time.
_BIBTEX_ESCAPES = str.maketrans({
    '\\': r'\textbackslash{}',
    '&': r'\&',
    '%': r'\%',
    '$': r'\$',
    '#': r'\#',
    '_': r'\_',
    '{': r'\{',
    '}': r'\}',
    '~': r'\textasciitilde{}',
    '^': r'\textasciicircum{}',
})


def escape_bibtex(text: str) -> str:
    """Escape special characters for BibTeX.

    The input is treated as plain text: the LaTeX special characters
    ``\\ & % $ # _ { } ~ ^`` are replaced by their escaped forms.

    Args:
        text: Value to escape. Non-string values are converted with ``str``.

    Returns:
        The escaped text, or an empty string for an empty/``None`` input.
    """
    if not text:
        return ""

    return str(text).translate(_BIBTEX_ESCAPES)


# Source types that are not BibTeX entry types, mapped to the closest one.
_BIBTEX_TYPE_ALIASES = {
    'chapter': 'incollection',
    'webpage': 'misc',
}


def _bibtex_key_suffix(index: int) -> str:
    """Return the letter suffix for the index-th duplicate: 1 -> a, 26 -> z, 27 -> aa."""
    letters = []
    while index > 0:
        index, remainder = divmod(index - 1, 26)
        letters.append(chr(ord('a') + remainder))
    return ''.join(reversed(letters))


def _bibtex_entry_type(source: Dict[str, Any]) -> str:
    """Choose the BibTeX entry type for one source.

    A journal makes the entry an ``article``; otherwise an http(s) URL makes
    it ``misc``; otherwise the source's own ``type`` is used (default
    ``book``), translated through ``_BIBTEX_TYPE_ALIASES``.
    """
    if source.get('journal'):
        return 'article'
    if 'http' in str(source.get('url') or ''):
        return 'misc'
    declared = re.sub(r'[^a-z]', '', str(source.get('type') or 'book').lower())
    return _BIBTEX_TYPE_ALIASES.get(declared, declared or 'book')


def _bibtex_author_field(raw_author: Any) -> str:
    """Format a ';'-separated author string as a BibTeX name list joined by ' and '."""
    names = [
        normalize_author(piece.strip())
        for piece in str(raw_author or '').split(';')
        if piece.strip()
    ]
    if not names:
        return normalize_author('')
    return ' and '.join(names)


def to_bibtex(sources: List[Dict[str, Any]]) -> str:
    """Convert sources to BibTeX format.

    Each source becomes one entry with ``title`` and ``author`` always
    present and ``year``, ``publisher``, ``journal``, ``volume``, ``pages``,
    ``url``, ``doi``, ``isbn`` and a ``note`` holding the RDF document ID
    emitted when the source provides them. Citation keys come from
    ``generate_bibtex_key`` and are made unique within the output by
    appending ``a``, ``b``, ... to repeated keys.

    Args:
        sources: Source records.

    Returns:
        The entries separated by blank lines; an empty string for no sources.
    """
    entries = []
    used_keys = set()

    for source in sources:
        # Generate unique key
        base_key = generate_bibtex_key(source)
        key = base_key
        counter = 0
        while key in used_keys:
            counter += 1
            key = f"{base_key}{_bibtex_key_suffix(counter)}"
        used_keys.add(key)

        # Build entry
        lines = [f"@{_bibtex_entry_type(source)}{{{key},"]

        # Required fields (``or`` also covers keys present with None)
        title = escape_bibtex(source.get('title') or 'Unknown Title')
        lines.append(f"  title = {{{title}}},")

        author = _bibtex_author_field(source.get('author'))
        lines.append(f"  author = {{{escape_bibtex(author)}}},")

        year = source.get('year', '')
        if year:
            lines.append(f"  year = {{{year}}},")

        # Optional fields
        if source.get('publisher'):
            lines.append(f"  publisher = {{{escape_bibtex(source['publisher'])}}},")

        if source.get('journal'):
            lines.append(f"  journal = {{{escape_bibtex(source['journal'])}}},")

        # The remaining fields are identifiers/locators and are written verbatim.
        for field in ('volume', 'pages', 'url', 'doi', 'isbn'):
            if source.get(field):
                lines.append(f"  {field} = {{{source[field]}}},")

        # Add RDF document ID as note
        if source.get('document_id'):
            lines.append(f"  note = {{RDF Document ID: {source['document_id']}}},")

        lines.append("}")
        entries.append("\n".join(lines))

    return "\n\n".join(entries)


# Source ``type`` value -> RIS reference type (TY tag).
_RIS_TYPES = {
    'book': 'BOOK',
    'article': 'JOUR',
    'chapter': 'CHAP',
    'misc': 'GEN',
    'webpage': 'ELEC',
}

# Separator of a page range: one or more hyphens, en dashes or em dashes.
_RIS_PAGE_RANGE = re.compile(r'\s*[-\u2013\u2014]+\s*')


def _ris_text(value: Any) -> str:
    """Render a field value on a single line, as an RIS tag line requires."""
    return re.sub(r'\s*[\r\n]+\s*', ' ', str(value)).strip()


def to_ris(sources: List[Dict[str, Any]]) -> str:
    """Convert sources to RIS format.

    Each source becomes one record that starts with ``TY`` and ends with
    ``ER``. The tags ``TI``, ``AU`` (one per ';'-separated author), ``PY``,
    ``PB``, ``JO``, ``VL``, ``SP``/``EP``, ``UR``, ``DO``, ``SN`` and ``N1``
    (RDF document ID) are written when the source provides the value.

    Args:
        sources: Source records. An unknown or missing ``type`` is exported
            as ``BOOK``.

    Returns:
        The records separated by blank lines; an empty string for no sources.
    """
    entries = []

    for source in sources:
        lines = []

        # Type (``or`` also covers a key present with None)
        source_type = str(source.get('type') or 'book').lower()
        lines.append(f"TY  - {_RIS_TYPES.get(source_type, 'BOOK')}")

        # Title
        if source.get('title'):
            lines.append(f"TI  - {_ris_text(source['title'])}")

        # Author(s): ';' separates multiple authors
        author = source.get('author', '')
        if author:
            for name in str(author).split(';'):
                if name.strip():
                    lines.append(f"AU  - {_ris_text(normalize_author(name.strip()))}")

        # Year
        if source.get('year'):
            lines.append(f"PY  - {_ris_text(source['year'])}")

        # Publisher
        if source.get('publisher'):
            lines.append(f"PB  - {_ris_text(source['publisher'])}")

        # Journal
        if source.get('journal'):
            lines.append(f"JO  - {_ris_text(source['journal'])}")

        # Volume
        if source.get('volume'):
            lines.append(f"VL  - {_ris_text(source['volume'])}")

        # Pages: "10-20", "10--20" and "10\u201320" all split into start/end
        if source.get('pages'):
            pages = _ris_text(source['pages'])
            bounds = _RIS_PAGE_RANGE.split(pages, maxsplit=1)
            if len(bounds) == 2:
                lines.append(f"SP  - {bounds[0]}")
                lines.append(f"EP  - {bounds[1]}")
            else:
                lines.append(f"SP  - {pages}")

        # URL
        if source.get('url'):
            lines.append(f"UR  - {_ris_text(source['url'])}")

        # DOI
        if source.get('doi'):
            lines.append(f"DO  - {_ris_text(source['doi'])}")

        # ISBN
        if source.get('isbn'):
            lines.append(f"SN  - {_ris_text(source['isbn'])}")

        # RDF Document ID
        if source.get('document_id'):
            lines.append(f"N1  - RDF Document ID: {_ris_text(source['document_id'])}")

        # End of record
        lines.append("ER  - ")

        entries.append("\n".join(lines))

    return "\n\n".join(entries)


# Source ``type`` value -> CSL item type.
_CSL_TYPES = {
    'book': 'book',
    'article': 'article-journal',
    'chapter': 'chapter',
    'misc': 'document',
    'webpage': 'webpage',
}


def _csl_name(name: str) -> Dict[str, str]:
    """Split one author name into CSL ``family``/``given`` parts."""
    if ',' in name:
        family, given = name.split(',', 1)
        return {'family': family.strip(), 'given': given.strip()}

    words = name.split()
    if len(words) >= 2:
        # "First Middle Last": the final word is taken as the family name.
        return {'family': words[-1], 'given': ' '.join(words[:-1])}

    return {'family': name}


def _csl_issued(year: Any) -> Dict[str, Any]:
    """Build the CSL ``issued`` date; non-numeric years are kept as a literal."""
    try:
        return {'date-parts': [[int(year)]]}
    except (TypeError, ValueError):
        return {'literal': str(year)}


def to_csl_json(sources: List[Dict[str, Any]]) -> str:
    """Convert sources to CSL-JSON format.

    Args:
        sources: Source records. ``document_id`` becomes the item ``id``
            (falling back to ``ref<N>`` by position); an unknown or missing
            ``type`` is exported as ``book``; ``author`` may hold several
            names separated by ';'.

    Returns:
        A JSON array (indented by two spaces) with one CSL item per source.
        A year that cannot be converted to an integer is emitted as
        ``{"literal": ...}`` instead of aborting the export.
    """
    csl_items = []

    for i, source in enumerate(sources):
        # ``or`` also covers a type key that is present with None.
        source_type = str(source.get('type') or 'book').lower()
        item = {
            'id': source.get('document_id') or f"ref{i+1}",
            'type': _CSL_TYPES.get(source_type, 'book'),
        }

        # Title
        if source.get('title'):
            item['title'] = source['title']

        # Author(s)
        author = source.get('author', '')
        if author:
            names = [piece.strip() for piece in str(author).split(';')]
            item['author'] = [_csl_name(name) for name in names if name]

        # Date
        if source.get('year'):
            item['issued'] = _csl_issued(source['year'])

        # Publisher
        if source.get('publisher'):
            item['publisher'] = source['publisher']

        # Journal
        if source.get('journal'):
            item['container-title'] = source['journal']

        # Volume
        if source.get('volume'):
            item['volume'] = str(source['volume'])

        # Pages
        if source.get('pages'):
            item['page'] = str(source['pages'])

        # URL
        if source.get('url'):
            item['URL'] = source['url']

        # DOI
        if source.get('doi'):
            item['DOI'] = source['doi']

        # ISBN
        if source.get('isbn'):
            item['ISBN'] = source['isbn']

        csl_items.append(item)

    return json.dumps(csl_items, indent=2)


def validate_sources(sources: List[Dict[str, Any]]) -> Dict[str, Any]:
    """Check each source for the fields a bibliography entry needs.

    A missing title is an issue (it makes the result invalid). A missing
    author, a missing year, and the absence of all of publisher, journal and
    URL are warnings. Sources are labelled by their ``document_id`` or, when
    that is absent, by ``source_<position>`` (1-based).

    Returns:
        A dict with ``total_sources``, ``valid_sources`` (total minus the
        number of issues), the ``issues`` and ``warnings`` message lists, and
        ``is_valid`` (true when there are no issues).
    """
    problems: List[str] = []
    cautions: List[str] = []

    for position, entry in enumerate(sources, start=1):
        label = entry.get('document_id') or f"source_{position}"

        # Hard requirement
        if not entry.get('title'):
            problems.append(f"{label}: Missing title")

        # Soft requirements
        if not entry.get('author'):
            cautions.append(f"{label}: Missing author")
        if not entry.get('year'):
            cautions.append(f"{label}: Missing year")

        # At least one piece of publication info is expected
        has_publication_info = bool(
            entry.get('publisher') or entry.get('journal') or entry.get('url')
        )
        if not has_publication_info:
            cautions.append(f"{label}: No publisher, journal, or URL")

    total = len(sources)
    problem_count = len(problems)

    report = {
        'total_sources': total,
        'valid_sources': total - problem_count,
        'issues': problems,
        'warnings': cautions,
        'is_valid': problem_count == 0,
    }
    return report


def _build_arg_parser() -> argparse.ArgumentParser:
    """Create the command-line parser for the bibliography export command."""
    parser = argparse.ArgumentParser(
        description="Export bibliography in various academic formats",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  # Export project bibliography as BibTeX
  rdf export bibliography --project BOOK_xxx --format bibtex --output refs.bib

  # Export all library sources as RIS
  rdf export bibliography --format ris --output library.ris

  # Export as CSL-JSON for Pandoc
  rdf export bibliography --format csl-json --output refs.json

  # Validate sources before export
  rdf export bibliography --project BOOK_xxx --validate
        """
    )

    parser.add_argument(
        "--project", "-p",
        help="Project ID (e.g., BOOK_xxx). If not specified, exports all library sources."
    )
    parser.add_argument(
        "--format", "-f",
        choices=["bibtex", "ris", "csl-json"],
        default="bibtex",
        help="Output format (default: bibtex)"
    )
    parser.add_argument(
        "--output", "-o",
        help="Output file path. If not specified, prints to stdout."
    )
    parser.add_argument(
        "--validate",
        action="store_true",
        help="Validate sources and report issues"
    )
    parser.add_argument(
        "--json-output",
        action="store_true",
        help="Output results as JSON (for validation)"
    )
    return parser


def _report_error(response: Any, as_json: bool) -> None:
    """Emit an error response: as JSON on request, otherwise as text on stderr."""
    if as_json:
        response.print_json()
    else:
        # stderr keeps the message out of bibliography data piped from stdout.
        print(f"Error: {response.message}", file=sys.stderr)


def _print_capped_list(heading: str, entries: List[str], limit: int = 10) -> None:
    """Print a headed bullet list, showing at most ``limit`` entries."""
    if not entries:
        return
    print(f"\n{heading} ({len(entries)}):")
    for entry in entries[:limit]:
        print(f"  - {entry}")
    if len(entries) > limit:
        print(f"  ... and {len(entries) - limit} more")


def _print_validation_report(validation: Dict[str, Any]) -> None:
    """Print the human-readable summary of a ``validate_sources`` result."""
    print("Validation Results")
    print("=" * 40)
    print(f"Total sources: {validation['total_sources']}")
    print(f"Valid sources: {validation['valid_sources']}")
    _print_capped_list("Issues", validation['issues'])
    _print_capped_list("Warnings", validation['warnings'])


def main() -> int:
    """Run the bibliography export command.

    Loads the sources of ``--project`` (or of the whole library when no
    project is given), then either validates them (``--validate``) or converts
    them to the requested format and writes the result to ``--output`` or
    stdout. With ``--json-output`` the outcome is reported as a JSON response
    instead of plain text.

    Returns:
        Process exit status: 0 on success; 1 when no sources were found or
        validation reported issues.
    """
    args = _build_arg_parser().parse_args()

    # Get sources
    if args.project:
        sources = get_project_sources(args.project)
        if not sources:
            _report_error(
                error_response(
                    ErrorCodes.NOT_FOUND,
                    f"No sources found for project: {args.project}"
                ),
                args.json_output,
            )
            return 1
    else:
        sources = get_library_sources()
        if not sources:
            _report_error(
                error_response(
                    ErrorCodes.LIBRARY_EMPTY,
                    "No sources found in library"
                ),
                args.json_output,
            )
            return 1

    # Validate mode
    if args.validate:
        validation = validate_sources(sources)
        if args.json_output:
            response = success_response(
                f"Validation complete: {validation['valid_sources']}/{validation['total_sources']} valid",
                data=validation
            )
            response.print_json()
        else:
            _print_validation_report(validation)

        return 0 if validation['is_valid'] else 1

    # Convert to output format (argparse restricts --format to these keys)
    converters = {
        "bibtex": to_bibtex,
        "ris": to_ris,
        "csl-json": to_csl_json,
    }
    output = converters.get(args.format, to_bibtex)(sources)

    # Write output
    if args.output:
        output_path = Path(args.output)
        # Explicit UTF-8: titles and names routinely contain non-ASCII text.
        output_path.write_text(output, encoding="utf-8")

        if args.json_output:
            response = success_response(
                f"Exported {len(sources)} sources to {output_path}",
                data={
                    'sources_exported': len(sources),
                    'format': args.format,
                    'output_file': str(output_path)
                }
            )
            response.print_json()
        else:
            print(f"Exported {len(sources)} sources to {output_path}")
    else:
        # Print to stdout
        if args.json_output:
            response = success_response(
                f"Generated {args.format} for {len(sources)} sources",
                data={
                    'sources_exported': len(sources),
                    'format': args.format,
                    'content': output
                }
            )
            response.print_json()
        else:
            print(output)

    return 0


# Script entry point: main()'s return value becomes the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())
