#!/usr/bin/env python3
"""
RDF Assess - Mechanical document assessment

Returns measurable metadata about a document without AI judgment.
Replaces persona-based curation with objective metrics.

Usage:
    rdf assess <DOC_ID> [--format {json,text}]
"""

import argparse
import json
import sys
import hashlib
from pathlib import Path

# Add the parent of this script's directory to sys.path so the `pipeline` imports below resolve
sys.path.insert(0, str(Path(__file__).parent.parent))

from pipeline.cli_utils import success_response, error_response, ErrorCodes
from pipeline.db_utils import get_db_connection


# Function words for the language heuristic, listed in tie-break order.
# English is scored as well so that tokens shared with other languages
# ("die", "la", "es") cannot outvote an otherwise English sample.
_LANGUAGE_STOPWORDS = (
    ("en", frozenset({"the", "and", "of", "to", "is"})),
    ("de", frozenset({"der", "die", "das", "und", "ist"})),
    ("fr", frozenset({"le", "la", "les", "et", "est"})),
    ("es", frozenset({"el", "la", "los", "y", "es"})),
)


def _detect_language(text: str) -> str:
    """Guess the language of *text* by counting whole-word stopword hits.

    Returns "en", "de", "fr" or "es". "en" is returned when nothing matches;
    on a tie the language listed first in ``_LANGUAGE_STOPWORDS`` wins.
    """
    # Split on anything that is not a letter so matching is per word; a plain
    # substring test would find "le" in "table" or "der" in "under".
    words = "".join(ch if ch.isalpha() else " " for ch in text.lower()).split()
    best_language, best_hits = "en", 0
    for language, stopwords in _LANGUAGE_STOPWORDS:
        hits = sum(1 for word in words if word in stopwords)
        if hits > best_hits:
            best_language, best_hits = language, hits
    return best_language


def assess_document(doc_id: str) -> dict:
    """Assess a document and return mechanical metadata.

    Args:
        doc_id: Value matched against ``documents.document_id``.

    Returns:
        ``None`` when no document row matches ``doc_id``. Otherwise a dict
        with the keys ``doc_id``, ``metadata`` (title, author, year,
        page_count), ``ocr_quality``, ``language``, ``has_toc``,
        ``chunk_stats``, ``extraction_method``, ``extraction_warnings``,
        ``duplicate_likelihood`` and ``duplicate_count``.

    Exceptions raised by the database layer are not caught here; the cursor
    and connection are closed before they propagate.
    """
    conn = get_db_connection()
    try:
        # Create the cursor inside the outer try so the connection is still
        # closed if cursor creation itself fails.
        cursor = conn.cursor()
        try:
            # Get document info
            cursor.execute("""
            SELECT
                d.document_id,
                d.title,
                d.author,
                d.year,
                d.source_path,
                d.page_count,
                d.content_hash,
                d.created_at,
                d.classification_category,
                d.classification_confidence,
                d.ocr_applied,
                d.extraction_method
            FROM documents d
            WHERE d.document_id = %s
        """, (doc_id,))

            row = cursor.fetchone()
            if not row:
                return None

            doc_info = {
                "document_id": row[0],
                "title": row[1],
                "author": row[2],
                "year": row[3],
                "source_path": row[4],
                "page_count": row[5],
                "content_hash": row[6],
                "created_at": str(row[7]) if row[7] else None,
                "category": row[8],
                "classification_confidence": row[9],
                "ocr_applied": row[10],
                "extraction_method": row[11]
            }

            # Get chunk statistics
            cursor.execute("""
            SELECT
                COUNT(*) as chunk_count,
                AVG(LENGTH(content)) as avg_chunk_length,
                MIN(LENGTH(content)) as min_chunk_length,
                MAX(LENGTH(content)) as max_chunk_length
            FROM chunks
            WHERE document_id = %s
        """, (doc_id,))

            chunk_row = cursor.fetchone()
            chunk_stats = {
                "chunk_count": chunk_row[0] or 0,
                # AVG() may come back as decimal.Decimal depending on the
                # driver (TODO confirm); coerce to float so the value is a
                # plain number for comparison and serialization.
                "avg_chunk_length": round(float(chunk_row[1]), 2) if chunk_row[1] else 0,
                "min_chunk_length": chunk_row[2] or 0,
                "max_chunk_length": chunk_row[3] or 0
            }

            # Get quality assessment if available
            cursor.execute("""
            SELECT quality_score, quality_grade, issues
            FROM document_quality
            WHERE document_id = %s
        """, (doc_id,))

            quality_row = cursor.fetchone()
            if quality_row:
                quality = {
                    "score": quality_row[0],
                    "grade": quality_row[1],
                    "issues": quality_row[2] or []
                }
            else:
                quality = None
            ocr_score = quality["score"] if quality else None

            # Check for duplicates: other documents sharing this content hash
            cursor.execute("""
            SELECT COUNT(*) FROM documents
            WHERE content_hash = %s AND document_id != %s
        """, (doc_info["content_hash"], doc_id))

            dup_count = cursor.fetchone()[0]

            # Detect language (simple heuristic) from one chunk.
            # NOTE(review): there is no ORDER BY, so which chunk is sampled
            # is left to the database.
            cursor.execute("""
            SELECT content FROM chunks
            WHERE document_id = %s
            LIMIT 1
        """, (doc_id,))

            sample_row = cursor.fetchone()
            language = "en"  # Default
            if sample_row and sample_row[0]:
                language = _detect_language(sample_row[0][:500])

            # Check for common extraction issues
            warnings = []
            if chunk_stats["avg_chunk_length"] < 100:
                warnings.append("very_short_chunks")
            if chunk_stats["max_chunk_length"] > 5000:
                warnings.append("oversized_chunks")
            # A quality row may exist with a NULL score; comparing None to a
            # float would raise TypeError, so require a real score first.
            if doc_info["ocr_applied"] and ocr_score is not None and ocr_score < 0.7:
                warnings.append("low_ocr_quality")
            if not doc_info["title"] or doc_info["title"] == "Unknown":
                warnings.append("missing_title")
            if not doc_info["author"] or doc_info["author"] == "Unknown":
                warnings.append("missing_author")

            # Check for table of contents. The %% pairs are literal % signs,
            # doubled because the query also carries a %s placeholder.
            cursor.execute("""
            SELECT COUNT(*) FROM chunks
            WHERE document_id = %s
            AND (content ILIKE '%%table of contents%%'
                 OR content ILIKE '%%chapter%%'
                 OR content ~ '^[IVX]+\\.'
                 OR content ~ '^\\d+\\.')
        """, (doc_id,))

            has_toc = cursor.fetchone()[0] > 0

            return {
                "doc_id": doc_id,
                "metadata": {
                    "title": doc_info["title"],
                    "author": doc_info["author"],
                    "year": doc_info["year"],
                    "page_count": doc_info["page_count"]
                },
                "ocr_quality": ocr_score,
                "language": language,
                "has_toc": has_toc,
                "chunk_stats": chunk_stats,
                "extraction_method": doc_info["extraction_method"],
                "extraction_warnings": warnings,
                # Ten or more hash matches saturate the likelihood at 1.0.
                "duplicate_likelihood": min(dup_count / 10.0, 1.0),
                "duplicate_count": dup_count
            }
        finally:
            cursor.close()
    finally:
        conn.close()


def main():
    """Command-line entry point for ``rdf assess``.

    Parses the document ID and the ``--format`` option, runs
    ``assess_document`` and prints either a JSON response or a plain-text
    summary.

    Returns:
        0 when the assessment was printed; 1 when the document was not found
        or when any exception was raised while assessing or printing (the
        latter is reported with ``ErrorCodes.DATABASE_ERROR``).
    """
    cli = argparse.ArgumentParser(
        formatter_class=argparse.RawDescriptionHelpFormatter,
        description="Mechanical document assessment",
    )
    cli.add_argument("doc_id", help="Document ID to assess (e.g., DOC_023)")
    cli.add_argument(
        "--format",
        default="json",
        choices=["json", "text"],
        help="Output format",
    )
    opts = cli.parse_args()
    wants_json = opts.format == "json"

    try:
        assessment = assess_document(opts.doc_id)

        # Unknown document: report it in the requested format and bail out.
        if not assessment:
            not_found = error_response(
                ErrorCodes.DOCUMENT_NOT_FOUND,
                f"Document not found: {opts.doc_id}",
            )
            if wants_json:
                not_found.print_json()
            else:
                print(f"Error: {not_found.message}")
            return 1

        if wants_json:
            success_response(
                f"Assessed document {opts.doc_id}",
                data=assessment,
            ).print_json()
            return 0

        # Plain-text summary, one field per line.
        print(f"Document: {assessment['doc_id']}")
        meta = assessment["metadata"]
        print(f"Title: {meta['title']}")
        print(f"Author: {meta['author']}")
        print(f"Pages: {meta['page_count']}")
        print(f"Language: {assessment['language']}")
        print(f"OCR Quality: {assessment['ocr_quality']}")
        print(f"Has TOC: {assessment['has_toc']}")
        print(f"Chunks: {assessment['chunk_stats']['chunk_count']}")
        flagged = assessment["extraction_warnings"]
        if flagged:
            print(f"Warnings: {', '.join(flagged)}")
        duplicates = assessment["duplicate_count"]
        if duplicates > 0:
            print(f"Potential duplicates: {duplicates}")
        return 0

    except Exception as exc:
        failure = error_response(ErrorCodes.DATABASE_ERROR, str(exc))
        if wants_json:
            failure.print_json()
        else:
            print(f"Error: {exc}")
        return 1


# Run the CLI only when executed as a script; main()'s return value becomes
# the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())
