#!/usr/bin/env python3
"""
Provenance System v4.0 (Agent Edition)

Minimal provenance tracking for citations and validation.
Replaces the enterprise envelope system with lean, essential fields.

Fields (only doc_id is required; the others are optional or have defaults):
- doc_id: Document ID
- source_type: "library" (default) or "web"
- page_range: [start, end] page numbers
- chunk_id: Chunk identifier
- extraction: "standard" (default), "academic_pdf", or "ocr"
- content_hash: SHA-256 of chunk content, formatted as "sha256:<hex digest>"

Usage:
    from provenance import create_provenance, create_used_in_trace

    # Create minimal provenance for a chunk
    prov = create_provenance(
        doc_id="DOC_023",
        page_range=[44, 45],
        chunk_id="chunk_023_045",
        content="The actual content..."
    )

    # Create used-in trace linking provenance to output
    trace = create_used_in_trace(
        prov=prov,
        output_file="chapter_01.md",
        line_range=[41, 47]
    )
"""

import json
import hashlib
from dataclasses import dataclass, asdict
from typing import Optional, List, Dict, Any, Tuple
from pathlib import Path

# Pipeline version; written as the "version" field of the saved provenance store file.
PIPELINE_VERSION = "4.0.0"


# =============================================================================
# MINIMAL PROVENANCE
# =============================================================================

@dataclass
class MinimalProvenance:
    """
    Lean provenance record for a single chunk (v4.0 Agent Edition).

    Carries just enough to cite and validate a source; it is not meant to be
    an enterprise audit trail.
    """
    # Identifier of the source document (the only field without a default).
    doc_id: str
    # Origin of the source: "library" or "web".
    source_type: str = "library"
    # [start, end] page numbers, when known.
    page_range: Optional[List[int]] = None
    # Identifier of the chunk within the document, when known.
    chunk_id: Optional[str] = None
    # Extraction method: standard, academic_pdf, ocr.
    extraction: str = "standard"
    # Hash of the chunk content, when computed.
    content_hash: Optional[str] = None

    def to_dict(self) -> Dict[str, Any]:
        """
        Serialize to a compact dictionary.

        "doc_id" and "source_type" are always present. The remaining keys are
        emitted only when they carry information: falsy values are dropped,
        and "extraction" is dropped when it is the default "standard".
        """
        # (key, value, include?) in the order the keys must appear.
        optional_entries = (
            ("page_range", self.page_range, bool(self.page_range)),
            ("chunk_id", self.chunk_id, bool(self.chunk_id)),
            ("extraction", self.extraction, self.extraction != "standard"),
            ("content_hash", self.content_hash, bool(self.content_hash)),
        )
        payload: Dict[str, Any] = {
            "doc_id": self.doc_id,
            "source_type": self.source_type,
        }
        payload.update(
            {key: value for key, value, include in optional_entries if include}
        )
        return payload

    def to_json(self) -> str:
        """Serialize the compact dictionary form to a JSON string."""
        compact = self.to_dict()
        return json.dumps(compact)

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> 'MinimalProvenance':
        """
        Rebuild a record from its dictionary form.

        Keys omitted by ``to_dict`` fall back to the field defaults; a missing
        "doc_id" becomes the empty string.
        """
        fallbacks = {
            "doc_id": "",
            "source_type": "library",
            "extraction": "standard",
        }
        field_names = (
            "doc_id",
            "source_type",
            "page_range",
            "chunk_id",
            "extraction",
            "content_hash",
        )
        return cls(
            **{name: data.get(name, fallbacks.get(name)) for name in field_names}
        )


@dataclass
class UsedInTrace:
    """
    Ties a provenance record to the place it was used in generated output.

    The location is an output file plus a start/end line pair.
    """
    # Provenance of the source material.
    prov: MinimalProvenance
    # Output file the material was used in.
    file: str
    # First line of the usage in the output file.
    line_start: int
    # Last line of the usage in the output file.
    line_end: int

    def to_dict(self) -> Dict[str, Any]:
        """Return ``{"prov": ..., "used_in": {...}}`` as plain dictionaries."""
        source = self.prov.to_dict()
        location = {
            "file": self.file,
            "line_start": self.line_start,
            "line_end": self.line_end,
        }
        return {"prov": source, "used_in": location}

    def to_json(self) -> str:
        """Serialize the dictionary form as JSON indented by two spaces."""
        serialized = json.dumps(self.to_dict(), indent=2)
        return serialized


# =============================================================================
# UTILITY FUNCTIONS
# =============================================================================

def compute_content_hash(content: str) -> str:
    """
    Hash chunk content with SHA-256.

    Args:
        content: Text to hash; it is encoded as UTF-8 first.

    Returns:
        The hex digest prefixed with "sha256:".
    """
    hasher = hashlib.sha256()
    hasher.update(content.encode('utf-8'))
    return "sha256:" + hasher.hexdigest()


def create_provenance(
    doc_id: str,
    content: str = None,
    page_range: Tuple[int, int] = None,
    chunk_id: str = None,
    extraction: str = "standard",
    source_type: str = "library"
) -> MinimalProvenance:
    """
    Build a MinimalProvenance record for one chunk.

    Args:
        doc_id: Document ID (e.g., "DOC_023")
        content: Chunk content; when non-empty its hash is stored
        page_range: (start_page, end_page); stored as a list when non-empty
        chunk_id: Chunk identifier
        extraction: Extraction method
        source_type: "library" or "web"

    Returns:
        MinimalProvenance instance
    """
    # Empty or missing content yields no hash at all.
    digest = None
    if content:
        digest = compute_content_hash(content)

    # Normalize the page range to a list; empty or missing stays None.
    pages = None
    if page_range:
        pages = list(page_range)

    record_fields = {
        "doc_id": doc_id,
        "source_type": source_type,
        "page_range": pages,
        "chunk_id": chunk_id,
        "extraction": extraction,
        "content_hash": digest,
    }
    return MinimalProvenance(**record_fields)


def create_used_in_trace(
    prov: MinimalProvenance,
    output_file: str,
    line_range: Tuple[int, int]
) -> UsedInTrace:
    """
    Link a provenance record to a span of lines in an output file.

    Args:
        prov: Source provenance
        output_file: Path to output file
        line_range: (start_line, end_line) in output; only the first two
            items are read

    Returns:
        UsedInTrace instance
    """
    first_line = line_range[0]
    last_line = line_range[1]
    return UsedInTrace(
        prov=prov,
        file=output_file,
        line_start=first_line,
        line_end=last_line,
    )


def create_web_provenance(
    url: str,
    content: str = None,
    chunk_id: str = None
) -> MinimalProvenance:
    """
    Build a provenance record for material taken from a web page.

    The document ID is derived from the URL: "WEB_" followed by the first
    eight hex characters of the URL's MD5 digest. Page range and extraction
    method are left at the defaults of ``create_provenance``.

    Args:
        url: Source URL
        content: Content (for hash)
        chunk_id: Chunk identifier

    Returns:
        MinimalProvenance with source_type="web"
    """
    # MD5 serves only as a short identifier for the URL here.
    url_digest = hashlib.md5(url.encode()).hexdigest()
    web_doc_id = "WEB_" + url_digest[:8]

    return create_provenance(
        doc_id=web_doc_id,
        content=content,
        chunk_id=chunk_id,
        source_type="web",
    )


# =============================================================================
# PROVENANCE STORE (Simple file-based)
# =============================================================================

class ProvenanceStore:
    """
    Simple file-based provenance storage.

    Replaces the complex ProvenanceTracker for v4.0.

    Records live in memory as a ``{key: provenance dict}`` mapping and are
    written to disk only when ``save()`` is called. The file is a JSON object
    with a "version" field and a "records" mapping.
    """

    def __init__(self, storage_path: Optional[Path] = None):
        """
        Create a store backed by ``storage_path`` and load any existing records.

        Args:
            storage_path: Location of the JSON store file. A plain string is
                also accepted and converted to a Path. When omitted (or
                empty), "provenance_store.json" is used.
        """
        # Coerce to Path so string arguments work with .exists()/.parent below.
        self.storage_path = (
            Path(storage_path) if storage_path else Path("provenance_store.json")
        )
        self._records: Dict[str, Dict] = {}
        self._load()

    def _load(self) -> None:
        """
        Load existing records, best effort.

        A missing, unreadable, undecodable, or malformed file leaves the
        store empty instead of raising.
        """
        if not self.storage_path.exists():
            return

        try:
            with open(self.storage_path, 'r', encoding='utf-8') as f:
                data = json.load(f)
        except (ValueError, OSError):
            # ValueError covers both json.JSONDecodeError and
            # UnicodeDecodeError; OSError covers I/O failures.
            self._records = {}
            return

        # Valid JSON can still have the wrong shape (e.g. a top-level list,
        # or "records": null); only a mapping of records is accepted.
        records = data.get("records") if isinstance(data, dict) else None
        self._records = records if isinstance(records, dict) else {}

    def save(self) -> None:
        """Write all records to disk, creating parent directories as needed."""
        self.storage_path.parent.mkdir(parents=True, exist_ok=True)
        payload = {
            "version": PIPELINE_VERSION,
            "records": self._records,
        }
        with open(self.storage_path, 'w', encoding='utf-8') as f:
            json.dump(payload, f, indent=2)

    def store(self, prov: 'MinimalProvenance') -> str:
        """
        Store provenance in memory and return its key.

        The key is the record's chunk_id, or its doc_id when no chunk_id is
        set; an existing record under the same key is overwritten. Call
        ``save()`` to persist.
        """
        key = prov.chunk_id or prov.doc_id
        self._records[key] = prov.to_dict()
        return key

    def get(self, key: str) -> Optional['MinimalProvenance']:
        """Retrieve provenance by key, or None when the key is unknown."""
        data = self._records.get(key)
        if not data:
            return None
        return MinimalProvenance.from_dict(data)

    def get_by_doc(self, doc_id: str) -> List['MinimalProvenance']:
        """Get all provenance records whose "doc_id" equals ``doc_id``."""
        return [
            MinimalProvenance.from_dict(data)
            for data in self._records.values()
            if data.get("doc_id") == doc_id
        ]


# =============================================================================
# CLI
# =============================================================================

if __name__ == '__main__':
    # Command-line entry point: inspect one stored record by key, or list
    # every stored record that belongs to a document.
    import argparse

    parser = argparse.ArgumentParser(description='Provenance utilities (v4.0)')
    parser.add_argument('--inspect', type=str, help='Inspect provenance by chunk_id')
    parser.add_argument('--doc', type=str, help='List provenance for document')
    parser.add_argument('--storage', type=str, default='provenance_store.json',
                        help='Path to provenance storage file')
    parser.add_argument('--format', choices=['json', 'text'], default='json',
                        help='Output format (default: json)')

    args = parser.parse_args()

    if not (args.inspect or args.doc):
        # No action requested: show usage instead of exiting silently.
        parser.print_help()
    else:
        store = ProvenanceStore(Path(args.storage))

        if args.inspect:
            prov = store.get(args.inspect)
            if prov is None:
                # Report on stderr with a non-zero status so that stdout stays
                # valid JSON and scripts can detect the miss.
                parser.exit(status=1, message=f"Not found: {args.inspect}\n")

            if args.format == 'json':
                print(prov.to_json())
            else:
                print(f"Document: {prov.doc_id}")
                print(f"Type: {prov.source_type}")
                print(f"Pages: {prov.page_range}")
                print(f"Chunk: {prov.chunk_id}")

        else:
            records = store.get_by_doc(args.doc)
            if args.format == 'json':
                print(json.dumps([r.to_dict() for r in records], indent=2))
            else:
                print(f"Provenance records for {args.doc}: {len(records)}")
                for r in records:
                    print(f"  - {r.chunk_id}: pages {r.page_range}")
