#!/usr/bin/env python3
"""
Library Gardener - Automated Library Maintenance and Metadata Correction

This tool orchestrates existing pipeline components to provide automated
library health analysis, metadata correction suggestions, and data hygiene.

Features:
- Library health analysis with actionable recommendations
- Author name deduplication (find/merge variations)
- Automated metadata correction suggestions using taxonomist
- Review queue generation prioritized by issue severity
- Batch auto-fix with confidence thresholds

Usage:
    # Analyze library health
    python library_gardener.py --analyze

    # Find author name variations
    python library_gardener.py --dedupe-authors

    # Generate metadata fix suggestions
    python library_gardener.py --suggest-fixes

    # Auto-fix with high confidence threshold
    python library_gardener.py --auto-fix --min-confidence 0.9 --dry-run

    # Generate prioritized review queue
    python library_gardener.py --review-queue

    # Full gardening pass
    python library_gardener.py --full-pass

    # JSON output for Claude Code integration
    python library_gardener.py --analyze --format json
"""

import os
import sys
import json
import argparse
import logging
from datetime import datetime
from typing import Dict, List, Any, Optional, Tuple
from pathlib import Path
from collections import defaultdict
from dataclasses import dataclass, asdict, field

# Make the directory that contains this file's parent directory importable so
# the `pipeline` package resolves, and switch the process working directory to
# that same location. NOTE: both happen as import-time side effects of this
# module, so relative paths used afterwards resolve against that directory.
sys.path.insert(0, str(Path(__file__).parent.parent))
os.chdir(str(Path(__file__).parent.parent))

from pipeline.db_utils import get_db_connection, execute_query, get_dict_cursor
from pipeline.config import INTELLIGENCE_MODE, BASE_DIR
from pipeline.cli_utils import UndoLog

# Optional fuzzy matching backend: rapidfuzz is tried first, fuzzywuzzy is the
# fallback. Both expose `fuzz` and `process`; FUZZY_AVAILABLE records whether
# either import succeeded.
try:
    from rapidfuzz import fuzz, process
    FUZZY_AVAILABLE = True
except ImportError:
    try:
        from fuzzywuzzy import fuzz, process
        FUZZY_AVAILABLE = True
    except ImportError:
        FUZZY_AVAILABLE = False

# Optional taxonomist (document classifier); TAXONOMIST_AVAILABLE records
# whether the import succeeded.
try:
    from pipeline.taxonomist import Taxonomist
    TAXONOMIST_AVAILABLE = True
except ImportError:
    TAXONOMIST_AVAILABLE = False

# Configure root logging at INFO with a "LEVEL: message" format.
logging.basicConfig(level=logging.INFO, format='%(levelname)s: %(message)s')
logger = logging.getLogger(__name__)

# File passed to UndoLog when metadata fixes are applied for real (not dry-run)
UNDO_LOG_PATH = BASE_DIR / 'logs' / 'gardener_undo_log.json'


# =============================================================================
# DATA CLASSES
# =============================================================================

@dataclass
class LibraryHealthReport:
    """Overall library health summary.

    All counters default to 0 and are filled in by analyze_library_health();
    `recommendations` is produced from those counters by
    _generate_recommendations().
    """
    # Row counts of the documents, authors and chunks tables
    total_documents: int = 0
    total_authors: int = 0
    total_chunks: int = 0

    # Quality distribution: number of documents per documents.quality_status
    # value; `quality_unassessed` counts rows whose status is NULL
    quality_excellent: int = 0
    quality_good: int = 0
    quality_fair: int = 0
    quality_poor: int = 0
    quality_unusable: int = 0
    quality_unassessed: int = 0

    # Metadata completeness: number of documents lacking each piece of metadata
    missing_author: int = 0
    missing_year: int = 0
    missing_category: int = 0
    missing_title: int = 0  # documents with filename-like titles

    # Issues
    needs_review_count: int = 0  # documents with needs_review = true
    duplicate_content_hashes: int = 0  # content_hash values shared by more than one document
    author_variations_found: int = 0  # number of fuzzy author-name groups found

    # Recommendations: human-readable advice strings
    recommendations: List[str] = field(default_factory=list)

    # Local (naive) ISO-8601 timestamp taken when the report object is created
    generated_at: str = field(default_factory=lambda: datetime.now().isoformat())


@dataclass
class AuthorVariation:
    """Group of similar author names that may be the same person."""
    # Name chosen to represent the group
    canonical_name: str
    # The other names in the group (the canonical name is not repeated here)
    variations: List[str]
    document_counts: Dict[str, int]  # name -> doc count
    similarity_scores: Dict[str, float]  # name -> similarity to canonical
    # Sum of the document counts of every name in the group
    total_documents: int = 0


@dataclass
class MetadataFixSuggestion:
    """Suggested metadata correction for a document."""
    document_id: str
    # Title of the document at the time the suggestion was generated
    current_title: str
    # Name of the metadata field the suggestion targets; the suggestions built
    # in this module use 'primary_category', 'author' and 'title'
    field: str
    current_value: Any
    suggested_value: Any
    # Confidence score compared against the min_confidence thresholds
    confidence: float
    # Human-readable explanation of where the suggestion came from
    reason: str
    # Only suggestions with auto_fixable=True are eligible for apply_metadata_fixes()
    auto_fixable: bool = False


@dataclass
class ReviewQueueItem:
    """Document requiring manual review."""
    document_id: str
    title: str
    priority: int  # 1-10, higher = more urgent
    # Human-readable descriptions of every problem detected for the document
    issues: List[str]
    # Human-readable follow-up steps matching the detected issues
    suggested_actions: List[str]
    quality_score: Optional[int] = None
    # ISO-8601 string of the document's created_at value, when one is present
    created_at: Optional[str] = None


# =============================================================================
# LIBRARY HEALTH ANALYSIS
# =============================================================================

def analyze_library_health() -> LibraryHealthReport:
    """
    Collect library-wide statistics and derive maintenance advice.

    Runs a fixed series of COUNT queries against the ``documents``,
    ``authors`` and ``chunks`` tables, counts fuzzy author-name groups when a
    fuzzy matching backend is installed, and finally asks
    ``_generate_recommendations`` for the advice list.

    Returns:
        A populated LibraryHealthReport.
    """
    health = LibraryHealthReport()

    # quality_status value -> report attribute that stores that bucket's count.
    # Status values not listed here are ignored.
    bucket_for_status = {
        'excellent': 'quality_excellent',
        'good': 'quality_good',
        'fair': 'quality_fair',
        'poor': 'quality_poor',
        'unusable': 'quality_unusable',
        None: 'quality_unassessed',
    }

    with get_db_connection() as conn:
        with conn.cursor() as cur:

            def count_of(sql):
                """Run a query that yields one row and return its first column."""
                cur.execute(sql)
                return cur.fetchone()[0]

            # Table sizes
            health.total_documents = count_of("SELECT COUNT(*) FROM documents")
            health.total_authors = count_of("SELECT COUNT(*) FROM authors")
            health.total_chunks = count_of("SELECT COUNT(*) FROM chunks")

            # Documents per quality bucket
            cur.execute("""
                SELECT quality_status, COUNT(*)
                FROM documents
                GROUP BY quality_status
            """)
            for status, count in cur.fetchall():
                attr = bucket_for_status.get(status)
                if attr is not None:
                    setattr(health, attr, count)

            # Metadata gaps
            health.missing_author = count_of("""
                SELECT COUNT(*) FROM documents
                WHERE author_id IS NULL
            """)

            health.missing_year = count_of("""
                SELECT COUNT(*) FROM documents
                WHERE publication_year IS NULL
            """)

            health.missing_category = count_of("""
                SELECT COUNT(*) FROM documents
                WHERE primary_category IS NULL OR primary_category = ''
            """)

            # Titles that look like a filename, a bare number or a DOC_ id
            health.missing_title = count_of("""
                SELECT COUNT(*) FROM documents
                WHERE title ~ '\\.(pdf|docx?|txt|epub|md)$'
                   OR title ~ '^[0-9_-]+$'
                   OR title ~ '^DOC_'
            """)

            # Documents explicitly flagged for review
            health.needs_review_count = count_of("""
                SELECT COUNT(*) FROM documents
                WHERE needs_review = true
            """)

            # Number of content hashes shared by more than one document
            health.duplicate_content_hashes = count_of("""
                SELECT COUNT(*) FROM (
                    SELECT content_hash
                    FROM documents
                    WHERE content_hash IS NOT NULL
                    GROUP BY content_hash
                    HAVING COUNT(*) > 1
                ) dupes
            """)

    # Author-name groups are only counted when fuzzy matching is installed
    if FUZZY_AVAILABLE:
        health.author_variations_found = len(find_author_variations(threshold=85))

    health.recommendations = _generate_recommendations(health)
    return health


def _generate_recommendations(report: LibraryHealthReport) -> List[str]:
    """Translate the counters of a health report into advice strings.

    Each non-zero problem counter contributes one message, in a fixed order:
    unassessed quality, poor/unusable share above 10%, missing author,
    missing category, filename-like titles, author variations, duplicate
    content and documents flagged for review. When nothing applies, a single
    "good health" message is returned instead.

    Args:
        report: Health report whose counters are inspected (not modified).

    Returns:
        List of human-readable recommendation strings (never empty).
    """
    advice: List[str] = []
    note = advice.append

    # Share of documents rated poor or unusable; stays 0 for an empty library
    total = report.total_documents
    low_quality_share = (
        (report.quality_poor + report.quality_unusable) / total * 100
        if total > 0 else 0
    )

    # --- Quality ---
    if report.quality_unassessed > 0:
        note(
            f"Run quality assessment on {report.quality_unassessed} unassessed documents: "
            "`python pipeline/assess_quality.py`"
        )
    if low_quality_share > 10:
        note(
            f"{low_quality_share:.1f}% of documents have poor/unusable quality. "
            "Consider re-OCR: `python pipeline/reocr_document.py --quality-below 50`"
        )

    # --- Metadata ---
    if report.missing_author > 0:
        note(
            f"{report.missing_author} documents missing author. "
            "Run: `python library_gardener.py --suggest-fixes --field author`"
        )
    if report.missing_category > 0:
        note(
            f"{report.missing_category} documents missing category. "
            "Run: `python library_gardener.py --auto-fix --field category --min-confidence 0.8`"
        )
    if report.missing_title > 0:
        note(
            f"{report.missing_title} documents have filename-like titles. "
            "Review: `python library_gardener.py --review-queue --issue bad-title`"
        )

    # --- Author deduplication ---
    if report.author_variations_found > 0:
        note(
            f"Found {report.author_variations_found} potential author name variations. "
            "Review: `python library_gardener.py --dedupe-authors`"
        )

    # --- Duplicate content ---
    if report.duplicate_content_hashes > 0:
        note(
            f"Found {report.duplicate_content_hashes} duplicate document groups. "
            "Review: `python library_gardener.py --find-duplicates`"
        )

    # --- Review flags ---
    if report.needs_review_count > 0:
        note(
            f"{report.needs_review_count} documents flagged for review. "
            "Process: `python pipeline/edit_metadata.py --list-review`"
        )

    if advice:
        return advice
    note("Library is in good health! No immediate actions needed.")
    return advice


# =============================================================================
# AUTHOR DEDUPLICATION
# =============================================================================

def find_author_variations(threshold: int = 80) -> List[AuthorVariation]:
    """
    Find potential author name variations using fuzzy matching.

    Every author name is compared against all other names with
    ``fuzz.token_sort_ratio`` (at most 10 candidates per name are considered).
    Names scoring at or above ``threshold`` are grouped together, and each
    name ends up in at most one group so that groups never overlap and
    document totals are not double-counted.

    Within a group the canonical name is the one with the most documents
    (ties broken by the longest name), and ``similarity_scores`` holds the
    similarity of every other member to that canonical name.

    Note: authors are keyed by name, so several author rows sharing the exact
    same name are treated as a single entry here.

    Args:
        threshold: Minimum similarity score (0-100) to consider a match

    Returns:
        List of AuthorVariation groups; empty when fuzzy matching is not
        installed or fewer than two authors exist.
    """
    if not FUZZY_AVAILABLE:
        logger.warning("Fuzzy matching not available. Install: pip install rapidfuzz")
        return []

    # Get all authors with their document counts
    with get_db_connection() as conn:
        with conn.cursor() as cur:
            cur.execute("""
                SELECT a.author_id, a.name, a.name_normalized,
                       COUNT(d.document_id) as doc_count
                FROM authors a
                LEFT JOIN documents d ON a.author_id = d.author_id
                GROUP BY a.author_id, a.name, a.name_normalized
                ORDER BY a.name
            """)
            authors = cur.fetchall()

    if len(authors) < 2:
        return []

    # name -> details, in the query's (name-sorted) order
    author_data = {row[1]: {'id': row[0], 'normalized': row[2], 'doc_count': row[3]}
                   for row in authors}
    author_names = list(author_data)

    grouped = set()  # names already assigned to a group
    variations = []

    for name in author_names:
        if name in grouped:
            continue

        # match[0] is the candidate name and match[1] its score for both
        # rapidfuzz and fuzzywuzzy result tuples.
        matches = process.extract(name, author_names, scorer=fuzz.token_sort_ratio, limit=10)
        similar = [match[0] for match in matches
                   if match[1] >= threshold
                   and match[0] != name
                   # A name that already belongs to an earlier group must not
                   # be claimed by a second one.
                   and match[0] not in grouped]
        if not similar:
            continue

        all_names = [name] + similar

        # Pick canonical name (most documents, then longest name)
        canonical = max(all_names,
                        key=lambda n: (author_data[n]['doc_count'], len(n)))
        members = [n for n in all_names if n != canonical]

        variations.append(AuthorVariation(
            canonical_name=canonical,
            variations=members,
            document_counts={n: author_data[n]['doc_count'] for n in all_names},
            # Score every variation against the canonical name (not against
            # the name the search happened to start from), so each listed
            # variation has exactly one entry.
            similarity_scores={n: fuzz.token_sort_ratio(canonical, n) for n in members},
            total_documents=sum(author_data[n]['doc_count'] for n in all_names)
        ))

        grouped.update(all_names)

    return variations


def merge_authors(source_names: List[str], target_name: str, dry_run: bool = True) -> Dict[str, Any]:
    """
    Merge multiple author records into one.

    Documents belonging to the source authors are re-pointed at the target
    author and the source author rows are deleted. The target author itself
    is never treated as a source, even if its name appears in
    ``source_names``; otherwise the merge would delete the very record the
    documents were just moved to.

    Args:
        source_names: Author names to merge from
        target_name: Author name to merge into (canonical)
        dry_run: If True, only preview changes

    Returns:
        Dictionary with merge results: ``target``, ``sources``,
        ``documents_updated``, ``authors_removed`` and ``dry_run``, plus an
        ``error`` message when the target or all sources could not be found.
    """
    result = {
        'target': target_name,
        'sources': source_names,
        'documents_updated': 0,
        'authors_removed': 0,
        'dry_run': dry_run
    }

    with get_db_connection() as conn:
        with conn.cursor() as cur:
            # Get target author ID
            cur.execute("SELECT author_id FROM authors WHERE name = %s", (target_name,))
            row = cur.fetchone()
            if not row:
                result['error'] = f"Target author '{target_name}' not found"
                return result
            target_id = row[0]

            # Resolve source names to IDs, skipping names that do not exist,
            # the target itself, and IDs that were already collected.
            source_ids = []
            for name in source_names:
                cur.execute("SELECT author_id FROM authors WHERE name = %s", (name,))
                row = cur.fetchone()
                if not row:
                    continue
                author_id = row[0]
                if author_id != target_id and author_id not in source_ids:
                    source_ids.append(author_id)

            if not source_ids:
                result['error'] = "No source authors found"
                return result

            # Count documents to update
            cur.execute("""
                SELECT COUNT(*) FROM documents
                WHERE author_id = ANY(%s)
            """, (source_ids,))
            result['documents_updated'] = cur.fetchone()[0]
            result['authors_removed'] = len(source_ids)

            if not dry_run:
                # Update documents to point to target author
                cur.execute("""
                    UPDATE documents
                    SET author_id = %s, updated_at = NOW()
                    WHERE author_id = ANY(%s)
                """, (target_id, source_ids))

                # Delete source authors
                cur.execute("""
                    DELETE FROM authors
                    WHERE author_id = ANY(%s)
                """, (source_ids,))

                conn.commit()
                logger.info(f"Merged {len(source_ids)} authors into '{target_name}'")

    return result


# =============================================================================
# METADATA FIX SUGGESTIONS
# =============================================================================

def suggest_metadata_fixes(
    field: Optional[str] = None,
    limit: int = 100,
    min_confidence: float = 0.0
) -> List[MetadataFixSuggestion]:
    """
    Generate metadata correction suggestions using taxonomist and heuristics.

    Documents with a missing author, a missing category or a filename-like
    title are fetched together with their first text chunk. Category and
    author suggestions come from the taxonomist (when it is importable and
    can be constructed); title suggestions come from a text heuristic with a
    fixed confidence. Every kind of suggestion, including title suggestions,
    is dropped when its confidence is below ``min_confidence``.

    Args:
        field: Specific field to check (author, category, title) or None for all.
            Any other value yields an empty list.
        limit: Maximum suggestions to return
        min_confidence: Minimum confidence threshold

    Returns:
        List of MetadataFixSuggestion objects, sorted by confidence descending
    """
    suggestions = []

    # Confidence assigned to every heuristic title extraction
    title_confidence = 0.6

    want_author = field == 'author' or field is None
    want_category = field == 'category' or field is None
    want_title = field == 'title' or field is None

    # Build query based on field
    conditions = []
    if want_author:
        conditions.append("d.author_id IS NULL")
    if want_category:
        conditions.append("(d.primary_category IS NULL OR d.primary_category = '')")
    if want_title:
        conditions.append("""(
                    d.title ~ '\\.(pdf|docx?|txt|epub|md)$'
                    OR d.title ~ '^[0-9_-]+$'
                    OR d.title ~ '^DOC_'
                )""")

    if not conditions:
        return suggestions

    where_clause = ' OR '.join(conditions)

    # Get documents needing fixes
    with get_db_connection() as conn:
        with conn.cursor() as cur:
            query = f"""
                SELECT d.document_id, d.title, d.author_id, d.primary_category,
                       a.name as author_name,
                       (SELECT chunk_text FROM chunks
                        WHERE document_id = d.document_id
                        ORDER BY chunk_sequence LIMIT 1) as first_chunk
                FROM documents d
                LEFT JOIN authors a ON d.author_id = a.author_id
                WHERE {where_clause}
                LIMIT %s
            """
            cur.execute(query, (limit * 2,))  # Get extra for filtering
            documents = cur.fetchall()

    if not documents:
        return suggestions

    # Use taxonomist if available for intelligent suggestions
    taxonomist = None
    if TAXONOMIST_AVAILABLE:
        try:
            taxonomist = Taxonomist()
        except Exception as e:
            logger.warning(f"Could not initialize taxonomist: {e}")

    for doc in documents:
        doc_id, title, author_id, category, _author_name, first_chunk = doc

        # Without text there is nothing to classify or extract a title from
        if not first_chunk:
            continue

        needs_category = want_category and not category
        needs_author = want_author and not author_id

        # Only classify when the result can actually produce a suggestion;
        # documents selected solely for a bad title skip the classifier.
        if taxonomist and (needs_category or needs_author):
            try:
                classification = taxonomist.classify_document(first_chunk[:4000])
                conf = classification.get('confidence', 0.5)

                # Category suggestion
                if (needs_category and classification.get('primary_category')
                        and conf >= min_confidence):
                    suggestions.append(MetadataFixSuggestion(
                        document_id=doc_id,
                        current_title=title,
                        field='primary_category',
                        current_value=category,
                        suggested_value=classification['primary_category'],
                        confidence=conf,
                        reason=f"Taxonomist classification: {classification.get('summary', 'N/A')}",
                        auto_fixable=conf >= 0.8
                    ))

                # Author suggestion
                detected_author = classification.get('detected_author')
                if needs_author and detected_author and conf >= min_confidence:
                    suggestions.append(MetadataFixSuggestion(
                        document_id=doc_id,
                        current_title=title,
                        field='author',
                        current_value=None,
                        suggested_value=detected_author,
                        confidence=conf,
                        reason="Author detected from document text",
                        auto_fixable=conf >= 0.85
                    ))

            except Exception as e:
                # Best effort: a failing classification must not stop the batch
                logger.debug(f"Taxonomist error for {doc_id}: {e}")

        # Title suggestion (heuristic-based); honours min_confidence like the
        # taxonomist-based suggestions above
        if want_title and title_confidence >= min_confidence and _is_bad_title(title):
            # Try to extract title from first chunk
            extracted_title = _extract_title_from_text(first_chunk)
            if extracted_title:
                suggestions.append(MetadataFixSuggestion(
                    document_id=doc_id,
                    current_title=title,
                    field='title',
                    current_value=title,
                    suggested_value=extracted_title,
                    confidence=title_confidence,
                    reason="Title appears to be filename; extracted from document text",
                    auto_fixable=False  # Title changes need human review
                ))

        if len(suggestions) >= limit:
            break

    # Sort by confidence descending
    suggestions.sort(key=lambda s: s.confidence, reverse=True)

    return suggestions[:limit]


def _is_bad_title(title: str) -> bool:
    """Return True when a title is empty or looks like a filename/placeholder.

    Matching is case-insensitive. The 'untitled' and 'document' checks are
    prefix matches, so any title that merely starts with those words is
    reported as bad as well.
    """
    import re

    if not title:
        return True

    filename_like = (
        r'\.(pdf|docx?|txt|epub|md)$',  # ends with a file extension
        r'^[0-9_-]+$',                   # digits and separators only
        r'^DOC_',                         # document-id style prefix
        r'^untitled',                     # placeholder prefix
        r'^document',                     # generic prefix
    )
    return any(re.search(regex, title, re.IGNORECASE) for regex in filename_like)


def _extract_title_from_text(text: str) -> Optional[str]:
    """Attempt to extract a title from document text.

    Only the first 500 characters are examined. Three heuristics are tried in
    order and the first hit wins:

    1. A run of upper-case words (11-81 characters) that ends at a line
       break; it is returned in Title Case with all whitespace, including
       line breaks inside the run, collapsed to single spaces.
    2. The text following the word "title" (case-insensitive, matched as a
       whole word start so "subtitle" or "entitle" do not trigger it).
    3. The first of the leading five lines that is 11-99 characters long,
       does not start with '#', and does not end like a sentence.

    Args:
        text: Document text (typically the first chunk).

    Returns:
        The extracted title, or None when the text is empty or no heuristic
        matches.
    """
    if not text:
        return None

    import re

    # Look for title patterns in first 500 chars
    sample = text[:500]

    # Pattern 1: ALL CAPS heading. \s also matches newlines, so a heading
    # wrapped over several lines is captured as one run; collapse the
    # whitespace so the result never contains embedded line breaks.
    match = re.search(r'^([A-Z][A-Z\s]{10,80})\n', sample, re.MULTILINE)
    if match:
        return ' '.join(match.group(1).split()).title()

    # Pattern 2: "Title:" prefix. \b keeps words that merely contain
    # "title" (e.g. "subtitle") from matching.
    match = re.search(r'\btitle[:\s]+([^\n]{5,100})', sample, re.IGNORECASE)
    if match:
        return match.group(1).strip()

    # Pattern 3: First non-empty line that looks like a title
    for line in sample.split('\n')[:5]:
        line = line.strip()
        if 10 < len(line) < 100 and not line.startswith('#'):
            # Check it's not a sentence (no period at end unless abbreviation)
            if not line.endswith('.') or line.endswith(('Dr.', 'etc.')):
                return line

    return None


def apply_metadata_fixes(
    suggestions: List[MetadataFixSuggestion],
    min_confidence: float = 0.9,
    dry_run: bool = True
) -> Dict[str, Any]:
    """
    Apply the eligible subset of a batch of metadata suggestions.

    A suggestion is eligible when it is marked ``auto_fixable`` and its
    confidence is not below ``min_confidence``; everything else is counted as
    skipped. In a real run each eligible change is first recorded in the undo
    log and then written to the database. In a dry run nothing is written,
    but eligible suggestions are still counted and listed as applied.

    Args:
        suggestions: List of suggestions to apply
        min_confidence: Minimum confidence for auto-fix
        dry_run: If True, only preview changes

    Returns:
        Summary dict with the counters ``total_suggestions``, ``applied``,
        ``skipped`` and ``errors``, the ``dry_run`` flag, per-fix ``details``
        and the ``undo_log_path`` (None for a dry run).
    """
    live = not dry_run

    summary = {
        'total_suggestions': len(suggestions),
        'applied': 0,
        'skipped': 0,
        'errors': 0,
        'dry_run': dry_run,
        'details': [],
        'undo_log_path': str(UNDO_LOG_PATH) if live else None
    }

    # The undo log only exists for runs that really modify data
    undo_log = UndoLog(UNDO_LOG_PATH) if live else None

    for item in suggestions:
        eligible = item.auto_fixable and not (item.confidence < min_confidence)
        if not eligible:
            summary['skipped'] += 1
            continue

        try:
            if live:
                # Record the change first, then perform it
                undo_log.log_change(
                    operation="update_metadata",
                    table="documents",
                    affected_ids=[item.document_id],
                    before_state={item.field: item.current_value},
                    after_state={item.field: item.suggested_value},
                    notes=f"Auto-fix: {item.reason}"
                )
                _apply_single_fix(item)

            summary['applied'] += 1
            summary['details'].append({
                'document_id': item.document_id,
                'field': item.field,
                'old_value': item.current_value,
                'new_value': item.suggested_value,
                'confidence': item.confidence
            })
        except Exception as e:
            # One failing fix is logged and counted; the batch continues
            summary['errors'] += 1
            logger.error(f"Error applying fix to {item.document_id}: {e}")

    return summary


def _apply_single_fix(suggestion: MetadataFixSuggestion) -> None:
    """Apply a single metadata fix to the database.

    For ``field == 'author'`` the suggested author is upserted into
    ``authors`` and the document is linked to the resulting author_id. For
    any other field the value is written straight into the ``documents``
    column of that name.

    Args:
        suggestion: The fix to apply; ``field``, ``suggested_value`` and
            ``document_id`` are read.

    Raises:
        ValueError: If ``field`` is not 'author' and is not a plain
            identifier. The column name is interpolated into the SQL text
            (it cannot be passed as a bound parameter), so anything that is
            not a bare identifier is rejected before the database is touched.
    """
    column = suggestion.field
    is_author_fix = column == 'author'

    if not is_author_fix and not (isinstance(column, str) and column.isidentifier()):
        raise ValueError(f"Refusing to update invalid column name: {column!r}")

    with get_db_connection() as conn:
        with conn.cursor() as cur:
            if is_author_fix:
                # First, ensure author exists.
                # NOTE(review): on a name_normalized conflict this also
                # overwrites the existing author's display name with the
                # suggested spelling - confirm that is intended.
                cur.execute("""
                    INSERT INTO authors (name, name_normalized)
                    VALUES (%s, LOWER(%s))
                    ON CONFLICT (name_normalized) DO UPDATE SET name = EXCLUDED.name
                    RETURNING author_id
                """, (suggestion.suggested_value, suggestion.suggested_value))
                author_id = cur.fetchone()[0]

                cur.execute("""
                    UPDATE documents
                    SET author_id = %s, updated_at = NOW()
                    WHERE document_id = %s
                """, (author_id, suggestion.document_id))
            else:
                # Direct field update; `column` was validated above
                cur.execute(f"""
                    UPDATE documents
                    SET {column} = %s, updated_at = NOW()
                    WHERE document_id = %s
                """, (suggestion.suggested_value, suggestion.document_id))

            conn.commit()


# =============================================================================
# REVIEW QUEUE GENERATION
# =============================================================================

def generate_review_queue(
    issue_type: Optional[str] = None,
    limit: int = 50
) -> List[ReviewQueueItem]:
    """
    Generate prioritized review queue based on detected issues.

    When ``issue_type`` is one of 'quality', 'metadata' or 'bad-title', the
    database query itself is narrowed to candidate documents for that issue,
    so the row limit is spent on relevant documents instead of being applied
    before the filter. Any other value (including None and 'duplicate', which
    has no dedicated handling here) selects documents with any issue and
    applies no filtering.

    Each returned item lists all issues found for the document, not only the
    one that was filtered on.

    Args:
        issue_type: Filter by issue type (quality, metadata, bad-title)
        limit: Maximum items to return

    Returns:
        List of ReviewQueueItem sorted by priority (highest first)
    """
    queue = []

    # SQL pre-filter per issue type. Each is at least as broad as the
    # corresponding Python-side check below, which remains authoritative.
    where_by_issue = {
        'quality': "d.quality_status IN ('poor', 'unusable')",
        'metadata': """d.author_id IS NULL
                   OR d.primary_category IS NULL
                   OR d.primary_category = ''""",
        'bad-title': """d.title IS NULL
                   OR d.title = ''
                   OR d.title ~* '\\.(pdf|docx?|txt|epub|md)$'
                   OR d.title ~* '^[0-9_-]+$'
                   OR d.title ~* '^(doc_|untitled|document)'""",
    }
    any_issue = """d.needs_review = true
                   OR d.quality_status IN ('poor', 'unusable')
                   OR d.author_id IS NULL
                   OR d.primary_category IS NULL
                   OR d.title ~ '\\.(pdf|docx?|txt|epub|md)$'"""
    where_clause = where_by_issue.get(issue_type, any_issue)

    with get_db_connection() as conn:
        with conn.cursor() as cur:
            # Get candidate documents, worst quality first. Twice the limit is
            # fetched because the final ordering is by computed priority.
            cur.execute(f"""
                SELECT
                    d.document_id,
                    d.title,
                    d.quality_score,
                    d.quality_status,
                    d.needs_review,
                    d.author_id,
                    d.primary_category,
                    d.publication_year,
                    d.created_at,
                    a.name as author_name
                FROM documents d
                LEFT JOIN authors a ON d.author_id = a.author_id
                WHERE {where_clause}
                ORDER BY
                    CASE WHEN d.quality_status = 'unusable' THEN 1
                         WHEN d.quality_status = 'poor' THEN 2
                         WHEN d.needs_review = true THEN 3
                         ELSE 4 END,
                    d.created_at DESC
                LIMIT %s
            """, (limit * 2,))

            for row in cur.fetchall():
                doc_id, title, q_score, q_status, needs_review, author_id, \
                    category, year, created_at, _author_name = row

                issues = []
                actions = []
                priority = 5  # Default medium priority

                # Quality issues (highest priority)
                if q_status == 'unusable':
                    issues.append("Quality: UNUSABLE - text extraction failed")
                    actions.append("Re-OCR with different method or mark as do_not_process")
                    priority = 10
                elif q_status == 'poor':
                    issues.append(f"Quality: POOR (score: {q_score})")
                    actions.append("Consider re-OCR or manual text cleanup")
                    priority = max(priority, 8)

                # Manual review flag
                if needs_review:
                    issues.append("Flagged for manual review")
                    priority = max(priority, 7)

                # Metadata issues
                if not author_id:
                    issues.append("Missing author")
                    actions.append("Run: python library_gardener.py --suggest-fixes --field author")
                    priority = max(priority, 6)

                if not category:
                    issues.append("Missing category")
                    actions.append("Run: python library_gardener.py --auto-fix --field category")
                    priority = max(priority, 5)

                bad_title = _is_bad_title(title)
                if bad_title:
                    issues.append("Title appears to be filename")
                    actions.append("Edit title manually or extract from document")
                    priority = max(priority, 6)

                if not year:
                    issues.append("Missing publication year")
                    priority = max(priority, 4)

                # Filter by issue type if specified
                if issue_type:
                    if issue_type == 'quality' and q_status not in ('poor', 'unusable'):
                        continue
                    elif issue_type == 'metadata' and author_id and category:
                        continue
                    elif issue_type == 'bad-title' and not bad_title:
                        continue

                if issues:  # Only add if there are issues
                    queue.append(ReviewQueueItem(
                        document_id=doc_id,
                        title=title,
                        priority=priority,
                        issues=issues,
                        suggested_actions=actions,
                        quality_score=q_score,
                        created_at=created_at.isoformat() if created_at else None
                    ))

    # Sort by priority (descending) and limit
    queue.sort(key=lambda x: x.priority, reverse=True)
    return queue[:limit]


# =============================================================================
# DUPLICATE DETECTION
# =============================================================================

def find_duplicate_documents() -> List[Dict[str, Any]]:
    """
    List groups of documents that share the same content hash.

    Documents without a content hash are ignored. Groups are ordered by size,
    largest first.

    Returns:
        One dict per duplicated hash with the keys ``content_hash``,
        ``count`` and ``documents`` (a list of ``document_id``/``title``
        dicts).
    """
    with get_db_connection() as conn:
        with conn.cursor() as cur:
            cur.execute("""
                SELECT content_hash,
                       array_agg(document_id) as doc_ids,
                       array_agg(title) as titles,
                       COUNT(*) as count
                FROM documents
                WHERE content_hash IS NOT NULL
                GROUP BY content_hash
                HAVING COUNT(*) > 1
                ORDER BY COUNT(*) DESC
            """)
            rows = cur.fetchall()

            # Pair each aggregated id with the title at the same position
            groups = [
                {
                    'content_hash': digest,
                    'count': size,
                    'documents': [
                        {'document_id': doc_id, 'title': doc_title}
                        for doc_id, doc_title in zip(ids, names)
                    ]
                }
                for digest, ids, names, size in rows
            ]

    return groups


# =============================================================================
# FULL GARDENING PASS
# =============================================================================

def run_full_gardening_pass(
    auto_fix: bool = False,
    min_confidence: float = 0.9,
    dry_run: bool = True
) -> Dict[str, Any]:
    """
    Execute every gardening phase in order and collect one combined report.

    Args:
        auto_fix: When True, Phase 4 hands the generated suggestions to
            ``apply_metadata_fixes``; otherwise that phase is left out.
        min_confidence: Confidence floor forwarded to the auto-fix phase.
        dry_run: Forwarded to the auto-fix phase and echoed in the report.

    Returns:
        Dict with ``timestamp``, ``dry_run`` and ``phases``; ``phases`` holds
        one entry per phase that ran, with long lists truncated to their
        leading items.
    """
    logger.info("Starting full library gardening pass...")

    phases: Dict[str, Any] = {}
    report: Dict[str, Any] = {
        'timestamp': datetime.now().isoformat(),
        'dry_run': dry_run,
        'phases': phases,
    }

    # --- Phase 1: overall health snapshot --------------------------------
    logger.info("Phase 1: Analyzing library health...")
    phases['health_analysis'] = asdict(analyze_library_health())

    # --- Phase 2: author name variations (needs a fuzzy-matching lib) ----
    logger.info("Phase 2: Finding author variations...")
    if not FUZZY_AVAILABLE:
        phases['author_variations'] = {
            'error': 'Fuzzy matching not available',
        }
    else:
        author_groups = find_author_variations(threshold=85)
        phases['author_variations'] = {
            'groups_found': len(author_groups),
            # Only the first 10 groups are embedded in the report.
            'variations': [asdict(group) for group in author_groups[:10]],
        }

    # --- Phase 3: metadata fix suggestions --------------------------------
    logger.info("Phase 3: Generating fix suggestions...")
    fix_candidates = suggest_metadata_fixes(limit=100, min_confidence=0.5)
    phases['fix_suggestions'] = {
        'total': len(fix_candidates),
        'auto_fixable': sum(1 for fix in fix_candidates if fix.auto_fixable),
        # Only the first 20 suggestions are embedded in the report.
        'suggestions': [asdict(fix) for fix in fix_candidates[:20]],
    }

    # --- Phase 4: optional application of the suggestions -----------------
    if auto_fix:
        logger.info(f"Phase 4: Applying auto-fixes (min_confidence={min_confidence})...")
        phases['auto_fixes'] = apply_metadata_fixes(
            fix_candidates,
            min_confidence=min_confidence,
            dry_run=dry_run,
        )

    # --- Phase 5: duplicate content ----------------------------------------
    logger.info("Phase 5: Detecting duplicates...")
    duplicate_groups = find_duplicate_documents()
    phases['duplicates'] = {
        'groups_found': len(duplicate_groups),
        'duplicates': duplicate_groups[:10],  # first 10 groups only
    }

    # --- Phase 6: review queue with a per-bucket priority count -----------
    logger.info("Phase 6: Generating review queue...")
    review_items = generate_review_queue(limit=50)
    priorities = [item.priority for item in review_items]
    phases['review_queue'] = {
        'total_items': len(review_items),
        'by_priority': {
            'critical (8-10)': sum(1 for p in priorities if p >= 8),
            'high (6-7)': sum(1 for p in priorities if 6 <= p < 8),
            'medium (4-5)': sum(1 for p in priorities if 4 <= p < 6),
            'low (1-3)': sum(1 for p in priorities if p < 4),
        },
        'items': [asdict(item) for item in review_items[:20]],  # first 20
    }

    logger.info("Gardening pass complete!")
    return report


# =============================================================================
# CLI OUTPUT FORMATTERS
# =============================================================================

def print_health_report(report: LibraryHealthReport, format: str = 'text') -> None:
    """
    Write a library health report to stdout.

    With ``format='json'`` the report is dumped via ``asdict``; any other
    value produces the sectioned text layout (overview, quality
    distribution, issues, recommendations).
    """
    if format == 'json':
        print(json.dumps(asdict(report), indent=2))
        return

    divider = "=" * 70
    print("\n" + divider)
    print("LIBRARY HEALTH REPORT")
    print(divider)
    print(f"Generated: {report.generated_at}")
    print()

    # Overview: headline counters with thousands separators.
    print("📊 OVERVIEW")
    for heading, attr in (
        ("  Total Documents: ", "total_documents"),
        ("  Total Authors:   ", "total_authors"),
        ("  Total Chunks:    ", "total_chunks"),
    ):
        print(f"{heading}{getattr(report, attr):,}")
    print()

    # Quality distribution: count plus share of all documents.
    print("📈 QUALITY DISTRIBUTION")
    # Fall back to 1 so an empty library does not divide by zero.
    denominator = report.total_documents or 1
    for heading, attr in (
        ("  Excellent:   ", "quality_excellent"),
        ("  Good:        ", "quality_good"),
        ("  Fair:        ", "quality_fair"),
        ("  Poor:        ", "quality_poor"),
        ("  Unusable:    ", "quality_unusable"),
        ("  Unassessed:  ", "quality_unassessed"),
    ):
        count = getattr(report, attr)
        print(f"{heading}{count:4} ({count/denominator*100:5.1f}%)")
    print()

    # Issues: raw counters, one per line.
    print("⚠️  ISSUES FOUND")
    for heading, attr in (
        ("  Missing Author:     ", "missing_author"),
        ("  Missing Year:       ", "missing_year"),
        ("  Missing Category:   ", "missing_category"),
        ("  Bad Titles:         ", "missing_title"),
        ("  Needs Review:       ", "needs_review_count"),
        ("  Duplicate Groups:   ", "duplicate_content_hashes"),
        ("  Author Variations:  ", "author_variations_found"),
    ):
        print(f"{heading}{getattr(report, attr)}")
    print()

    # Recommendations: numbered from 1.
    print("💡 RECOMMENDATIONS")
    for number, recommendation in enumerate(report.recommendations, start=1):
        print(f"  {number}. {recommendation}")
    print()


def print_author_variations(variations: List[AuthorVariation], format: str = 'text') -> None:
    """
    Write author-variation groups to stdout.

    With ``format='json'`` every group is dumped via ``asdict``. Otherwise
    each group is listed with its variations, their document counts and
    similarity scores, followed by a ready-to-copy merge command.
    """
    if format == 'json':
        payload = [asdict(group) for group in variations]
        print(json.dumps(payload, indent=2))
        return

    divider = "=" * 70
    print("\n" + divider)
    print("AUTHOR NAME VARIATIONS")
    print(divider)
    print(f"Found {len(variations)} potential merge groups\n")

    for number, group in enumerate(variations, start=1):
        print(f"Group {number}: {group.canonical_name} (recommended)")
        print(f"  Total documents: {group.total_documents}")
        print("  Variations:")
        for variant in group.variations:
            # Names missing from either lookup are shown with 0.
            similarity = group.similarity_scores.get(variant, 0)
            doc_count = group.document_counts.get(variant, 0)
            print(f"    - {variant} ({doc_count} docs, {similarity:.0f}% similar)")

        print("  Merge command:")
        quoted_names = ' '.join([f'"{variant}"' for variant in group.variations])
        print(f'    python library_gardener.py --merge-authors {quoted_names} --into "{group.canonical_name}"')
        print()


def print_suggestions(suggestions: List[MetadataFixSuggestion], format: str = 'text') -> None:
    """
    Print metadata fix suggestions as JSON or as a readable text listing.

    Args:
        suggestions: Suggestions to display.
        format: 'json' dumps every suggestion via ``asdict``; any other
            value prints a text report that lists at most the first 30
            suggestions (the header counts still cover all of them).
    """
    if format == 'json':
        print(json.dumps([asdict(s) for s in suggestions], indent=2))
        return

    print("\n" + "=" * 70)
    print("METADATA FIX SUGGESTIONS")
    print("=" * 70)
    print(f"Found {len(suggestions)} suggestions\n")

    auto_fixable = [s for s in suggestions if s.auto_fixable]
    print(f"Auto-fixable (confidence >= 0.8): {len(auto_fixable)}")
    print()

    for i, s in enumerate(suggestions[:30], 1):
        auto = "✓ AUTO" if s.auto_fixable else "  MANUAL"
        # A document with a missing title may carry None here; slicing None
        # would raise TypeError and abort the whole listing, so fall back to
        # an empty string.
        title = s.current_title or ""
        print(f"{i}. [{auto}] {s.document_id}")
        print(f"   Title: {title[:50]}...")
        print(f"   Field: {s.field}")
        print(f"   Current: {s.current_value}")
        print(f"   Suggested: {s.suggested_value}")
        print(f"   Confidence: {s.confidence:.0%}")
        print(f"   Reason: {s.reason}")
        print()


def print_review_queue(queue: List[ReviewQueueItem], format: str = 'text') -> None:
    """
    Print the review queue as JSON or as a readable text listing.

    Args:
        queue: Review items to display.
        format: 'json' dumps every item via ``asdict``; any other value
            prints a priority breakdown over the whole queue followed by at
            most the first 30 items, each with up to two suggested actions.
    """
    if format == 'json':
        print(json.dumps([asdict(q) for q in queue], indent=2))
        return

    print("\n" + "=" * 70)
    print("REVIEW QUEUE")
    print("=" * 70)
    print(f"Total items: {len(queue)}\n")

    # Priority breakdown: >=8 critical, 6-7 high, 4-5 medium, below 4 low.
    critical = sum(1 for q in queue if q.priority >= 8)
    high = sum(1 for q in queue if 6 <= q.priority < 8)
    medium = sum(1 for q in queue if 4 <= q.priority < 6)
    low = sum(1 for q in queue if q.priority < 4)

    print(f"By Priority: Critical={critical}, High={high}, Medium={medium}, Low={low}\n")

    for i, item in enumerate(queue[:30], 1):
        # Marker uses the same thresholds as the breakdown above.
        if item.priority >= 8:
            priority_label = "🔴"
        elif item.priority >= 6:
            priority_label = "🟠"
        elif item.priority >= 4:
            priority_label = "🟡"
        else:
            priority_label = "🟢"

        # Items queued for a missing title may carry None here; slicing None
        # would raise TypeError and abort the listing, so fall back to an
        # empty string.
        title = item.title or ""

        print(f"{i}. {priority_label} [{item.priority}] {item.document_id}")
        print(f"   Title: {title[:50]}...")
        print("   Issues:")
        for issue in item.issues:
            print(f"     - {issue}")
        if item.suggested_actions:
            print("   Actions:")
            for action in item.suggested_actions[:2]:
                print(f"     → {action}")
        print()


# =============================================================================
# CLI INTERFACE
# =============================================================================

def _build_arg_parser() -> argparse.ArgumentParser:
    """Build the CLI parser: operation flags, undo flags, merge flags, options."""
    parser = argparse.ArgumentParser(
        description="Library Gardener - Automated library maintenance and metadata correction",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  python library_gardener.py --analyze
  python library_gardener.py --dedupe-authors --threshold 85
  python library_gardener.py --suggest-fixes --field category
  python library_gardener.py --auto-fix --min-confidence 0.9 --dry-run
  python library_gardener.py --review-queue --limit 20
  python library_gardener.py --full-pass --format json
        """
    )

    # Main operations
    ops = parser.add_argument_group('Operations')
    ops.add_argument('--analyze', action='store_true',
                    help='Analyze library health and generate report')
    ops.add_argument('--dedupe-authors', action='store_true',
                    help='Find author name variations')
    ops.add_argument('--suggest-fixes', action='store_true',
                    help='Generate metadata fix suggestions')
    ops.add_argument('--auto-fix', action='store_true',
                    help='Apply auto-fixable metadata corrections')
    ops.add_argument('--review-queue', action='store_true',
                    help='Generate prioritized review queue')
    ops.add_argument('--find-duplicates', action='store_true',
                    help='Find duplicate documents')
    ops.add_argument('--full-pass', action='store_true',
                    help='Run complete gardening pass')

    # Undo operations
    undo_group = parser.add_argument_group('Undo/History')
    undo_group.add_argument('--undo', action='store_true',
                           help='Undo the last auto-fix operation')
    undo_group.add_argument('--undo-history', action='store_true',
                           help='Show recent undo history')

    # Author merge
    merge = parser.add_argument_group('Author Merging')
    merge.add_argument('--merge-authors', nargs='+', metavar='NAME',
                      help='Author names to merge')
    merge.add_argument('--into', metavar='NAME',
                      help='Target author name for merge')

    # Options
    opts = parser.add_argument_group('Options')
    opts.add_argument('--threshold', type=int, default=85,
                     help='Similarity threshold for fuzzy matching (0-100, default: 85)')
    opts.add_argument('--min-confidence', type=float, default=0.9,
                     help='Minimum confidence for auto-fixes (0-1, default: 0.9)')
    opts.add_argument('--field', choices=['author', 'category', 'title'],
                     help='Specific field to analyze')
    opts.add_argument('--issue', choices=['quality', 'metadata', 'bad-title', 'duplicate'],
                     help='Filter review queue by issue type')
    opts.add_argument('--limit', type=int, default=50,
                     help='Maximum results to return (default: 50)')
    opts.add_argument('--dry-run', action='store_true',
                     help='Preview changes without applying')
    opts.add_argument('--format', choices=['text', 'json'], default='text',
                     help='Output format (default: text)')

    return parser


def _run_undo(dry_run: bool) -> None:
    """Show the newest undo-log entry and revert it unless ``dry_run`` is set."""
    undo_log = UndoLog(UNDO_LOG_PATH)
    last_entry = undo_log.get_last_entry()
    if not last_entry:
        print("No undo history found.")
        return

    # Materialize once so the preview and the execution use the same statements.
    undo_statements = list(undo_log.get_undo_sql(last_entry))

    print("\n=== UNDO LAST OPERATION ===")
    print(f"Operation: {last_entry.operation}")
    print(f"Table: {last_entry.table}")
    print(f"Affected: {', '.join(last_entry.affected_ids)}")
    print(f"Timestamp: {last_entry.timestamp}")
    print("\nSQL to undo:")
    for sql in undo_statements:
        print(f"  {sql}")

    if dry_run:
        # --dry-run is an opt-in flag; there is no --no-dry-run counterpart.
        print("\n[DRY RUN] Re-run without --dry-run to actually apply undo.")
        return

    with get_db_connection() as conn:
        with conn.cursor() as cur:
            for sql in undo_statements:
                if not sql.startswith('--'):  # Skip comments
                    cur.execute(sql)
            conn.commit()
    # Drop the entry only after the statements were committed.
    undo_log.pop_last()
    print("\nUndo applied successfully.")


def _show_undo_history(output_format: str) -> None:
    """Print up to the 10 entries returned by the undo log, as JSON or text."""
    undo_log = UndoLog(UNDO_LOG_PATH)
    entries = undo_log.get_entries(limit=10)
    if not entries:
        print("No undo history found.")
        return

    if output_format == 'json':
        print(json.dumps([asdict(e) for e in entries], indent=2))
        return

    print(f"\n=== UNDO HISTORY (last {len(entries)} operations) ===\n")
    for i, entry in enumerate(reversed(entries), 1):
        print(f"{i}. [{entry.timestamp}] {entry.operation}")
        print(f"   Table: {entry.table}")
        # Show at most three affected ids, then a "+N more" tail.
        shown = ', '.join(entry.affected_ids[:3])
        hidden = len(entry.affected_ids) - 3
        tail = f" +{hidden} more" if hidden > 0 else ""
        print(f"   Affected: {shown}{tail}")
        if entry.notes:
            print(f"   Notes: {entry.notes[:50]}")
        print()
    print("Run with --undo to revert the most recent change.")


def _print_duplicate_groups(duplicates: List[Dict[str, Any]]) -> None:
    """Print duplicate-content groups in text form."""
    print(f"\nFound {len(duplicates)} duplicate groups:\n")
    for dup in duplicates:
        print(f"Content hash: {dup['content_hash'][:16]}...")
        print(f"  Count: {dup['count']}")
        for doc in dup['documents']:
            # Titles are aggregated without a NULL filter, so None can appear.
            title = doc['title'] or ""
            print(f"    - {doc['document_id']}: {title[:40]}...")
        print()


def _print_full_pass_summary(results: Dict[str, Any]) -> None:
    """Print the per-phase text summary of a full gardening pass report."""
    print("\n" + "=" * 70)
    print("FULL GARDENING PASS COMPLETE")
    print("=" * 70)
    print(f"Timestamp: {results['timestamp']}")
    print(f"Dry Run: {results['dry_run']}")
    print()

    phases = results['phases']

    print("Phase 1 - Health Analysis:")
    health = phases.get('health_analysis', {})
    print(f"  Documents: {health.get('total_documents', 0)}")
    print(f"  Issues: {health.get('missing_author', 0)} missing authors, "
          f"{health.get('missing_category', 0)} missing categories")
    print()

    print("Phase 2 - Author Variations:")
    av = phases.get('author_variations', {})
    print(f"  Groups found: {av.get('groups_found', 0)}")
    if av.get('error'):
        # Make it visible that the phase was skipped rather than empty.
        print(f"  Note: {av['error']}")
    print()

    print("Phase 3 - Fix Suggestions:")
    fs = phases.get('fix_suggestions', {})
    print(f"  Total: {fs.get('total', 0)}")
    print(f"  Auto-fixable: {fs.get('auto_fixable', 0)}")
    print()

    # Phase 4 is only present when the pass ran with auto-fix enabled.
    if 'auto_fixes' in phases:
        print("Phase 4 - Auto Fixes:")
        af = phases['auto_fixes']
        print(f"  Applied: {af.get('applied', 0)}")
        print(f"  Skipped: {af.get('skipped', 0)}")
        print(f"  Errors: {af.get('errors', 0)}")
        print()

    print("Phase 5 - Duplicates:")
    dup = phases.get('duplicates', {})
    print(f"  Groups found: {dup.get('groups_found', 0)}")
    print()

    print("Phase 6 - Review Queue:")
    rq = phases.get('review_queue', {})
    print(f"  Total items: {rq.get('total_items', 0)}")
    bp = rq.get('by_priority', {})
    print(f"  By priority: Critical={bp.get('critical (8-10)', 0)}, "
          f"High={bp.get('high (6-7)', 0)}, Medium={bp.get('medium (4-5)', 0)}, "
          f"Low={bp.get('low (1-3)', 0)}")


def main():
    """
    Command-line entry point: parse arguments and run exactly one operation.

    When no operation flag is given, ``--analyze`` is assumed. If several
    operation flags are given, the first match in this order runs: undo,
    undo-history, analyze, dedupe-authors, merge-authors, suggest-fixes,
    auto-fix, review-queue, find-duplicates, full-pass. The one exception is
    ``--auto-fix`` together with ``--full-pass``: that combination runs the
    full pass with its auto-fix phase enabled.

    Exits with status 1 on Ctrl-C, on a usage error detected here, or on any
    unexpected exception (which is logged and, in JSON mode, also printed as
    ``{"error": ...}``). The undo operations exit with status 0 when done.
    """
    args = _build_arg_parser().parse_args()

    # Default to --analyze if no operation specified
    if not any([args.analyze, args.dedupe_authors, args.suggest_fixes,
                args.auto_fix, args.review_queue, args.find_duplicates,
                args.full_pass, args.merge_authors, args.undo, args.undo_history]):
        args.analyze = True

    try:
        # Undo operations (check first before other operations)
        if args.undo:
            _run_undo(args.dry_run)
            sys.exit(0)

        elif args.undo_history:
            _show_undo_history(args.format)
            sys.exit(0)

        # Health Analysis
        elif args.analyze:
            report = analyze_library_health()
            print_health_report(report, args.format)

        # Author Deduplication
        elif args.dedupe_authors:
            if not FUZZY_AVAILABLE:
                print("Error: Fuzzy matching requires 'rapidfuzz' or 'fuzzywuzzy'")
                print("Install: pip install rapidfuzz")
                sys.exit(1)
            variations = find_author_variations(threshold=args.threshold)
            print_author_variations(variations, args.format)

        # Author Merge
        elif args.merge_authors:
            if not args.into:
                print("Error: --merge-authors requires --into TARGET_NAME")
                sys.exit(1)
            result = merge_authors(args.merge_authors, args.into, dry_run=args.dry_run)
            if args.format == 'json':
                print(json.dumps(result, indent=2))
            elif result.get('error'):
                print(f"Error: {result['error']}")
            else:
                action = "Would merge" if args.dry_run else "Merged"
                print(f"{action} {result['authors_removed']} authors into '{result['target']}'")
                print(f"Documents updated: {result['documents_updated']}")

        # Fix Suggestions
        elif args.suggest_fixes:
            suggestions = suggest_metadata_fixes(
                field=args.field,
                limit=args.limit,
                min_confidence=0.0
            )
            print_suggestions(suggestions, args.format)

        # Auto-Fix (standalone). When --full-pass is also given, fall through
        # to the full-pass branch so its auto-fix phase can actually run;
        # otherwise this branch would always shadow it.
        elif args.auto_fix and not args.full_pass:
            suggestions = suggest_metadata_fixes(
                field=args.field,
                limit=args.limit,
                min_confidence=args.min_confidence
            )
            result = apply_metadata_fixes(
                suggestions,
                min_confidence=args.min_confidence,
                dry_run=args.dry_run
            )
            if args.format == 'json':
                print(json.dumps(result, indent=2))
            else:
                action = "Would apply" if args.dry_run else "Applied"
                print(f"\n{action} {result['applied']} fixes")
                print(f"Skipped: {result['skipped']}")
                print(f"Errors: {result['errors']}")
                if result['details']:
                    print("\nDetails:")
                    for d in result['details'][:10]:
                        print(f"  {d['document_id']}: {d['field']} = {d['new_value']}")

        # Review Queue
        elif args.review_queue:
            queue = generate_review_queue(issue_type=args.issue, limit=args.limit)
            print_review_queue(queue, args.format)

        # Find Duplicates
        elif args.find_duplicates:
            duplicates = find_duplicate_documents()
            if args.format == 'json':
                print(json.dumps(duplicates, indent=2))
            else:
                _print_duplicate_groups(duplicates)

        # Full Pass
        elif args.full_pass:
            results = run_full_gardening_pass(
                auto_fix=args.auto_fix,
                min_confidence=args.min_confidence,
                dry_run=args.dry_run
            )
            if args.format == 'json':
                print(json.dumps(results, indent=2))
            else:
                _print_full_pass_summary(results)

    except KeyboardInterrupt:
        print("\nCancelled.")
        sys.exit(1)
    except Exception as e:
        # Top-level boundary: log, mirror the error as JSON when requested,
        # and signal failure through the exit status.
        logger.error(f"Error: {e}")
        if args.format == 'json':
            print(json.dumps({'error': str(e)}))
        sys.exit(1)


# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == '__main__':
    main()
