#!/usr/bin/env python3
"""
Auto-Taxonomist Module for intelligent document classification.

This module provides automatic classification, topic detection, and concept
extraction for documents during ingestion. It supports three intelligence tiers:

1. Cloud (OpenAI): Full LLM-powered classification
2. Local (Ollama): Local LLM classification (free & private)
3. Statistical: TF-IDF/RAKE keyword extraction (fully offline)

The system automatically selects the best available method.

Usage:
    from taxonomist import Taxonomist

    taxonomist = Taxonomist()
    classification = taxonomist.classify_document(text)
    taxonomist.sync_to_database(classification, document_id)
"""

import json
import re
import logging
from typing import Dict, List, Any, Optional, Tuple
from pathlib import Path
from collections import Counter

from config import (
    OPENAI_API_KEY, OPENAI_ENABLED, PATHS,
    INTELLIGENCE_MODE, LOCAL_LLM_ENDPOINT, LOCAL_LLM_MODEL,
    STATISTICAL_CONFIG, CLOUD_MODELS
)
from db_utils import execute_query, get_db_connection

# Try to import OpenAI (also works for Ollama OpenAI-compatible API)
try:
    from openai import OpenAI
    HAS_OPENAI = True
except ImportError:
    HAS_OPENAI = False

# Try to import statistical NLP libraries
try:
    from multi_rake import Rake
    HAS_RAKE = True
except ImportError:
    HAS_RAKE = False

try:
    from sklearn.feature_extraction.text import TfidfVectorizer
    HAS_SKLEARN = True
except ImportError:
    HAS_SKLEARN = False

# Try YAKE as alternative keyword extractor
try:
    import yake
    HAS_YAKE = True
except ImportError:
    HAS_YAKE = False

logger = logging.getLogger(__name__)


class Taxonomist:
    """
    Intelligent document classification using LLM analysis with rule-based fallback.

    The Taxonomist analyzes document text to:
    - Determine primary and secondary categories
    - Extract specific topics and concepts
    - Identify content type and difficulty level
    - Suggest logical folder organization

    When OpenAI API is available, it uses LLM for nuanced classification.
    Otherwise, it falls back to keyword-based heuristics.
    """

    # Closed set of primary categories. _validate_category() maps LLM output
    # onto this list (exact match first, then the first partial match in list
    # order), and _score_categories_from_keywords() seeds one score per entry.
    CATEGORIES = [
        'Philosophy',
        'Religion/Spirituality',
        'Education',
        'Science',
        'Agriculture',
        'History',
        'Psychology',
        'Arts',
        'Technical',
        'Biography',
        'Medicine/Health',
        'Social Science',
        'Reference',
        'Literature'
    ]

    # Allowed content types. _validate_content_type() returns 'other' for any
    # value that does not normalize to one of these.
    CONTENT_TYPES = [
        'lecture',
        'essay',
        'book_chapter',
        'academic_paper',
        'manual',
        'letter',
        'interview',
        'treatise',
        'commentary',
        'other'
    ]

    # Allowed difficulty levels. _validate_difficulty() returns 'intermediate'
    # for any value outside this list.
    DIFFICULTY_LEVELS = [
        'introductory',
        'intermediate',
        'advanced',
        'expert'
    ]

    # Classification prompt for LLM.
    # Rendered in _llm_classify() via str.format(text_sample=...): {text_sample}
    # is the only substitution field, and the doubled braces ({{ }}) are escapes
    # that produce the literal JSON braces. The category, content-type and
    # difficulty lists spelled out below repeat CATEGORIES, CONTENT_TYPES and
    # DIFFICULTY_LEVELS by hand, so they must be kept in sync manually.
    CLASSIFICATION_PROMPT = """You are a research librarian specializing in diverse topics including philosophy, spirituality, education, agriculture, science, and technical subjects.

Analyze this text excerpt and provide classification metadata.

TEXT EXCERPT:
{text_sample}

Respond ONLY with valid JSON (no markdown, no explanation):
{{
    "primary_category": "One of: Philosophy, Religion/Spirituality, Education, Science, Agriculture, History, Psychology, Arts, Technical, Biography, Medicine/Health, Social Science, Reference, Literature",
    "secondary_categories": ["Up to 2 additional categories from the same list"],
    "specific_topics": ["3-5 specific topic tags, e.g., 'Waldorf Education', 'Soil Health', 'Consciousness Studies'"],
    "key_concepts": ["3-7 core concepts discussed in the text"],
    "detected_author": "If author name is identifiable from the text content, otherwise null",
    "time_period": "If discernible (e.g., 'Early 20th Century', '1920s'), otherwise null",
    "content_type": "One of: lecture, essay, book_chapter, academic_paper, manual, letter, interview, treatise, commentary, other",
    "difficulty_level": "One of: introductory, intermediate, advanced, expert",
    "language_detected": "ISO 639-1 code (e.g., 'en', 'de', 'fr')",
    "summary": "One sentence summary of the document's main topic",
    "confidence": 0.0-1.0
}}"""

    # Keyword patterns for fallback classification.
    # Maps a category name to lowercase terms that count as evidence for it;
    # consumed by _score_categories_from_keywords() and _fallback_classify().
    # NOTE: 'Technical', 'Social Science', 'Reference' and 'Literature' have no
    # entry here, so keyword scoring never awards them any points.
    # NOTE: some terms appear under two categories ('cognition' in Philosophy
    # and Psychology, 'therapy' in Psychology and Medicine/Health) and therefore
    # score for both.
    CATEGORY_KEYWORDS = {
        'Philosophy': [
            'philosophy', 'metaphysics', 'epistemology', 'ontology', 'ethics',
            'consciousness', 'thinking', 'cognition', 'phenomenology', 'logic',
            'being', 'existence', 'truth', 'knowledge', 'wisdom', 'soul', 'spirit'
        ],
        'Religion/Spirituality': [
            'spiritual', 'esoteric', 'occult', 'mystical', 'divine', 'sacred',
            'meditation', 'prayer', 'enlightenment', 'initiation', 'theosophy',
            'anthroposophy', 'karma', 'reincarnation', 'christ', 'buddha'
        ],
        'Education': [
            'education', 'teaching', 'learning', 'pedagogy', 'curriculum',
            'waldorf', 'steiner school', 'child development', 'classroom',
            'student', 'teacher', 'instruction', 'training', 'school'
        ],
        'Science': [
            'science', 'scientific', 'experiment', 'hypothesis', 'theory',
            'physics', 'chemistry', 'biology', 'mathematics', 'research',
            'empirical', 'observation', 'natural science', 'laboratory'
        ],
        'Agriculture': [
            'agriculture', 'farming', 'biodynamic', 'soil', 'crop',
            'harvest', 'planting', 'compost', 'organic', 'garden',
            'livestock', 'animal husbandry', 'permaculture'
        ],
        'History': [
            'history', 'historical', 'century', 'ancient', 'medieval',
            'civilization', 'empire', 'war', 'revolution', 'dynasty',
            'archaeology', 'chronicle', 'era', 'period', 'epoch'
        ],
        'Psychology': [
            'psychology', 'psyche', 'mind', 'behavior', 'emotion',
            'cognition', 'personality', 'therapy', 'mental', 'unconscious',
            'freud', 'jung', 'developmental', 'temperament'
        ],
        'Arts': [
            'art', 'artistic', 'painting', 'sculpture', 'music',
            'architecture', 'aesthetic', 'creative', 'beauty', 'form',
            'color', 'composition', 'eurythmy', 'drama', 'poetry'
        ],
        'Medicine/Health': [
            'medicine', 'health', 'healing', 'therapy', 'disease',
            'treatment', 'diagnosis', 'patient', 'clinical', 'medical',
            'anthroposophic medicine', 'holistic', 'wellness'
        ],
        'Biography': [
            'biography', 'life', 'born', 'died', 'childhood',
            'memoirs', 'autobiography', 'personal history', 'career'
        ]
    }

    # Author detection patterns.
    # Maps a canonical lowercase author name to the lowercase aliases that
    # _detect_author() looks for; the canonical name is title-cased on return.
    # Authors are tried in the order listed and the first hit wins.
    KNOWN_AUTHORS = {
        'rudolf steiner': ['steiner', 'dr. steiner', 'rudolf steiner'],
        'johann wolfgang von goethe': ['goethe', 'johann goethe'],
        'carl jung': ['jung', 'carl jung', 'c.g. jung'],
        'maria montessori': ['montessori', 'maria montessori'],
        'owen barfield': ['barfield', 'owen barfield'],
    }

    def __init__(self, mode: str = None):
        """
        Initialize the Taxonomist.

        Builds the LLM client for the requested mode when possible. If the
        requested mode cannot be honoured (OpenAI disabled, ``openai`` package
        missing, client construction fails, or the mode name is not one of
        'cloud' / 'local' / 'statistical'), the instance drops to
        'statistical' so that ``self.mode`` always describes what will
        actually run.

        Args:
            mode: Intelligence mode override ('cloud', 'local', 'statistical').
                None means "use INTELLIGENCE_MODE from config".
        """
        self.mode = mode or INTELLIGENCE_MODE
        self.client = None
        self.local_client = None
        # Always defined so _llm_classify() can read it even when no LLM
        # backend was configured.
        self.model = None

        # Initialize based on mode
        if self.mode == 'cloud' and OPENAI_ENABLED and HAS_OPENAI:
            try:
                self.client = OpenAI(api_key=OPENAI_API_KEY)
                self.model = CLOUD_MODELS['classification']
                logger.info(f"Taxonomist initialized in CLOUD mode (model: {self.model})")
            except Exception as e:
                logger.warning(f"Failed to initialize OpenAI client: {e}")
                # Do not leave a half-configured client behind.
                self.client = None
                self.model = None
                self.mode = 'statistical'

        elif self.mode == 'local' and HAS_OPENAI:
            try:
                self.local_client = OpenAI(
                    base_url=LOCAL_LLM_ENDPOINT,
                    api_key="not-needed"  # Ollama doesn't require API key
                )
                self.model = LOCAL_LLM_MODEL
                logger.info(f"Taxonomist initialized in LOCAL mode (model: {self.model})")
            except Exception as e:
                logger.warning(f"Failed to initialize local LLM client: {e}")
                self.local_client = None
                self.model = None
                self.mode = 'statistical'

        elif self.mode != 'statistical':
            # Requested mode is unknown or its prerequisites are missing;
            # previously this left self.mode pointing at a backend with no
            # client and logged nothing.
            logger.warning(
                f"Intelligence mode '{self.mode}' is unavailable; "
                "falling back to STATISTICAL mode"
            )
            self.mode = 'statistical'

        if self.mode == 'statistical':
            logger.info("Taxonomist initialized in STATISTICAL mode (offline)")

        # Always initialize statistical extractors as fallback
        self._init_statistical_extractors()

        # Cache for database lookups (filled lazily by _load_caches)
        self._topic_cache = None
        self._concept_cache = None
        self._category_cache = None

    def _init_statistical_extractors(self):
        """
        Initialize statistical NLP extractors.

        Sets ``self.rake``, ``self.yake_extractor`` and ``self.tfidf`` (each
        None when unavailable). The extractor named by
        STATISTICAL_CONFIG['keyword_method'] is built first; if it is not
        installed or fails to construct, the other keyword extractor is built
        instead so a real extractor is still available before the plain
        word-frequency fallback. An unrecognised method name builds neither.
        """
        self.rake = None
        self.yake_extractor = None
        self.tfidf = None

        method = STATISTICAL_CONFIG.get('keyword_method', 'rake')

        def build_rake():
            # Returns a RAKE extractor, or None if unavailable/broken.
            if not HAS_RAKE:
                return None
            try:
                extractor = Rake()
            except Exception as e:
                logger.warning(f"Failed to initialize RAKE: {e}")
                return None
            logger.debug("RAKE keyword extractor initialized")
            return extractor

        def build_yake():
            # Returns a YAKE extractor, or None if unavailable/broken.
            if not HAS_YAKE:
                return None
            try:
                extractor = yake.KeywordExtractor(
                    lan="en",
                    n=2,  # Max ngram size
                    dedupLim=0.9,
                    top=STATISTICAL_CONFIG.get('max_keywords', 10)
                )
            except Exception as e:
                logger.warning(f"Failed to initialize YAKE: {e}")
                return None
            logger.debug("YAKE keyword extractor initialized")
            return extractor

        if method == 'rake':
            self.rake = build_rake()
            if self.rake is None:
                self.yake_extractor = build_yake()
        elif method == 'yake':
            self.yake_extractor = build_yake()
            if self.yake_extractor is None:
                self.rake = build_rake()

        if HAS_SKLEARN:
            try:
                self.tfidf = TfidfVectorizer(
                    max_features=STATISTICAL_CONFIG.get('tfidf_max_features', 5000),
                    ngram_range=STATISTICAL_CONFIG.get('tfidf_ngram_range', (1, 2)),
                    stop_words='english'
                )
                logger.debug("TF-IDF vectorizer initialized")
            except Exception as e:
                logger.warning(f"Failed to initialize TF-IDF: {e}")

    def classify_document(
        self,
        text: str,
        max_tokens: int = 2000,
        existing_metadata: Dict[str, Any] = None
    ) -> Dict[str, Any]:
        """
        Classify document using the configured intelligence mode.

        Modes:
        - cloud: OpenAI LLM classification
        - local: Local LLM (Ollama) classification
        - statistical: TF-IDF/RAKE keyword extraction

        Args:
            text: Full document text
            max_tokens: Maximum tokens to send for classification (uses ~4 chars per token)
            existing_metadata: Any metadata already extracted (from filename, etc.)

        Returns:
            Classification metadata dict with keys:
            - primary_category
            - secondary_categories
            - specific_topics
            - key_concepts
            - content_type
            - difficulty_level
            - detected_author
            - confidence
            - classification_source
        """
        existing_metadata = existing_metadata or {}

        # Use first ~2000 tokens (abstract/intro typically most informative)
        text_sample = text[:max_tokens * 4]

        classification = None

        # Route to appropriate classification method based on mode
        if self.mode == 'cloud' and self.client:
            classification = self._llm_classify(text_sample, client=self.client)
        elif self.mode == 'local' and self.local_client:
            classification = self._llm_classify(text_sample, client=self.local_client)

        # Fall back to statistical if LLM failed or not available
        if classification is None:
            if self.mode == 'statistical':
                classification = self._statistical_classify(text)
            else:
                # LLM failed, use statistical fallback
                classification = self._statistical_classify(text)

        # Merge with existing metadata (existing takes precedence for author, year)
        classification = self._merge_metadata(classification, existing_metadata)
        return classification

    def _llm_classify(self, text_sample: str, client=None) -> Optional[Dict[str, Any]]:
        """
        Classify using LLM (cloud or local).

        Args:
            text_sample: Text excerpt to analyze
            client: OpenAI client (or Ollama-compatible client)

        Returns:
            Classification dict or None if failed
        """
        if client is None:
            client = self.client or self.local_client

        if client is None:
            return None

        try:
            response = client.chat.completions.create(
                model=self.model,
                messages=[
                    {
                        "role": "system",
                        "content": "You are a precise research librarian. Always respond with valid JSON only, no markdown formatting."
                    },
                    {
                        "role": "user",
                        "content": self.CLASSIFICATION_PROMPT.format(text_sample=text_sample)
                    }
                ],
                temperature=0.3,  # Lower temperature for consistency
                max_tokens=600
            )

            content = response.choices[0].message.content.strip()

            # Clean up potential markdown formatting
            if content.startswith('```'):
                content = re.sub(r'^```(?:json)?\n?', '', content)
                content = re.sub(r'\n?```$', '', content)

            result = json.loads(content)

            # Validate and normalize
            result['primary_category'] = self._validate_category(result.get('primary_category'))
            result['content_type'] = self._validate_content_type(result.get('content_type'))
            result['difficulty_level'] = self._validate_difficulty(result.get('difficulty_level'))
            result['classification_source'] = f'llm_{self.mode}'

            # Ensure lists
            result['secondary_categories'] = result.get('secondary_categories') or []
            result['specific_topics'] = result.get('specific_topics') or []
            result['key_concepts'] = result.get('key_concepts') or []

            logger.info(f"LLM classification ({self.mode}): {result['primary_category']} (confidence: {result.get('confidence', 'N/A')})")
            return result

        except json.JSONDecodeError as e:
            logger.warning(f"Failed to parse LLM response as JSON: {e}")
            return None
        except Exception as e:
            logger.warning(f"LLM classification failed: {e}")
            return None

    def _statistical_classify(self, text: str) -> Dict[str, Any]:
        """
        Classify using statistical NLP methods (fully offline).

        Uses RAKE/YAKE for keyword extraction and TF-IDF for topic weighting.
        Infers category from keyword-to-category mapping.

        Args:
            text: Full document text

        Returns:
            Classification dict
        """
        # Extract keywords using available method
        keywords = self._extract_keywords(text)

        # Infer category from keywords
        category_scores = self._score_categories_from_keywords(keywords, text)

        # Sort categories by score
        sorted_categories = sorted(category_scores.items(), key=lambda x: x[1], reverse=True)
        primary_category = sorted_categories[0][0] if sorted_categories else 'Reference'
        secondary_categories = [cat for cat, _ in sorted_categories[1:3] if _ > 0]

        # Get top keywords as topics (filter to meaningful ones)
        specific_topics = [kw for kw, score in keywords[:7] if len(kw) > 3]

        # Extract potential concepts (longer phrases)
        key_concepts = [kw for kw, score in keywords if len(kw.split()) >= 2][:5]

        # Detect author and content type using rules
        detected_author = self._detect_author(text.lower())
        content_type = self._detect_content_type(text.lower())
        difficulty = self._estimate_difficulty(text)

        result = {
            'primary_category': primary_category,
            'secondary_categories': secondary_categories,
            'specific_topics': specific_topics[:5],
            'key_concepts': key_concepts,
            'detected_author': detected_author,
            'content_type': content_type,
            'difficulty_level': difficulty,
            'time_period': None,
            'language_detected': 'en',
            'summary': None,
            'confidence': 0.5,  # Medium confidence for statistical
            'classification_source': 'statistical'
        }

        logger.info(f"Statistical classification: {primary_category} (keywords: {len(keywords)})")
        return result

    def _extract_keywords(self, text: str) -> List[Tuple[str, float]]:
        """
        Extract keywords using available statistical method.

        Returns list of (keyword, score) tuples sorted by score.
        """
        # Limit text for performance
        text_sample = text[:20000]

        keywords = []

        # Try RAKE first
        if self.rake:
            try:
                rake_result = self.rake.apply(text_sample)
                # RAKE returns (keyword, score) tuples
                keywords = [(kw, score) for kw, score in rake_result[:30]]
                logger.debug(f"RAKE extracted {len(keywords)} keywords")
            except Exception as e:
                logger.warning(f"RAKE extraction failed: {e}")

        # Try YAKE if RAKE not available or failed
        if not keywords and self.yake_extractor:
            try:
                yake_result = self.yake_extractor.extract_keywords(text_sample)
                # YAKE returns (keyword, score) - lower score is better, so invert
                keywords = [(kw, 1.0 / (score + 0.01)) for kw, score in yake_result]
                logger.debug(f"YAKE extracted {len(keywords)} keywords")
            except Exception as e:
                logger.warning(f"YAKE extraction failed: {e}")

        # Fallback to simple word frequency
        if not keywords:
            keywords = self._simple_keyword_extraction(text_sample)

        # Filter by minimum score
        min_score = STATISTICAL_CONFIG.get('min_keyword_score', 0.3)
        if keywords and keywords[0][1] > 0:
            max_score = keywords[0][1]
            keywords = [(kw, score) for kw, score in keywords if score >= max_score * min_score]

        return keywords[:STATISTICAL_CONFIG.get('max_keywords', 10)]

    def _simple_keyword_extraction(self, text: str) -> List[Tuple[str, float]]:
        """Simple word frequency-based keyword extraction as ultimate fallback."""
        # Tokenize and clean
        words = re.findall(r'\b[a-zA-Z]{3,}\b', text.lower())

        # Remove common stop words
        stop_words = {
            'the', 'and', 'but', 'for', 'with', 'from', 'this', 'that', 'which',
            'have', 'has', 'had', 'are', 'was', 'were', 'been', 'being', 'will',
            'would', 'could', 'should', 'may', 'might', 'must', 'can', 'not',
            'all', 'any', 'some', 'such', 'than', 'then', 'when', 'where', 'what',
            'who', 'how', 'why', 'also', 'only', 'very', 'just', 'more', 'most',
            'other', 'into', 'over', 'after', 'before', 'between', 'under', 'through'
        }

        words = [w for w in words if w not in stop_words]

        # Count frequencies
        word_counts = Counter(words)
        total = sum(word_counts.values())

        # Return as (word, frequency_ratio) tuples
        return [(word, count / total) for word, count in word_counts.most_common(30)]

    def _score_categories_from_keywords(
        self,
        keywords: List[Tuple[str, float]],
        text: str
    ) -> Dict[str, float]:
        """Score categories based on extracted keywords and text patterns."""
        scores = {cat: 0.0 for cat in self.CATEGORIES}

        # Get all keyword strings
        keyword_strings = [kw.lower() for kw, _ in keywords]
        text_lower = text.lower()

        # Score based on keyword matches to category keywords
        for category, cat_keywords in self.CATEGORY_KEYWORDS.items():
            for cat_kw in cat_keywords:
                # Check if category keyword appears in extracted keywords
                for kw in keyword_strings:
                    if cat_kw in kw or kw in cat_kw:
                        scores[category] += 2.0

                # Check if category keyword appears in text
                if cat_kw in text_lower:
                    scores[category] += 1.0

        return scores

    def _fallback_classify(self, text: str) -> Dict[str, Any]:
        """
        Rule-based fallback classification using keyword matching.

        Args:
            text: Full document text

        Returns:
            Classification dict
        """
        text_lower = text.lower()

        # Score each category by keyword matches
        category_scores = {}
        for category, keywords in self.CATEGORY_KEYWORDS.items():
            score = sum(1 for kw in keywords if kw in text_lower)
            if score > 0:
                category_scores[category] = score

        # Sort by score
        sorted_categories = sorted(category_scores.items(), key=lambda x: x[1], reverse=True)

        # Get primary and secondary categories
        primary_category = sorted_categories[0][0] if sorted_categories else 'Reference'
        secondary_categories = [cat for cat, _ in sorted_categories[1:3]]

        # Try to detect author
        detected_author = self._detect_author(text_lower)

        # Simple content type detection
        content_type = self._detect_content_type(text_lower)

        # Estimate difficulty (simple heuristic based on vocabulary)
        difficulty = self._estimate_difficulty(text)

        # Extract potential topics from frequently occurring capitalized phrases
        topics = self._extract_topics_fallback(text)

        # Extract potential concepts
        concepts = self._extract_concepts_fallback(text)

        result = {
            'primary_category': primary_category,
            'secondary_categories': secondary_categories,
            'specific_topics': topics[:5],
            'key_concepts': concepts[:7],
            'detected_author': detected_author,
            'content_type': content_type,
            'difficulty_level': difficulty,
            'time_period': None,
            'language_detected': 'en',  # Default assumption
            'summary': None,
            'confidence': 0.4,  # Lower confidence for fallback
            'classification_source': 'fallback'
        }

        logger.info(f"Fallback classification: {primary_category}")
        return result

    def _detect_author(self, text_lower: str) -> Optional[str]:
        """Detect author from known author patterns."""
        for author_name, patterns in self.KNOWN_AUTHORS.items():
            for pattern in patterns:
                if pattern in text_lower:
                    return author_name.title()
        return None

    def _detect_content_type(self, text_lower: str) -> str:
        """Detect content type from text patterns."""
        if 'lecture' in text_lower or 'given on' in text_lower:
            return 'lecture'
        if 'dear ' in text_lower and ('yours' in text_lower or 'sincerely' in text_lower):
            return 'letter'
        if 'chapter' in text_lower[:500]:
            return 'book_chapter'
        if 'abstract' in text_lower[:200] or 'introduction' in text_lower[:500]:
            return 'academic_paper'
        return 'essay'

    def _estimate_difficulty(self, text: str) -> str:
        """Estimate difficulty level based on vocabulary complexity."""
        words = text.split()
        if not words:
            return 'intermediate'

        # Average word length as proxy for complexity
        avg_word_length = sum(len(w) for w in words[:1000]) / min(len(words), 1000)

        # Count complex vocabulary indicators
        complex_words = [
            'epistemological', 'phenomenological', 'ontological', 'metaphysical',
            'transcendental', 'dialectical', 'hermeneutic', 'supersensible'
        ]
        text_lower = text.lower()
        complex_count = sum(1 for w in complex_words if w in text_lower)

        if avg_word_length > 6 and complex_count >= 3:
            return 'expert'
        elif avg_word_length > 5.5 or complex_count >= 2:
            return 'advanced'
        elif avg_word_length > 5:
            return 'intermediate'
        else:
            return 'introductory'

    def _extract_topics_fallback(self, text: str) -> List[str]:
        """Extract potential topics from capitalized phrases."""
        # Find capitalized phrases (potential proper nouns/topics)
        pattern = r'\b([A-Z][a-z]+(?:\s+[A-Z][a-z]+){0,3})\b'
        matches = re.findall(pattern, text[:5000])

        # Count occurrences
        topic_counts = {}
        for match in matches:
            # Filter out common words and short matches
            if len(match) > 3 and match.lower() not in ['the', 'this', 'that', 'these', 'those']:
                topic_counts[match] = topic_counts.get(match, 0) + 1

        # Return most frequent
        sorted_topics = sorted(topic_counts.items(), key=lambda x: x[1], reverse=True)
        return [topic for topic, _ in sorted_topics[:10]]

    def _extract_concepts_fallback(self, text: str) -> List[str]:
        """Extract potential concepts from text."""
        # Look for definition-like patterns
        text_lower = text.lower()

        concepts = []

        # Pattern: "X is the/a ..." or "the X of ..."
        definition_patterns = [
            r'the (\w+(?:\s+\w+)?)\s+(?:is|are|means|refers)',
            r'(\w+(?:\s+\w+)?)\s+(?:is defined as|can be understood as)',
        ]

        for pattern in definition_patterns:
            matches = re.findall(pattern, text_lower[:10000])
            concepts.extend(matches)

        # Remove duplicates and common words
        stop_words = {'the', 'a', 'an', 'this', 'that', 'it', 'which', 'who', 'what'}
        concepts = [c.strip() for c in concepts if c.strip().lower() not in stop_words]
        concepts = list(dict.fromkeys(concepts))  # Remove duplicates preserving order

        return concepts[:10]

    def _validate_category(self, category: str) -> str:
        """Validate and normalize category."""
        if not category:
            return 'Reference'

        # Try exact match
        for valid in self.CATEGORIES:
            if category.lower() == valid.lower():
                return valid

        # Try partial match
        for valid in self.CATEGORIES:
            if category.lower() in valid.lower() or valid.lower() in category.lower():
                return valid

        return 'Reference'

    def _validate_content_type(self, content_type: str) -> str:
        """Validate and normalize content type."""
        if not content_type:
            return 'other'

        content_type_lower = content_type.lower().replace(' ', '_')
        if content_type_lower in self.CONTENT_TYPES:
            return content_type_lower

        return 'other'

    def _validate_difficulty(self, difficulty: str) -> str:
        """Validate and normalize difficulty level."""
        if not difficulty:
            return 'intermediate'

        difficulty_lower = difficulty.lower()
        if difficulty_lower in self.DIFFICULTY_LEVELS:
            return difficulty_lower

        return 'intermediate'

    def _merge_metadata(
        self,
        classification: Dict[str, Any],
        existing: Dict[str, Any]
    ) -> Dict[str, Any]:
        """
        Merge classification with existing metadata.

        Existing metadata (from filename, etc.) takes precedence for
        author, year, and title. Classification takes precedence for
        categories, topics, and concepts.
        """
        # Fields where existing takes precedence
        if existing.get('author'):
            classification['detected_author'] = existing['author']
        if existing.get('year'):
            classification['publication_year'] = existing['year']
        if existing.get('title'):
            classification['title'] = existing['title']
        if existing.get('language'):
            classification['language_detected'] = existing['language']

        return classification

    def sync_to_database(
        self,
        classification: Dict[str, Any],
        document_id: str,
        auto_create: bool = True
    ) -> Dict[str, int]:
        """
        Persist a classification and link its topics, concepts and category.

        First writes the classification fields onto the document row, then
        resolves every topic and concept name to an id (creating missing ones
        when auto_create is True) and links each to the document, and finally
        links the primary category. Names that are empty or shorter than two
        characters are skipped.

        Args:
            classification: Output from classify_document()
            document_id: Document to link
            auto_create: Whether to create new topics/concepts automatically

        Returns:
            Counter dict with keys topics_linked, concepts_linked,
            topics_created and concepts_created. This method only increments
            the two *_linked counters.
        """
        stats = dict.fromkeys(
            ('topics_linked', 'concepts_linked', 'topics_created', 'concepts_created'),
            0
        )

        # Document row first
        self._update_document_classification(document_id, classification)

        # Topics
        for topic_name in classification.get('specific_topics', []):
            if topic_name and len(topic_name) >= 2:
                topic_id = self._get_or_create_topic(topic_name, auto_create)
                if topic_id:
                    self._link_document_topic(
                        document_id, topic_id,
                        relevance=classification.get('confidence', 0.5)
                    )
                    stats['topics_linked'] += 1

        # Concepts
        for concept_name in classification.get('key_concepts', []):
            if concept_name and len(concept_name) >= 2:
                concept_id = self._get_or_create_concept(
                    concept_name,
                    category=classification.get('primary_category', 'General'),
                    auto_create=auto_create
                )
                if concept_id:
                    self._link_document_concept(document_id, concept_id)
                    stats['concepts_linked'] += 1

        # Category
        primary = classification.get('primary_category')
        if primary:
            self._link_document_category(document_id, primary)

        logger.info(f"Synced classification for {document_id}: {stats}")
        return stats

    def _update_document_classification(
        self,
        document_id: str,
        classification: Dict[str, Any]
    ) -> None:
        """
        Update document record with classification fields.

        Writes category, content type, difficulty, classification source and
        confidence to the ``documents`` row and sets ``needs_review`` when the
        confidence is missing, not numeric, or below 0.6.

        Args:
            document_id: Value matched against documents.document_id
            classification: Output from classify_document()
        """
        query = """
            UPDATE documents SET
                primary_category = %s,
                content_type = %s,
                difficulty_level = %s,
                metadata_source = %s,
                classification_confidence = %s,
                needs_review = %s,
                updated_at = CURRENT_TIMESTAMP
            WHERE document_id = %s
        """

        # 'confidence' can be present but null or a string when it comes from
        # an LLM reply; comparing that directly with 0.6 raises TypeError.
        try:
            confidence = float(classification.get('confidence'))
        except (TypeError, ValueError):
            confidence = None

        # Written as "not >= 0.6" so that NaN is also flagged for review.
        needs_review = not (confidence is not None and confidence >= 0.6)

        params = (
            classification.get('primary_category'),
            classification.get('content_type'),
            classification.get('difficulty_level'),
            classification.get('classification_source', 'unknown'),
            confidence,
            needs_review,
            document_id
        )

        with get_db_connection() as conn:
            with conn.cursor() as cur:
                cur.execute(query, params)

    def _load_caches(self) -> None:
        """Load topic, concept, and category caches from database."""
        if self._topic_cache is None:
            topics = execute_query(
                "SELECT topic_id, LOWER(name) as name FROM topics",
                fetch='all'
            ) or []
            self._topic_cache = {t['name']: t['topic_id'] for t in topics}

        if self._concept_cache is None:
            concepts = execute_query(
                "SELECT concept_id, LOWER(name) as name FROM concepts",
                fetch='all'
            ) or []
            self._concept_cache = {c['name']: c['concept_id'] for c in concepts}

        if self._category_cache is None:
            categories = execute_query(
                "SELECT category_id, LOWER(name) as name FROM categories",
                fetch='all'
            ) or []
            self._category_cache = {c['name']: c['category_id'] for c in categories}

    def _get_or_create_topic(self, name: str, auto_create: bool = True) -> Optional[int]:
        """Get existing topic or create new one."""
        self._load_caches()

        normalized = name.lower().strip()
        if normalized in self._topic_cache:
            return self._topic_cache[normalized]

        if not auto_create:
            return None

        # Create new topic (flagged as auto-created for review)
        try:
            result = execute_query(
                """INSERT INTO topics (name, description, auto_created)
                   VALUES (%s, %s, TRUE)
                   ON CONFLICT (name) DO UPDATE SET name = EXCLUDED.name
                   RETURNING topic_id""",
                (name, f'Auto-created from document classification'),
                fetch='one'
            )

            if result:
                topic_id = result['topic_id']
                self._topic_cache[normalized] = topic_id
                logger.debug(f"Created new topic: {name}")
                return topic_id
        except Exception as e:
            logger.warning(f"Failed to create topic '{name}': {e}")

        return None

    def _get_or_create_concept(
        self,
        name: str,
        category: str,
        auto_create: bool = True
    ) -> Optional[int]:
        """Get existing concept or create new one."""
        self._load_caches()

        normalized = name.lower().strip()
        if normalized in self._concept_cache:
            return self._concept_cache[normalized]

        if not auto_create:
            return None

        # Create new concept (flagged as auto-created for review)
        try:
            result = execute_query(
                """INSERT INTO concepts (name, category, auto_created)
                   VALUES (%s, %s, TRUE)
                   ON CONFLICT (name) DO UPDATE SET name = EXCLUDED.name
                   RETURNING concept_id""",
                (name, category),
                fetch='one'
            )

            if result:
                concept_id = result['concept_id']
                self._concept_cache[normalized] = concept_id
                logger.debug(f"Created new concept: {name}")
                return concept_id
        except Exception as e:
            logger.warning(f"Failed to create concept '{name}': {e}")

        return None

    def _get_or_create_category(self, name: str) -> Optional[int]:
        """Get existing category or return None (don't auto-create categories)."""
        self._load_caches()

        normalized = name.lower().strip()
        if normalized in self._category_cache:
            return self._category_cache[normalized]

        # Try partial match
        for cached_name, cat_id in self._category_cache.items():
            if normalized in cached_name or cached_name in normalized:
                return cat_id

        return None

    def _link_document_topic(
        self,
        doc_id: str,
        topic_id: int,
        relevance: float = 0.5
    ) -> None:
        """
        Record that a document covers a topic.

        Upserts a ``document_topics`` row. If the (document, topic) pair is
        already linked, the stored relevance_score becomes the larger of the
        existing and the new value. Errors raised by the query are logged as
        warnings and not re-raised.

        Args:
            doc_id: Document identifier.
            topic_id: Topic identifier.
            relevance: Relevance score to store for the link.
        """
        upsert_sql = """INSERT INTO document_topics (document_id, topic_id, relevance_score)
                   VALUES (%s, %s, %s)
                   ON CONFLICT (document_id, topic_id) DO UPDATE
                   SET relevance_score = GREATEST(document_topics.relevance_score, EXCLUDED.relevance_score)"""
        row_values = (doc_id, topic_id, relevance)

        try:
            execute_query(upsert_sql, row_values, fetch='none')
        except Exception as e:
            logger.warning(f"Failed to link document {doc_id} to topic {topic_id}: {e}")

    def _link_document_concept(self, doc_id: str, concept_id: int) -> None:
        """
        Record that a document mentions a concept.

        Inserts a ``document_concepts`` row with mention_count 1; if the
        (document, concept) pair is already linked, its mention_count is
        incremented by one instead. Errors raised by the query are logged as
        warnings and not re-raised.

        Args:
            doc_id: Document identifier.
            concept_id: Concept identifier.
        """
        upsert_sql = """INSERT INTO document_concepts (document_id, concept_id, mention_count)
                   VALUES (%s, %s, 1)
                   ON CONFLICT (document_id, concept_id)
                   DO UPDATE SET mention_count = document_concepts.mention_count + 1"""
        row_values = (doc_id, concept_id)

        try:
            execute_query(upsert_sql, row_values, fetch='none')
        except Exception as e:
            logger.warning(f"Failed to link document {doc_id} to concept {concept_id}: {e}")

    def _link_document_category(self, doc_id: str, category_name: str) -> None:
        """Link document to category."""
        category_id = self._get_or_create_category(category_name)
        if not category_id:
            return

        try:
            execute_query(
                """INSERT INTO document_categories (document_id, category_id)
                   VALUES (%s, %s)
                   ON CONFLICT DO NOTHING""",
                (doc_id, category_id),
                fetch='none'
            )
        except Exception as e:
            logger.warning(f"Failed to link document {doc_id} to category {category_name}: {e}")

    def suggest_folder_path(
        self,
        classification: Dict[str, Any],
        author: Optional[str] = None
    ) -> str:
        """
        Suggest logical folder organization based on classification.

        Instead of: ORGANIZED/PDF/
        Returns:    Philosophy/Rudolf_Steiner/

        Args:
            classification: Output from classify_document(). Reads
                'primary_category' and, when ``author`` is not given,
                'detected_author'.
            author: Author name (overrides detected_author)

        Returns:
            Suggested folder path relative to ORGANIZED directory:
            "<category>/<author>" when a usable author name is available,
            otherwise just "<category>". A missing or empty category becomes
            "Uncategorized".
        """
        # `or` also covers a key that is present but None/empty, which the
        # .get() default alone would not.
        category = classification.get('primary_category') or 'Uncategorized'
        category = category.replace('/', '_')  # Handle "Religion/Spirituality"

        author_name = author or classification.get('detected_author')
        if not author_name:
            return category

        # Clean author name for folder: drop punctuation, then collapse
        # whitespace runs into single underscores.
        author_folder = re.sub(r'[^\w\s-]', '', author_name)
        author_folder = re.sub(r'\s+', '_', author_folder.strip())

        # Nothing usable left (e.g. "???" or only spaces): fall back to the
        # category alone rather than returning "Category/".
        if not author_folder:
            return category

        return f"{category}/{author_folder}"

    def invalidate_caches(self) -> None:
        """Clear all caches (call after bulk database changes)."""
        self._topic_cache = None
        self._concept_cache = None
        self._category_cache = None


# =============================================================================
# CLI Interface
# =============================================================================

def main() -> int:
    """
    Command-line interface for testing the Taxonomist.

    Reads the file named on the command line, classifies its text and prints
    the classification together with the suggested folder.

    Returns:
        Process exit code: 0 after printing a classification, 1 when the
        given path is not an existing regular file.
    """
    import argparse
    import sys

    parser = argparse.ArgumentParser(description='Test document classification')
    parser.add_argument('file', type=Path, help='Document file to classify')
    parser.add_argument('--no-llm', action='store_true', help='Force fallback mode')

    args = parser.parse_args()

    # is_file() rather than exists(): a directory exists but cannot be read
    # as text and would crash in read_text() below.
    if not args.file.is_file():
        print(f"Error: File not found: {args.file}", file=sys.stderr)
        return 1

    # Read file (undecodable bytes are dropped rather than raising)
    text = args.file.read_text(encoding='utf-8', errors='ignore')

    # Create taxonomist
    taxonomist = Taxonomist()
    if args.no_llm:
        # NOTE(review): presumably clearing the client makes classify_document
        # take its non-LLM path -- confirm against Taxonomist.
        taxonomist.client = None

    # Classify
    print(f"\nClassifying: {args.file.name}")
    print("=" * 60)

    classification = taxonomist.classify_document(text)

    # Print results
    print(f"Primary Category:    {classification.get('primary_category')}")
    print(f"Secondary:           {classification.get('secondary_categories')}")
    print(f"Topics:              {classification.get('specific_topics')}")
    print(f"Key Concepts:        {classification.get('key_concepts')}")
    print(f"Content Type:        {classification.get('content_type')}")
    print(f"Difficulty:          {classification.get('difficulty_level')}")
    print(f"Detected Author:     {classification.get('detected_author')}")
    print(f"Confidence:          {classification.get('confidence')}")
    print(f"Source:              {classification.get('classification_source')}")
    print(f"\nSuggested Folder:    {taxonomist.suggest_folder_path(classification)}")
    print("=" * 60)

    return 0


if __name__ == '__main__':
    # Script entry point: run the CLI and hand its return value to the
    # interpreter as the process exit status.
    import sys

    exit_status = main()
    sys.exit(exit_status)
