#!/usr/bin/env python3
"""
Concept Extraction Pipeline for Research Development Framework.

This script extracts concepts and topics from documents:
1. Analyze document content for key concepts
2. Map concepts to document and chunk levels
3. Enable concept-based discovery and browsing
4. Support glossary-based concept boosting

Usage:
    python extract_concepts.py                       # Process all documents
    python extract_concepts.py --document DOC_001    # Process specific document
    python extract_concepts.py --add-concept "Name"  # Add new concept to taxonomy
    python extract_concepts.py --glossary glossary.txt  # Use weighted glossary
    python extract_concepts.py --import-glossary glossary.txt  # Import glossary terms
"""

import re
import logging
from pathlib import Path
from typing import List, Dict, Any, Set, Tuple, Optional
from collections import Counter
import argparse

from config import PROCESSING_CONFIG, LOGGING_CONFIG
from db_utils import (
    get_db_connection,
    execute_query,
    update_document_status
)

# Optional fuzzy matching for active entity linking.
# rapidfuzz is a soft dependency: if the import fails, FUZZY_AVAILABLE is set
# to False instead of raising, and `fuzz` / `fuzz_process` stay undefined.
# Code that uses them must therefore check FUZZY_AVAILABLE first.
try:
    from rapidfuzz import fuzz, process as fuzz_process
    FUZZY_AVAILABLE = True
except ImportError:
    FUZZY_AVAILABLE = False

# Setup logging
# The level is looked up by name on the logging module (LOGGING_CONFIG['level']
# must therefore be an attribute name such as "INFO"), and the record format
# comes from LOGGING_CONFIG['format'].
logging.basicConfig(
    level=getattr(logging, LOGGING_CONFIG['level']),
    format=LOGGING_CONFIG['format']
)
# Module-level logger used by every function and class in this file.
logger = logging.getLogger(__name__)


class ActiveEntityLinker:
    """
    Real-time concept deduplication during ingestion.

    Detects when new entities are aliases of existing concepts using, in the
    order applied by find_match():
    1. Exact lookup of the lowercased text in the concept cache
    2. Abbreviation expansion ("R. Steiner" → "Rudolf Steiner")
    3. Unambiguous last-name lookup ("Steiner" → "Rudolf Steiner")
    4. Fuzzy string matching (configurable threshold, requires rapidfuzz)

    This prevents taxonomy drift by catching duplicates at ingestion time
    rather than relying on periodic cleanup.

    The concept cache maps a lowercase term (a primary name or an alias) to a
    dict with 'concept_id', 'name' and 'category' keys.
    """

    # Common name abbreviation patterns
    # NOTE: no method of this class reads NAME_PATTERNS; the matchers below use
    # their own compiled patterns. It is kept unchanged because it is a public
    # class attribute.
    NAME_PATTERNS = [
        (r'^([A-Z])\.\s*(.+)$', r'\1[a-z]+ \2'),  # "R. Steiner" → "R* Steiner"
        (r'^Dr\.?\s+(.+)$', r'\1'),                # "Dr. Smith" → "Smith"
        (r'^Prof\.?\s+(.+)$', r'\1'),              # "Prof. Jones" → "Jones"
        (r'^Saint\s+(.+)$', r'St\.? \1'),          # "Saint Augustine" → "St Augustine"
        (r'^St\.?\s+(.+)$', r'Saint \1'),          # "St Augustine" → "Saint Augustine"
    ]

    # Concept categories / entity types that are treated as person names
    PERSON_CATEGORIES = ('People', 'Person', 'Author')

    # "r. steiner"  → (initial, rest of name)
    _INITIAL_LAST_RE = re.compile(r'^([a-z])\.\s*(.+)$')
    # "john f. kennedy" → (first name, middle initial, last name)
    _FIRST_INITIAL_LAST_RE = re.compile(r'^([a-z]+)\s+([a-z])\.\s+(.+)$')

    def __init__(
        self,
        fuzzy_threshold: float = 85.0,
        enable_fuzzy: bool = True,
        concept_cache: Optional[Dict[str, Dict]] = None
    ):
        """
        Initialize the entity linker.

        Args:
            fuzzy_threshold: Minimum fuzzy match score (0-100) for auto-linking
            enable_fuzzy: Enable fuzzy matching (requires rapidfuzz)
            concept_cache: Pre-loaded concept cache from ConceptExtractor.
                The dict is used as-is (not copied), so aliases added by this
                linker are visible to whoever owns the dict.
        """
        self.fuzzy_threshold = fuzzy_threshold
        self.enable_fuzzy = enable_fuzzy and FUZZY_AVAILABLE
        # `is None` rather than `or {}`: an empty dict handed in by the caller
        # must stay the *same* object, otherwise the sharing described above
        # silently breaks whenever the taxonomy starts out empty.
        self.concept_cache = {} if concept_cache is None else concept_cache

        # Build reverse index for efficient matching
        self._build_search_index()

        # Track linking decisions for audit
        self.linking_log: List[Dict[str, Any]] = []

        if enable_fuzzy and not FUZZY_AVAILABLE:
            logger.warning("Fuzzy matching requested but rapidfuzz not installed. "
                          "Install with: pip install rapidfuzz")

    def _build_search_index(self):
        """Build indices for efficient entity matching from the concept cache."""
        # All known names (for fuzzy search and abbreviation scans)
        self.all_names = list(self.concept_cache.keys())

        # word -> cache keys containing that word (partial matches)
        self.word_index = {}
        # last word -> cache keys of person entries ("steiner" → "rudolf steiner")
        self.last_name_index = {}

        for name, info in self.concept_cache.items():
            self._index_name(name, info)

    def _index_name(self, name: str, info: Dict) -> None:
        """Register one cache key in the word index and the last-name index."""
        words = name.lower().split()

        for word in words:
            if len(word) > 2:  # Skip very short words
                self.word_index.setdefault(word, []).append(name)

        # Only multi-word person entries contribute a last name.
        if info.get('category') in self.PERSON_CATEGORIES and len(words) >= 2:
            self.last_name_index.setdefault(words[-1], []).append(name)

    def _build_match(
        self,
        name: str,
        match_type: str,
        confidence: float,
        original: str
    ) -> Dict:
        """Assemble the match dict for the cache entry stored under `name`."""
        entry = self.concept_cache[name]
        return {
            'concept_id': entry['concept_id'],
            'matched_name': entry['name'],
            'match_type': match_type,
            'confidence': confidence,
            'original': original
        }

    def find_match(
        self,
        entity_text: str,
        entity_type: str = None,
        min_confidence: float = 0.7
    ) -> Optional[Dict]:
        """
        Find if an entity matches an existing concept.

        Args:
            entity_text: The entity text to match
            entity_type: Optional entity type hint (Person, Concept, etc.)
            min_confidence: Minimum confidence for returning a match

        Returns:
            Dict with match info or None:
            {
                'concept_id': int,
                'matched_name': str,
                'match_type': str,  # 'exact', 'fuzzy', 'abbreviation', 'last_name'
                'confidence': float,
                'original': str  # input as given for 'exact', lowercased otherwise
            }
            Fuzzy matches additionally carry 'fuzzy_score' (0-100).
            None is also returned for empty, whitespace-only or None input.
        """
        entity_lower = entity_text.lower().strip() if entity_text else ''
        if not entity_lower:
            return None

        # 1. Exact match
        if self.concept_cache.get(entity_lower):
            return self._build_match(entity_lower, 'exact', 1.0, entity_text)

        # 2. Abbreviation expansion (R. Steiner → Rudolf Steiner)
        abbrev_match = self._match_abbreviation(entity_lower)
        if abbrev_match and abbrev_match['confidence'] >= min_confidence:
            return abbrev_match

        # 3. Last name match for people (Steiner → Rudolf Steiner)
        if entity_type is None or entity_type in self.PERSON_CATEGORIES:
            last_name_match = self._match_last_name(entity_lower)
            if last_name_match and last_name_match['confidence'] >= min_confidence:
                return last_name_match

        # 4. Fuzzy matching (if enabled)
        if self.enable_fuzzy and self.all_names:
            fuzzy_match = self._match_fuzzy(entity_lower)
            if fuzzy_match and fuzzy_match['confidence'] >= min_confidence:
                return fuzzy_match

        return None

    def _match_abbreviation(self, entity_lower: str) -> Optional[Dict]:
        """
        Match abbreviated names to full names.

        Comparison is done word by word on lowercased names, so a short first
        or last name cannot match merely because it is a prefix/suffix of a
        longer, unrelated word ("smith" vs "blacksmith").
        """
        # "Initial. LastName" → "FirstName LastName"
        initial_match = self._INITIAL_LAST_RE.match(entity_lower)
        if initial_match:
            initial, rest = initial_match.groups()
            rest_words = rest.split()
            for name in self.all_names:
                words = name.lower().split()
                if (len(words) >= 2 and
                        words[0].startswith(initial) and
                        words[1:] == rest_words):
                    return self._build_match(name, 'abbreviation', 0.9, entity_lower)

        # "FirstName Initial. LastName" → "FirstName MiddleName [...] LastName"
        full_match = self._FIRST_INITIAL_LAST_RE.match(entity_lower)
        if full_match:
            first, middle_init, last = full_match.groups()
            last_words = last.split()
            for name in self.all_names:
                words = name.lower().split()
                # Needs room for first name + at least one middle name + last name.
                if (len(words) >= 2 + len(last_words) and
                        words[0] == first and
                        words[1].startswith(middle_init) and
                        words[-len(last_words):] == last_words):
                    return self._build_match(name, 'abbreviation', 0.85, entity_lower)

        return None

    def _match_last_name(self, entity_lower: str) -> Optional[Dict]:
        """
        Match a bare last name to a full name (for people).

        A match is returned only when every indexed name ending in that word
        belongs to the same concept. A concept and its own aliases therefore
        do not count as ambiguous; two different people sharing a last name do.
        """
        # Only match if it's a single word that could be a last name
        if ' ' in entity_lower:
            return None

        candidates = self.last_name_index.get(entity_lower, [])
        if not candidates:
            return None

        concept_ids = {self.concept_cache[name]['concept_id'] for name in candidates}
        if len(concept_ids) > 1:
            # Multiple people with same last name - don't auto-link
            logger.debug(f"Ambiguous last name '{entity_lower}': {candidates}")
            return None

        return self._build_match(candidates[0], 'last_name', 0.8, entity_lower)

    def _match_fuzzy(self, entity_lower: str) -> Optional[Dict]:
        """Use fuzzy string matching to find similar concepts."""
        # Without rapidfuzz the names `fuzz` / `fuzz_process` do not exist.
        if not FUZZY_AVAILABLE or not self.all_names:
            return None

        # Best candidate at or above the configured threshold, or None
        result = fuzz_process.extractOne(
            entity_lower,
            self.all_names,
            scorer=fuzz.WRatio,
            score_cutoff=self.fuzzy_threshold
        )
        if not result:
            return None

        matched_name, score, _ = result
        # Convert the 0-100 score to a 0-1 confidence
        match = self._build_match(matched_name, 'fuzzy', score / 100.0, entity_lower)
        match['fuzzy_score'] = score
        return match

    def link_entity(
        self,
        entity_text: str,
        entity_type: str = None,
        auto_add_alias: bool = True,
        source_chunk_id: str = None
    ) -> Tuple[Optional[int], str]:
        """
        Link an entity to an existing concept or flag as new.

        Args:
            entity_text: The entity text to link
            entity_type: Optional entity type hint
            auto_add_alias: If True, automatically add as alias when linked
                through anything other than an exact match
            source_chunk_id: Chunk ID where entity was found (for logging)

        Returns:
            Tuple of (concept_id or None, action_taken)
            action_taken: 'linked' when a match was found, otherwise 'new'
        """
        match = self.find_match(entity_text, entity_type)
        if not match:
            return None, 'new'

        # Log the linking decision
        self.linking_log.append({
            'entity': entity_text,
            'linked_to': match['matched_name'],
            'match_type': match['match_type'],
            'confidence': match['confidence'],
            'chunk_id': source_chunk_id
        })

        # Optionally add as an alias
        if auto_add_alias and match['match_type'] != 'exact':
            self._add_alias(match['concept_id'], entity_text)

        logger.debug(f"Linked '{entity_text}' → '{match['matched_name']}' "
                    f"({match['match_type']}, conf={match['confidence']:.2f})")

        return match['concept_id'], 'linked'

    def _add_alias(self, concept_id: int, alias: str):
        """
        Add an alias to a concept in the database and mirror it locally.

        Best effort: any database error is logged as a warning and the local
        cache is left untouched. The alias is stored stripped of surrounding
        whitespace so that it matches the stripped keys used for lookups.
        """
        alias_clean = alias.strip() if alias else ''
        if not alias_clean:
            return
        alias_lower = alias_clean.lower()

        try:
            with get_db_connection() as conn:
                with conn.cursor() as cur:
                    # Get current aliases
                    cur.execute(
                        "SELECT aliases FROM concepts WHERE concept_id = %s",
                        (concept_id,)
                    )
                    row = cur.fetchone()
                    if row is None:
                        # Nothing to update; avoid reporting a phantom success.
                        logger.warning(f"Failed to add alias: concept {concept_id} not found")
                        return

                    current_aliases = list(row[0] or [])
                    known = {a.lower().strip() for a in current_aliases if a}

                    # Add new alias if not already present
                    if alias_lower not in known:
                        current_aliases.append(alias_clean)
                        cur.execute(
                            "UPDATE concepts SET aliases = %s WHERE concept_id = %s",
                            (current_aliases, concept_id)
                        )
                        logger.info(f"Added alias '{alias_clean}' to concept {concept_id}")
        except Exception as e:
            logger.warning(f"Failed to add alias: {e}")
            return

        # Update local cache so the next occurrence is an exact match.
        source = next(
            (info for info in self.concept_cache.values()
             if info.get('concept_id') == concept_id),
            None
        )
        if source is None:
            # Unknown concept locally: never store an empty placeholder entry,
            # it would make later exact matches fail with KeyError.
            return

        if alias_lower not in self.concept_cache:
            self.concept_cache[alias_lower] = source.copy()
        if alias_lower not in self.all_names:
            self.all_names.append(alias_lower)
            self._index_name(alias_lower, self.concept_cache[alias_lower])

    def get_linking_report(self) -> Dict:
        """
        Get a summary of linking decisions.

        Returns:
            Dict with 'total' (number of links), 'by_type' (count per match
            type) and 'log' (the list of logged decisions). The same three
            keys are present when nothing has been linked yet.
        """
        by_type = dict(Counter(entry['match_type'] for entry in self.linking_log))

        return {
            'total': len(self.linking_log),
            'by_type': by_type,
            'log': self.linking_log
        }


def load_glossary(glossary_path: str) -> Dict[str, float]:
    """
    Load a glossary file for concept weighting.

    Format:
        # Comment lines start with #
        Term:Weight
        Term  (default weight 1.5)

    Example:
        Anthroposophy:3.0
        Etheric Body:2.5
        Freedom

    A line is split on its LAST colon. If the text after that colon is not a
    number, or is not a positive finite number (zero, negative, nan, inf), a
    warning is logged and the default weight 1.5 is used for the text before
    the colon. When a term appears more than once, the last line wins.

    Args:
        glossary_path: Path to the glossary text file (UTF-8, with or
            without a byte-order mark).

    Returns:
        Dict mapping lowercase terms to their weights. Empty if the file does
        not exist or cannot be read/decoded.
    """
    import math  # function-scope import: only this loader needs it

    default_weight = 1.5
    glossary = {}
    path = Path(glossary_path)

    if not path.exists():
        logger.warning(f"Glossary file not found: {glossary_path}")
        return glossary

    try:
        # 'utf-8-sig' reads plain UTF-8 too, but drops a leading BOM. With
        # plain 'utf-8' the BOM stays glued to the first line, so a leading
        # "# comment" would not be recognised and would be loaded as a term.
        with open(path, 'r', encoding='utf-8-sig') as f:
            for line_num, line in enumerate(f, 1):
                line = line.strip()

                # Skip comments and empty lines
                if not line or line.startswith('#'):
                    continue

                # Split only on last colon (term might contain colons)
                term, colon, raw_weight = line.rpartition(':')
                if not colon:
                    # Bare "Term" line
                    term = line
                    weight = default_weight
                else:
                    term = term.strip()
                    try:
                        weight = float(raw_weight.strip())
                    except ValueError:
                        logger.warning(f"Line {line_num}: Invalid weight '{raw_weight}', using 1.5")
                        weight = default_weight
                    else:
                        # float() happily parses "nan" and "inf"; either would
                        # poison every score it is multiplied into.
                        if not math.isfinite(weight) or weight <= 0:
                            logger.warning(
                                f"Line {line_num}: Weight must be a positive finite number, using 1.5"
                            )
                            weight = default_weight

                if term:
                    glossary[term.lower()] = weight
    except (OSError, UnicodeError) as e:
        # Unreadable or undecodable file: report it and fall back to "no glossary".
        logger.error(f"Error loading glossary: {e}")
        return {}

    logger.info(f"Loaded {len(glossary)} glossary terms from {glossary_path}")
    return glossary


def import_glossary_to_db(glossary_path: str, category: str = None) -> Tuple[int, int]:
    """
    Insert glossary terms that are missing from the concepts table.

    Each term returned by load_glossary() is looked up by case-insensitive
    name. Terms that are not found are inserted with every word capitalized;
    terms that are found are only counted, no row is modified. Glossary
    weights are not written to the database.

    Args:
        glossary_path: Path to glossary file
        category: Optional category stored with every inserted term

    Returns:
        Tuple of (number of concepts inserted, number that already existed)
    """
    terms = load_glossary(glossary_path)
    if not terms:
        return (0, 0)

    inserted, already_present = 0, 0

    with get_db_connection() as conn:
        with conn.cursor() as cur:
            # Keys are the lowercase terms; the weights are not needed here.
            for term in terms:
                cur.execute(
                    "SELECT concept_id FROM concepts WHERE LOWER(name) = %s",
                    (term,)
                )

                if not cur.fetchone():
                    # New concept: "etheric body" is stored as "Etheric Body"
                    display_name = ' '.join(map(str.capitalize, term.split()))
                    cur.execute(
                        """
                        INSERT INTO concepts (name, category)
                        VALUES (%s, %s)
                        """,
                        (display_name, category)
                    )
                    inserted += 1
                else:
                    # Already in the taxonomy: counted, left untouched
                    already_present += 1

    logger.info(f"Imported glossary: {inserted} added, {already_present} already existed")
    return (inserted, already_present)


class ConceptExtractor:
    """Handles concept extraction from document text."""

    def __init__(
        self,
        glossary_path: str = None,
        use_ner: bool = False,
        ner_extractor: str = 'gliner',
        ner_entity_types: List[str] = None,
        extract_relations: bool = False,
        relation_backend: str = 'openai',
        openai_model: str = 'gpt-4o-mini',
        active_linking: bool = False,
        fuzzy_threshold: float = 85.0
    ):
        """
        Set up the extractor: load the taxonomy, then the optional add-ons.

        The concept taxonomy is always loaded from the database. A glossary,
        the active entity linker and an NER backend are each set up only when
        the corresponding argument asks for them.

        Args:
            glossary_path: Optional path to glossary file for weighted boosting
            use_ner: Enable NER-based entity extraction (GLiNER/LLaMA/OpenAI)
            ner_extractor: NER backend ('gliner', 'llama', 'openai', 'hybrid')
            ner_entity_types: Entity types for NER extraction
            extract_relations: Extract S-P-O triples (requires LLaMA/OpenAI)
            relation_backend: For hybrid mode, which backend for relations ('openai', 'llama')
            openai_model: OpenAI model to use ('gpt-4o', 'gpt-4o-mini')
            active_linking: Enable real-time entity deduplication during ingestion
            fuzzy_threshold: Minimum fuzzy match score for active linking (0-100)
        """
        # Run counters, all starting at zero
        self.stats = dict.fromkeys(
            (
                'documents_processed',
                'concepts_found',
                'chunk_mappings',
                'glossary_boosts',
                'ner_entities_found',
                'ner_new_concepts',
                'relations_extracted',
                'entities_linked',
                'aliases_added',
                'errors',
            ),
            0
        )

        # Feature switches as requested by the caller
        self.use_ner = use_ner
        self.extract_relations = extract_relations
        self.active_linking = active_linking

        # Collaborators and lookup tables, filled in below when enabled
        self.entity_extractor = None
        self.entity_linker = None
        self.glossary_weights = {}
        self.concept_cache = {}

        self._load_concepts()

        # Glossary weights are optional
        if glossary_path:
            self.glossary_weights = load_glossary(glossary_path)
            logger.info(f"Using glossary with {len(self.glossary_weights)} weighted terms")

        # The linker receives this extractor's own cache dict
        if active_linking:
            self.entity_linker = ActiveEntityLinker(
                fuzzy_threshold=fuzzy_threshold,
                enable_fuzzy=True,
                concept_cache=self.concept_cache
            )
            logger.info(f"Active Entity Linking enabled (threshold: {fuzzy_threshold})")

        # NER backend; this call may switch self.use_ner back off
        if use_ner:
            self._init_ner_extractor(
                ner_extractor,
                ner_entity_types,
                relation_backend=relation_backend,
                openai_model=openai_model
            )

    def _init_ner_extractor(
        self,
        extractor_type: str,
        entity_types: List[str] = None,
        relation_backend: str = 'openai',
        openai_model: str = 'gpt-4o-mini'
    ):
        """Initialize NER entity extractor."""
        try:
            from entity_extractors import get_entity_extractor, check_extractor_availability

            availability = check_extractor_availability()

            # Check if requested extractor is available
            ext_info = availability.get(extractor_type, {})
            if isinstance(ext_info, dict):
                is_available = ext_info.get('available', False)
            else:
                is_available = ext_info

            if not is_available:
                logger.warning(f"NER extractor '{extractor_type}' not available")
                gliner_info = availability.get('gliner', {})
                if isinstance(gliner_info, dict) and gliner_info.get('available'):
                    logger.info("Falling back to GLiNER")
                    extractor_type = 'gliner'
                elif gliner_info:
                    extractor_type = 'gliner'
                else:
                    logger.warning("No NER extractors available. Install with: pip install gliner")
                    self.use_ner = False
                    return

            # Build kwargs based on extractor type
            kwargs = {'entity_types': entity_types}

            if extractor_type == 'hybrid':
                kwargs['relation_extractor'] = relation_backend
                kwargs['openai_model'] = openai_model
            elif extractor_type == 'openai':
                kwargs['model'] = openai_model

            self.entity_extractor = get_entity_extractor(extractor_type, **kwargs)

            if self.entity_extractor:
                self.entity_extractor.initialize()
                logger.info(f"Initialized {extractor_type} entity extractor")
                if extractor_type == 'hybrid':
                    logger.info(f"  Relation backend: {relation_backend}")
            else:
                self.use_ner = False
                logger.warning("Failed to initialize NER extractor")

        except ImportError as e:
            logger.warning(f"Could not load entity extractors: {e}")
            self.use_ner = False

    def _load_concepts(self):
        """
        Load concept taxonomy from database into self.concept_cache.

        The cache maps a lowercase, whitespace-stripped term to a dict with
        'concept_id', 'name' and 'category'. Every concept is reachable by its
        primary name and by each of its aliases.

        Precedence on key collisions: a primary name always wins over an
        alias, independent of row order; between two aliases the first one
        loaded wins.
        """
        concepts = execute_query(
            "SELECT concept_id, name, category, aliases FROM concepts",
            fetch='all'
        ) or []  # tolerate a helper that yields None for an empty table

        def _entry(row):
            # A fresh dict per key, so editing one cache entry never changes another.
            return {
                'concept_id': row['concept_id'],
                'name': row['name'],
                'category': row['category']
            }

        cache = {}

        # Pass 1: primary names
        for row in concepts:
            # Keys are stripped because lookups elsewhere in this class strip
            # the text before consulting the cache.
            cache[row['name'].lower().strip()] = _entry(row)

        # Pass 2: aliases, which never displace an existing key
        for row in concepts:
            for alias in row.get('aliases') or []:
                if not alias:
                    continue  # skip NULL / empty array elements
                key = alias.lower().strip()
                if key:
                    cache.setdefault(key, _entry(row))

        self.concept_cache = cache

        logger.info(f"Loaded {len(concepts)} concepts with {len(self.concept_cache)} terms")

    def find_concepts_in_text(self, text: str) -> Dict[int, float]:
        """
        Find all concept mentions in text with optional glossary weighting.

        Returns dict of concept_id -> weighted_count

        When a glossary is loaded, mention counts are multiplied by the
        glossary weight for matching terms. This allows domain-specific
        concepts to have higher relevance scores.
        """
        text_lower = text.lower()
        concept_counts = Counter()

        for term, concept_info in self.concept_cache.items():
            # Use word boundaries for matching
            pattern = r'\b' + re.escape(term) + r'\b'
            matches = re.findall(pattern, text_lower)

            if matches:
                raw_count = len(matches)

                # Apply glossary weighting if available
                if self.glossary_weights and term in self.glossary_weights:
                    weight = self.glossary_weights[term]
                    weighted_count = raw_count * weight
                    self.stats['glossary_boosts'] += 1
                    logger.debug(f"Boosting '{term}': {raw_count} -> {weighted_count:.1f} (x{weight})")
                else:
                    weighted_count = float(raw_count)

                concept_counts[concept_info['concept_id']] += weighted_count

        return dict(concept_counts)

    def extract_with_ner(self, text: str, chunk_id: str = None) -> Tuple[Dict[int, float], List[Dict]]:
        """
        Extract entities using NER and map to concepts.

        When active_linking is enabled, uses fuzzy matching to deduplicate
        entities against existing concepts in real-time.

        Returns:
            Tuple of (concept_counts, new_entities)
            - concept_counts: Dict mapping concept_id to count (for existing concepts)
            - new_entities: List of new entities to potentially add to taxonomy
        """
        if not self.use_ner or not self.entity_extractor:
            return {}, []

        concept_counts = Counter()
        new_entities = []
        relations = []

        try:
            # Run NER extraction
            result = self.entity_extractor.extract(
                text,
                extract_relations=self.extract_relations
            )

            for entity in result.entities:
                entity_text_lower = entity.text.lower().strip()
                self.stats['ner_entities_found'] += 1

                # Check if entity matches existing concept (exact match)
                if entity_text_lower in self.concept_cache:
                    concept_id = self.concept_cache[entity_text_lower]['concept_id']
                    # Weight by confidence score
                    concept_counts[concept_id] += entity.score

                # Try active entity linking (fuzzy/abbreviation matching)
                elif self.active_linking and self.entity_linker:
                    linked_id, action = self.entity_linker.link_entity(
                        entity.text,
                        entity_type=entity.label,
                        auto_add_alias=True,
                        source_chunk_id=chunk_id
                    )

                    if linked_id:
                        # Successfully linked to existing concept
                        concept_counts[linked_id] += entity.score
                        self.stats['entities_linked'] += 1
                    else:
                        # New entity - track for potential addition
                        new_entities.append({
                            'text': entity.text,
                            'label': entity.label,
                            'score': entity.score,
                            'chunk_id': chunk_id,
                            'normalized': entity.normalized
                        })
                else:
                    # No active linking - track as new entity
                    new_entities.append({
                        'text': entity.text,
                        'label': entity.label,
                        'score': entity.score,
                        'chunk_id': chunk_id,
                        'normalized': entity.normalized
                    })

            # Handle extracted relations (if LLaMA is used)
            if result.triples:
                for triple in result.triples:
                    self.stats['relations_extracted'] += 1
                    relations.append({
                        'subject': triple.subject,
                        'predicate': triple.predicate,
                        'object': triple.object,
                        'confidence': triple.confidence,
                        'chunk_id': chunk_id
                    })

                # Store relations if we have the table
                if relations:
                    self._store_relations(relations)

        except Exception as e:
            logger.warning(f"NER extraction failed: {e}")

        return dict(concept_counts), new_entities

    def _store_relations(self, relations: List[Dict]) -> None:
        """
        Persist relations whose subject and object are both known concepts.

        Does nothing when the concept_relationships table is absent. Each
        relation's subject and object are looked up in the concept cache by
        lowercased, stripped text; relations with an unknown endpoint are
        dropped. On a conflicting (source, target, type) row the higher
        confidence is kept. Failures are logged as a warning, never raised.
        """
        def lookup_id(label):
            # concept_id for a subject/object label, or None if not cached
            key = label.lower().strip()
            if key in self.concept_cache:
                return self.concept_cache[key]['concept_id']
            return None

        try:
            with get_db_connection() as conn:
                with conn.cursor() as cur:
                    # Relation storage is optional: bail out without the table
                    cur.execute("""
                        SELECT EXISTS (
                            SELECT FROM information_schema.tables
                            WHERE table_name = 'concept_relationships'
                        )
                    """)
                    table_exists = cur.fetchone()[0]
                    if not table_exists:
                        logger.debug("concept_relationships table not found, skipping relation storage")
                        return

                    for rel in relations:
                        subject_id = lookup_id(rel['subject'])
                        object_id = lookup_id(rel['object'])

                        # Both endpoints must resolve to a concept
                        if subject_id and object_id:
                            cur.execute("""
                                INSERT INTO concept_relationships
                                    (source_concept_id, target_concept_id, relationship_type,
                                     confidence, extraction_method)
                                VALUES (%s, %s, %s, %s, 'ner_llama')
                                ON CONFLICT (source_concept_id, target_concept_id, relationship_type)
                                DO UPDATE SET confidence = GREATEST(
                                    concept_relationships.confidence,
                                    EXCLUDED.confidence
                                )
                            """, (subject_id, object_id, rel['predicate'], rel['confidence']))

        except Exception as e:
            logger.warning(f"Failed to store relations: {e}")

    def auto_add_ner_entities(
        self,
        entities: List[Dict],
        min_score: float = 0.7,
        min_occurrences: int = 2,
        category_map: Dict[str, str] = None
    ) -> int:
        """
        Promote recurring, high-confidence NER entities into the concept taxonomy.

        Entities are grouped by the lower-cased value of their 'normalized'
        key. A group is promoted when it contains at least ``min_occurrences``
        entities and its best-scoring member reaches ``min_score``.

        Args:
            entities: Entity dicts; the keys 'normalized', 'score' and 'label'
                are read from each one.
            min_score: Score the best entity of a group must reach.
            min_occurrences: Number of entities a group must contain.
            category_map: NER label -> concept category. Labels missing from
                the map are stored under 'Extracted'. When None, a built-in
                map is used.

        Returns:
            Number of groups for which ``add_concept`` returned a truthy id.
        """
        label_to_category = category_map
        if label_to_category is None:
            label_to_category = {
                'Person': 'People',
                'Concept': 'Concepts',
                'Work': 'Works',
                'Organization': 'Organizations',
                'Location': 'Locations',
                'Event': 'Events',
                'Term': 'Terms',
            }

        # One pass: tally each normalized key and remember its best-scoring
        # entity (ties keep the entity seen first).
        tally = Counter()
        best_by_key = {}
        for candidate in entities:
            norm_key = candidate['normalized'].lower()
            tally[norm_key] += 1
            incumbent = best_by_key.get(norm_key)
            if incumbent is None or candidate['score'] > incumbent['score']:
                best_by_key[norm_key] = candidate

        promoted = 0
        for norm_key, seen in tally.items():
            if not (seen >= min_occurrences):
                continue

            best = best_by_key[norm_key]
            if not (best['score'] >= min_score):
                continue

            canonical = best['normalized']
            category = label_to_category.get(best['label'], 'Extracted')
            new_id = self.add_concept(canonical, category=category)
            if not new_id:
                continue

            promoted += 1
            self.stats['ner_new_concepts'] += 1
            # Record the new concept in the in-memory cache under its
            # lower-cased key.
            self.concept_cache[norm_key] = {
                'concept_id': new_id,
                'name': canonical,
                'category': category
            }
            logger.info(f"Auto-added NER concept: {canonical} ({best['label']})")

        return promoted

    def get_documents_to_process(self, document_id: str = None) -> List[Dict]:
        """
        Select the documents that concept extraction should run on.

        With a truthy ``document_id`` only that document is looked up.
        Otherwise the query returns documents that have no row in
        ``document_concepts`` and whose ``processing_status`` is 'chunked',
        'embedded' or 'completed'.
        """
        single_sql = "SELECT document_id, title, file_path FROM documents WHERE document_id = %s"
        # Anti-join: keep only documents without any concept mapping yet.
        backlog_sql = """
            SELECT d.document_id, d.title, d.file_path
            FROM documents d
            LEFT JOIN document_concepts dc ON d.document_id = dc.document_id
            WHERE dc.document_id IS NULL
              AND d.processing_status IN ('chunked', 'embedded', 'completed')
            """

        if document_id:
            return execute_query(single_sql, (document_id,), fetch='all')
        return execute_query(backlog_sql, fetch='all')

    def extract_for_document(self, document: Dict, auto_add_entities: bool = False) -> Dict[str, int]:
        """
        Extract concepts from every chunk of one document and persist them.

        Existing document-level and chunk-level mappings for the document are
        deleted and replaced with the freshly computed ones.

        Args:
            document: Document dict; only 'document_id' is read here.
            auto_add_entities: If True, NER entities that are not yet in the
                taxonomy are handed to ``auto_add_ner_entities``.

        Returns:
            Mapping of concept_id to its accumulated (unrounded) count over
            all chunks, or an empty dict when the document has no chunks.

        Raises:
            Any exception from extraction or storage; it is logged and counted
            in ``stats['errors']`` before being re-raised.
        """
        document_id = document['document_id']
        logger.info(f"Extracting concepts for: {document_id}")

        try:
            chunks = execute_query(
                "SELECT chunk_id, chunk_text FROM chunks WHERE document_id = %s",
                (document_id,),
                fetch='all'
            )

            if not chunks:
                logger.warning(f"No chunks found for {document_id}")
                return {}

            doc_totals = Counter()      # concept_id -> count over the whole document
            chunk_rows = []             # (chunk_id, concept_id, count) per chunk mapping
            pending_entities = []       # NER entities not yet in the taxonomy

            for chunk in chunks:
                text = chunk['chunk_text']

                # Glossary/pattern matches form the baseline for this chunk.
                found = self.find_concepts_in_text(text)

                if self.use_ner:
                    ner_hits, unseen = self.extract_with_ner(
                        text,
                        chunk_id=chunk['chunk_id']
                    )
                    # Fold NER counts into the pattern-match counts.
                    for cid, hits in ner_hits.items():
                        found[cid] = found.get(cid, 0) + hits
                    pending_entities.extend(unseen)

                # Feed the document totals and the chunk-level rows together.
                for cid, hits in found.items():
                    doc_totals[cid] += hits
                    chunk_rows.append((chunk['chunk_id'], cid, hits))

            if auto_add_entities and pending_entities:
                added = self.auto_add_ner_entities(pending_entities)
                if added > 0:
                    logger.info(f"Auto-added {added} new concepts from NER")

            doc_key = (document_id,)
            with get_db_connection() as conn:
                with conn.cursor() as cur:
                    # Drop previous mappings so a re-run replaces them.
                    cur.execute(
                        "DELETE FROM document_concepts WHERE document_id = %s",
                        doc_key
                    )
                    cur.execute(
                        "DELETE FROM chunk_concepts WHERE chunk_id IN (SELECT chunk_id FROM chunks WHERE document_id = %s)",
                        doc_key
                    )

                    # Document-level rows; counts are rounded to int for storage.
                    for cid, total in doc_totals.items():
                        cur.execute(
                            """
                            INSERT INTO document_concepts (document_id, concept_id, mention_count)
                            VALUES (%s, %s, %s)
                            ON CONFLICT (document_id, concept_id)
                            DO UPDATE SET mention_count = EXCLUDED.mention_count
                            """,
                            (document_id, cid, round(total))
                        )

                    # Chunk-level rows; counts are rounded to int for storage.
                    for row_chunk_id, cid, hits in chunk_rows:
                        cur.execute(
                            """
                            INSERT INTO chunk_concepts (chunk_id, concept_id, mention_count)
                            VALUES (%s, %s, %s)
                            ON CONFLICT (chunk_id, concept_id)
                            DO UPDATE SET mention_count = EXCLUDED.mention_count
                            """,
                            (row_chunk_id, cid, round(hits))
                        )

            n_concepts = len(doc_totals)
            n_mappings = len(chunk_rows)
            self.stats['concepts_found'] += n_concepts
            self.stats['chunk_mappings'] += n_mappings

            logger.info(f"Found {n_concepts} unique concepts, "
                       f"{n_mappings} chunk mappings")

            return dict(doc_totals)

        except Exception as e:
            logger.error(f"Error extracting concepts for {document_id}: {e}")
            self.stats['errors'] += 1
            raise

    def process_all(self, document_id: str = None, auto_add_entities: bool = False) -> Dict[str, int]:
        """
        Run concept extraction over every document that needs it.

        Args:
            document_id: If given, process only this document; otherwise
                process whatever ``get_documents_to_process`` selects.
            auto_add_entities: Forwarded to ``extract_for_document`` so batch
                runs can auto-add high-confidence NER entities. Defaults to
                False.

        Returns:
            The extractor's ``stats`` dict (the same object, not a copy).
        """
        documents = self.get_documents_to_process(document_id)

        if not documents:
            logger.info("No documents need concept extraction")
            return self.stats

        logger.info(f"Found {len(documents)} documents to process")

        for doc in documents:
            try:
                self.extract_for_document(doc, auto_add_entities=auto_add_entities)
                self.stats['documents_processed'] += 1
            except Exception as e:
                # Best-effort batch: one failing document must not stop the
                # rest. extract_for_document already counted the error.
                logger.error(f"Failed to process {doc['document_id']}: {e}")

        return self.stats

    @staticmethod
    def add_concept(name: str, category: Optional[str] = None, aliases: Optional[List[str]] = None) -> int:
        """
        Insert a concept into the taxonomy, or update it if the name exists.

        On a name conflict, a supplied ``category`` / ``aliases`` replaces the
        stored value. An argument left as None keeps the stored value instead
        of overwriting it with NULL, so re-adding an existing concept without
        aliases (as automatic NER promotion does) no longer erases them.

        Args:
            name: Concept name; the conflict target of the upsert.
            category: Optional category for the concept.
            aliases: Optional list of alternative names.

        Returns:
            The concept_id of the inserted or updated row.
        """
        with get_db_connection() as conn:
            with conn.cursor() as cur:
                cur.execute(
                    """
                    INSERT INTO concepts (name, category, aliases)
                    VALUES (%s, %s, %s)
                    ON CONFLICT (name) DO UPDATE
                    SET category = COALESCE(EXCLUDED.category, concepts.category),
                        aliases = COALESCE(EXCLUDED.aliases, concepts.aliases)
                    RETURNING concept_id
                    """,
                    (name, category, aliases)
                )
                result = cur.fetchone()
                return result[0]

    @staticmethod
    def list_concepts() -> List[Dict]:
        """
        Fetch every taxonomy concept with its document usage count.

        Each row carries concept_id, name, category, aliases and
        document_count (the number of ``document_concepts`` rows for the
        concept). Rows are ordered by category, then name.
        """
        taxonomy_sql = """
            SELECT c.concept_id, c.name, c.category, c.aliases,
                   COUNT(dc.document_id) as document_count
            FROM concepts c
            LEFT JOIN document_concepts dc ON c.concept_id = dc.concept_id
            GROUP BY c.concept_id
            ORDER BY c.category, c.name
            """
        return execute_query(taxonomy_sql, fetch='all')


def main():
    """
    Command-line entry point for concept extraction.

    Parses the arguments and performs the first matching action, returning
    after it:

    1. ``--check-ner``: print which NER extractors are available.
    2. ``--add-concept``: add one concept to the taxonomy.
    3. ``--import-glossary``: import glossary terms into the database.
    4. ``--list-concepts``: print the taxonomy grouped by category.

    With none of these, concept extraction runs over the selected documents
    and a summary of the extractor's statistics is printed.
    """
    # Local import: only needed to bind the --ner-threshold value below.
    from functools import partial

    parser = argparse.ArgumentParser(
        description='Extract concepts for the Research Development Framework',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Glossary File Format:
  # Comment lines start with #
  Term:Weight
  Term        (default weight 1.5)

Example glossary.txt:
  # Anthroposophy glossary
  Anthroposophy:3.0
  Etheric Body:2.5
  Astral Body:2.5
  Freedom:1.2

Examples:
  # Standard extraction
  python extract_concepts.py

  # With glossary weighting
  python extract_concepts.py --glossary glossary.txt

  # Import glossary terms to database
  python extract_concepts.py --import-glossary glossary.txt --category "Esotericism"

  # NER-based extraction with GLiNER (recommended)
  python extract_concepts.py --use-ner

  # NER with auto-add new entities to taxonomy
  python extract_concepts.py --use-ner --auto-add-entities

  # Use LLaMA for richer extraction (requires GPU)
  python extract_concepts.py --use-ner --ner-extractor llama --extract-relations

  # Use OpenAI for highest-quality relations (recommended)
  python extract_concepts.py --use-ner --ner-extractor hybrid --extract-relations

  # Hybrid with explicit OpenAI backend
  python extract_concepts.py --use-ner --ner-extractor hybrid --relation-backend openai --extract-relations

  # Check available NER extractors
  python entity_extractors.py --check
"""
    )
    parser.add_argument(
        '--document',
        type=str,
        help='Process a specific document by ID'
    )
    parser.add_argument(
        '--add-concept',
        type=str,
        metavar='NAME',
        help='Add a new concept to the taxonomy'
    )
    parser.add_argument(
        '--category',
        type=str,
        help='Category for the new concept (use with --add-concept or --import-glossary)'
    )
    parser.add_argument(
        '--aliases',
        type=str,
        help='Comma-separated aliases for the concept'
    )
    parser.add_argument(
        '--list-concepts',
        action='store_true',
        help='List all concepts in the taxonomy'
    )
    parser.add_argument(
        '--glossary',
        type=str,
        metavar='FILE',
        help='Path to glossary file for weighted concept boosting'
    )
    parser.add_argument(
        '--import-glossary',
        type=str,
        metavar='FILE',
        help='Import glossary terms into the concepts database table'
    )

    # NER options
    ner_group = parser.add_argument_group('NER Entity Extraction')
    ner_group.add_argument(
        '--use-ner',
        action='store_true',
        help='Enable NER-based entity extraction (uses GLiNER by default)'
    )
    ner_group.add_argument(
        '--ner-extractor',
        type=str,
        choices=['gliner', 'llama', 'openai', 'hybrid'],
        default='gliner',
        help='NER backend: gliner (fast, CPU), llama (GPU), openai (cloud), hybrid (recommended)'
    )
    ner_group.add_argument(
        '--relation-backend',
        type=str,
        choices=['openai', 'llama'],
        default='openai',
        help='Backend for relations in hybrid mode (default: openai)'
    )
    ner_group.add_argument(
        '--openai-model',
        type=str,
        default='gpt-4o-mini',
        help='OpenAI model for extraction (gpt-4o, gpt-4o-mini)'
    )
    ner_group.add_argument(
        '--ner-types',
        type=str,
        nargs='+',
        default=['Person', 'Concept', 'Work', 'Organization', 'Term'],
        help='Entity types to extract with NER'
    )
    ner_group.add_argument(
        '--extract-relations',
        action='store_true',
        help='Extract S-P-O relationship triples (requires LLaMA or hybrid)'
    )
    ner_group.add_argument(
        '--auto-add-entities',
        action='store_true',
        help='Auto-add high-confidence NER entities to concept taxonomy'
    )
    ner_group.add_argument(
        '--ner-threshold',
        type=float,
        default=0.7,
        help='Minimum confidence for auto-adding entities (default: 0.7)'
    )
    ner_group.add_argument(
        '--check-ner',
        action='store_true',
        help='Check which NER extractors are available'
    )

    # Active Entity Linking options
    link_group = parser.add_argument_group('Active Entity Linking (Deduplication)')
    link_group.add_argument(
        '--active-linking',
        action='store_true',
        help='Enable real-time entity deduplication during ingestion (prevents taxonomy drift)'
    )
    link_group.add_argument(
        '--fuzzy-threshold',
        type=float,
        default=85.0,
        help='Minimum fuzzy match score (0-100) for auto-linking entities (default: 85)'
    )
    link_group.add_argument(
        '--linking-report',
        action='store_true',
        help='Show detailed report of entity linking decisions'
    )

    args = parser.parse_args()

    # Check NER availability
    if args.check_ner:
        try:
            from entity_extractors import check_extractor_availability
            availability = check_extractor_availability()
            print("\nNER Extractor Availability:")
            print("=" * 40)
            for name, info in availability.items():
                # An entry is either a dict (with 'available' and an optional
                # 'note') or a plain truthy/falsy flag.
                if isinstance(info, dict):
                    status = "Available" if info.get('available') else "Not installed"
                    note = info.get('note', '')
                    print(f"  {name:10} {status}")
                    if note:
                        print(f"             {note}")
                else:
                    status = "Available" if info else "Not installed"
                    print(f"  {name:10} {status}")
            print("\nInstallation:")
            print("  GLiNER:  pip install gliner")
            print("  LLaMA:   pip install llama-cpp-python")
            print("  OpenAI:  pip install openai (+ OPENAI_API_KEY env var)")
            print("=" * 40)
        except ImportError:
            print("entity_extractors module not found")
        return

    if args.add_concept:
        aliases = None
        if args.aliases:
            # Drop blank entries (e.g. from a trailing or doubled comma) so
            # empty aliases are never stored.
            aliases = [part.strip() for part in args.aliases.split(',') if part.strip()] or None
        concept_id = ConceptExtractor.add_concept(
            args.add_concept,
            args.category,
            aliases
        )
        print(f"Added concept '{args.add_concept}' with ID {concept_id}")
        return

    if args.import_glossary:
        print(f"\nImporting glossary from: {args.import_glossary}")
        added, existing = import_glossary_to_db(args.import_glossary, args.category)
        print(f"Added {added} new concepts, {existing} already existed")
        if args.category:
            print(f"Category: {args.category}")
        return

    if args.list_concepts:
        concepts = ConceptExtractor.list_concepts()
        print("\n" + "=" * 70)
        print("CONCEPT TAXONOMY")
        print("=" * 70)
        current_category = None
        for c in concepts:
            # Rows arrive ordered by category; print a heading on each change.
            if c['category'] != current_category:
                current_category = c['category']
                print(f"\n[{current_category or 'Uncategorized'}]")
            aliases_str = f" ({', '.join(c['aliases'])})" if c['aliases'] else ""
            print(f"  - {c['name']}{aliases_str}: {c['document_count']} documents")
        print("=" * 70)
        return

    # Initialize extractor with options
    extractor = ConceptExtractor(
        glossary_path=args.glossary,
        use_ner=args.use_ner,
        ner_extractor=args.ner_extractor,
        ner_entity_types=args.ner_types,
        extract_relations=args.extract_relations,
        relation_backend=args.relation_backend,
        openai_model=args.openai_model,
        active_linking=args.active_linking,
        fuzzy_threshold=args.fuzzy_threshold
    )

    # extract_for_document() calls auto_add_ner_entities() with its default
    # min_score, so --ner-threshold would otherwise be parsed but never used.
    # Bind the CLI value on this instance so the option takes effect.
    if args.auto_add_entities:
        extractor.auto_add_ner_entities = partial(
            extractor.auto_add_ner_entities,
            min_score=args.ner_threshold
        )

    # Process documents
    documents = extractor.get_documents_to_process(args.document)
    if not documents:
        print("No documents need concept extraction")
        return

    print(f"\nProcessing {len(documents)} documents...")
    if args.use_ner:
        print(f"NER extractor: {args.ner_extractor}")
        print(f"Entity types: {', '.join(args.ner_types)}")
        if args.extract_relations:
            print("Relation extraction: enabled")
            if args.ner_extractor == 'hybrid':
                print(f"Relation backend: {args.relation_backend}")
            # --relation-backend defaults to 'openai' but only applies in
            # hybrid mode, so report the OpenAI model only when OpenAI is
            # actually selected.
            uses_openai = (
                args.ner_extractor == 'openai'
                or (args.ner_extractor == 'hybrid' and args.relation_backend == 'openai')
            )
            if uses_openai:
                print(f"OpenAI model: {args.openai_model}")
    if args.active_linking:
        print(f"Active Entity Linking: enabled (threshold: {args.fuzzy_threshold})")

    for doc in documents:
        try:
            extractor.extract_for_document(doc, auto_add_entities=args.auto_add_entities)
            extractor.stats['documents_processed'] += 1
        except Exception as e:
            # Keep going: one failing document must not abort the whole run.
            logger.error(f"Failed to process {doc['document_id']}: {e}")

    stats = extractor.stats

    # Print summary
    print("\n" + "=" * 50)
    print("CONCEPT EXTRACTION SUMMARY")
    print("=" * 50)
    print(f"Documents Processed: {stats['documents_processed']}")
    print(f"Concepts Found:      {stats['concepts_found']}")
    print(f"Chunk Mappings:      {stats['chunk_mappings']}")
    if args.glossary:
        print(f"Glossary Boosts:     {stats['glossary_boosts']}")
    if args.use_ner:
        print(f"NER Entities Found:  {stats['ner_entities_found']}")
        print(f"NER New Concepts:    {stats['ner_new_concepts']}")
        if args.extract_relations:
            print(f"Relations Extracted: {stats['relations_extracted']}")
    if args.active_linking:
        print(f"Entities Linked:     {stats['entities_linked']}")
        # Show detailed linking report if requested
        if args.linking_report and extractor.entity_linker:
            report = extractor.entity_linker.get_linking_report()
            print("\n--- Entity Linking Report ---")
            print(f"Total linkings: {report['total']}")
            if report['by_type']:
                print("By match type:")
                for match_type, count in report['by_type'].items():
                    print(f"  {match_type}: {count}")
    print(f"Errors:              {stats['errors']}")
    print("=" * 50)


# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == '__main__':
    main()
