#!/usr/bin/env python3
"""
Knowledge Graph Extraction for Research Development Framework v3.0

This module extracts entities and relationships from documents to build
a knowledge graph enabling GraphRAG capabilities.

Features:
- Named entity extraction (people, concepts, works, organizations)
- Relationship extraction (influenced, contradicts, develops, etc.)
- Co-occurrence analysis
- Graph traversal for relationship queries

Usage:
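    python knowledge_graph.py --document DOC_001 # Process specific document
    python knowledge_graph.py --rebuild          # Re-process all completed documents
    python knowledge_graph.py --rebuild --mode statistical  # Force an extraction mode
    python knowledge_graph.py                    # Print a usage hint (nothing is processed)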
"""

import os
import re
import json
import logging
from pathlib import Path
from typing import List, Dict, Any, Optional, Tuple, Set
from dataclasses import dataclass, field

# Import configuration
from config import (
    INTELLIGENCE_MODE, OPENAI_ENABLED, OPENAI_API_KEY,
    LOCAL_LLM_ENDPOINT, LOCAL_LLM_MODEL, CLOUD_MODELS, LOGGING_CONFIG
)
from db_utils import get_db_connection, execute_query

# Optional imports: each HAS_* flag records whether the dependency could be
# imported, so the extractor can choose a fallback instead of failing at
# import time.
try:
    from openai import OpenAI
    HAS_OPENAI = True
except ImportError:
    # Both the 'cloud' and the 'local' extractor modes build an OpenAI client,
    # so neither can be initialized without this package.
    HAS_OPENAI = False

try:
    import spacy
    HAS_SPACY = True
except ImportError:
    # spaCy is only used for the optional NER pass of statistical extraction.
    HAS_SPACY = False

# Setup logging: level and format both come from LOGGING_CONFIG. The level
# entry is resolved as an attribute of the logging module, so it has to be the
# name of such an attribute (presumably "INFO", "DEBUG", ... - confirm in config).
logging.basicConfig(
    level=getattr(logging, LOGGING_CONFIG['level']),
    format=LOGGING_CONFIG['format']
)
logger = logging.getLogger(__name__)


@dataclass
class Entity:
    """
    Represents a named entity.

    Attributes:
        name: Display name of the entity.
        entity_type: Category label. The extractors in this module produce
            'person', 'concept', 'organization', 'place' and 'work'.
        aliases: Alternative names for the same entity (empty list by default;
            every instance gets its own list).
        description: Free-text description, empty by default.
        confidence: Extraction confidence, 1.0 by default.
    """
    name: str
    entity_type: str
    aliases: List[str] = field(default_factory=list)
    description: str = ""
    confidence: float = 1.0


@dataclass
class Relationship:
    """
    Represents a relationship between two entities.

    The relationship is directed: it reads "source <relationship> target".

    Attributes:
        source: Name of the entity the relationship starts from.
        relationship: Relationship label, e.g. 'influenced', 'contradicts'
            or 'develops'.
        target: Name of the entity the relationship points to.
        evidence: Supporting text for the relationship, empty by default.
        confidence: Extraction confidence, 1.0 by default.
    """
    source: str
    relationship: str
    target: str
    evidence: str = ""
    confidence: float = 1.0


class KnowledgeGraphExtractor:
    """
    Extracts entities and relationships from text to build a knowledge graph.

    Supports multiple modes:
    - Cloud: Uses OpenAI for extraction
    - Local: Uses an OpenAI-compatible client pointed at LOCAL_LLM_ENDPOINT
      (e.g. Ollama) for extraction
    - Statistical: Uses pattern matching and NER

    When the requested LLM mode cannot be set up (openai package missing,
    cloud use disabled, client construction fails) the extractor downgrades
    itself to statistical mode, so ``self.mode`` never names a mode that has
    no usable client.
    """

    # Known entity patterns for Anthroposophical/esoteric texts
    KNOWN_PEOPLE = {
        'rudolf steiner': {'type': 'person', 'aliases': ['steiner', 'dr. steiner', 'r. steiner']},
        'johann wolfgang von goethe': {'type': 'person', 'aliases': ['goethe', 'j.w. goethe']},
        'carl jung': {'type': 'person', 'aliases': ['jung', 'c.g. jung']},
        'maria montessori': {'type': 'person', 'aliases': ['montessori']},
        'owen barfield': {'type': 'person', 'aliases': ['barfield']},
        'albert steffen': {'type': 'person', 'aliases': ['steffen']},
        'marie steiner': {'type': 'person', 'aliases': ['marie steiner-von sivers']},
        'edouard schure': {'type': 'person', 'aliases': ['schure', 'schuré']},
    }

    KNOWN_CONCEPTS = {
        'anthroposophy': {'type': 'concept', 'aliases': ['spiritual science']},
        'eurythmy': {'type': 'concept', 'aliases': ['eurhythmy']},
        'waldorf education': {'type': 'concept', 'aliases': ['steiner education']},
        'threefold social order': {'type': 'concept', 'aliases': ['social threefolding']},
        'biodynamic agriculture': {'type': 'concept', 'aliases': ['biodynamics']},
        'goethean science': {'type': 'concept', 'aliases': ['goethean observation']},
        'higher worlds': {'type': 'concept', 'aliases': ['supersensible worlds']},
        'imagination': {'type': 'concept', 'aliases': []},
        'inspiration': {'type': 'concept', 'aliases': []},
        'intuition': {'type': 'concept', 'aliases': []},
    }

    EXTRACTION_PROMPT = """Analyze this text excerpt and extract:
1. Named entities (people, concepts, works, organizations, places)
2. Relationships between entities

Return JSON format:
{{
    "entities": [
        {{"name": "Entity Name", "type": "person|concept|work|organization|place", "aliases": [], "description": "brief description"}}
    ],
    "relationships": [
        {{"source": "Entity A", "relationship": "influenced|contradicts|develops|mentions|authored|related_to", "target": "Entity B", "evidence": "quote from text"}}
    ]
}}

Rules:
- Focus on philosophical, spiritual, and educational entities
- Include relationships like "influenced", "contradicts", "develops", "mentions"
- Provide brief evidence quotes for relationships
- Normalize entity names (e.g., "Rudolf Steiner" not "Steiner")

Text:
{text}"""

    # Lower-cased lookup from every known name/alias to the display name that
    # extract_entities_statistical() emits for it. Used to turn the single
    # words captured by the relationship patterns ("steiner") into the name of
    # the entity that is actually stored ("Rudolf Steiner").
    _ALIAS_TO_CANONICAL = {
        alias: name.title()
        for known in (KNOWN_PEOPLE, KNOWN_CONCEPTS)
        for name, info in known.items()
        for alias in (name, *info['aliases'])
    }

    # Patterns are applied to lower-cased text and must capture exactly two
    # groups: (source, target). A former one-group "according to X" pattern
    # was dropped: it has no source entity and only ever produced garbage
    # relationships out of two-letter words.
    _RELATIONSHIP_PATTERNS = (
        (re.compile(r'(\w+)\s+influenced\s+(\w+)'), 'influenced'),
        (re.compile(r'(\w+)\s+contradict(?:s|ed)?\s+(\w+)'), 'contradicts'),
        (re.compile(r'(\w+)\s+develop(?:s|ed)?\s+from\s+(\w+)'), 'develops'),
    )

    # spaCy entity labels that are kept, and the entity type they map to.
    _SPACY_LABEL_TYPES = {
        'PERSON': 'person',
        'ORG': 'organization',
        'GPE': 'place',
        'WORK_OF_ART': 'work',
    }

    # type_id used when entity_types has no row for the extracted type.
    # NOTE(review): assumes type_id 1 exists and is an acceptable catch-all -
    # confirm against the schema.
    DEFAULT_ENTITY_TYPE_ID = 1

    def __init__(self, mode: Optional[str] = None):
        """
        Initialize the knowledge graph extractor.

        Args:
            mode: 'cloud', 'local' or 'statistical'. Falls back to
                INTELLIGENCE_MODE from the configuration when omitted.
        """
        self.mode = mode or INTELLIGENCE_MODE
        self.client = None
        self.model = None  # always defined, even when no LLM client exists
        self.nlp = None

        # Initialize based on mode
        if self.mode == 'cloud' and OPENAI_ENABLED and HAS_OPENAI:
            try:
                self.client = OpenAI(api_key=OPENAI_API_KEY)
                self.model = CLOUD_MODELS.get('classification', 'gpt-4o-mini')
                logger.info("KnowledgeGraphExtractor initialized in CLOUD mode")
            except Exception as e:
                logger.warning(f"Failed to initialize OpenAI: {e}")
                self.client = None
                self.mode = 'statistical'

        elif self.mode == 'local' and HAS_OPENAI:
            try:
                self.client = OpenAI(
                    base_url=LOCAL_LLM_ENDPOINT,
                    api_key="not-needed"
                )
                self.model = LOCAL_LLM_MODEL
                logger.info("KnowledgeGraphExtractor initialized in LOCAL mode")
            except Exception as e:
                logger.warning(f"Failed to initialize local LLM: {e}")
                self.client = None
                self.mode = 'statistical'

        # A requested LLM mode without a client can only ever run the
        # statistical path; say so explicitly so that spaCy gets loaded and
        # the extraction_method written to the database is truthful.
        if self.mode != 'statistical' and self.client is None:
            logger.warning(
                f"Mode '{self.mode}' is not available; falling back to statistical extraction"
            )
            self.mode = 'statistical'

        if self.mode == 'statistical':
            logger.info("KnowledgeGraphExtractor initialized in STATISTICAL mode")
            if HAS_SPACY:
                try:
                    self.nlp = spacy.load("en_core_web_sm")
                except Exception as e:
                    # Best effort: NER is optional, pattern matching still works.
                    logger.warning(
                        f"spaCy model could not be loaded ({e}). Using pattern matching only."
                    )

    @staticmethod
    def _parse_entities(result: Dict[str, Any]) -> List[Entity]:
        """Build Entity objects from the LLM payload, skipping malformed items."""
        entities = []
        for item in result.get('entities') or []:
            if not isinstance(item, dict):
                continue
            name = item.get('name')
            if not isinstance(name, str) or not name.strip():
                continue
            aliases = item.get('aliases')
            if not isinstance(aliases, list):
                aliases = []
            entities.append(Entity(
                name=name.strip(),
                entity_type=item.get('type') or 'concept',
                aliases=[a for a in aliases if isinstance(a, str)],
                description=item.get('description') or ''
            ))
        return entities

    @staticmethod
    def _parse_relationships(result: Dict[str, Any]) -> List[Relationship]:
        """Build Relationship objects from the LLM payload, skipping malformed items."""
        relationships = []
        for item in result.get('relationships') or []:
            if not isinstance(item, dict):
                continue
            parts = [item.get(key) for key in ('source', 'relationship', 'target')]
            if not all(isinstance(p, str) and p.strip() for p in parts):
                continue
            source, rel_name, target = (p.strip() for p in parts)
            relationships.append(Relationship(
                source=source,
                relationship=rel_name,
                target=target,
                evidence=item.get('evidence') or ''
            ))
        return relationships

    def extract_entities_llm(self, text: str, client=None) -> Tuple[List[Entity], List[Relationship]]:
        """
        Extract entities and relationships using LLM.

        Only the first 4000 characters of *text* are sent. Any failure of the
        request or of the JSON decoding is logged and yields two empty lists;
        individual malformed items in an otherwise valid response are skipped
        instead of discarding the whole response.

        Args:
            text: Text to analyze.
            client: Chat-completions client to use; defaults to ``self.client``.

        Returns:
            Tuple of (entities, relationships).
        """
        if client is None:
            client = self.client

        if client is None:
            return [], []

        try:
            response = client.chat.completions.create(
                model=self.model,
                messages=[
                    {"role": "system", "content": "You are a precise knowledge extraction system. Return valid JSON only."},
                    {"role": "user", "content": self.EXTRACTION_PROMPT.format(text=text[:4000])}
                ],
                temperature=0.3,
                max_tokens=2000
            )

            # The message content can be None (e.g. refusals); treat as empty.
            content = (response.choices[0].message.content or "").strip()

            # Clean up markdown formatting
            if content.startswith('```'):
                content = re.sub(r'^```(?:json)?\n?', '', content)
                content = re.sub(r'\n?```$', '', content)

            result = json.loads(content)
            if not isinstance(result, dict):
                raise ValueError("expected a JSON object at the top level")

            return self._parse_entities(result), self._parse_relationships(result)

        except Exception as e:
            logger.warning(f"LLM extraction failed: {e}")
            return [], []

    @staticmethod
    def _mentions_term(text_lower: str, term: str) -> bool:
        """Return True when *term* occurs in *text_lower* as a whole word or phrase."""
        if term not in text_lower:  # cheap pre-filter before the regex
            return False
        # Look-arounds instead of \b so that terms ending in '.' work too, and
        # so that 'jung' no longer matches inside 'jungle'.
        return re.search(r'(?<!\w)' + re.escape(term) + r'(?!\w)', text_lower) is not None

    def extract_entities_statistical(self, text: str) -> Tuple[List[Entity], List[Relationship]]:
        """
        Extract entities using pattern matching and optional NER.

        Known people/concepts are detected by whole-word matches of their name
        or aliases. If a spaCy pipeline is loaded, PERSON/ORG/GPE/WORK_OF_ART
        entities from the first 10000 characters are added unless they
        duplicate a name or alias that was already found. Relationships come
        from simple "<word> <verb> <word>" patterns; captured words that are a
        known name or alias are replaced by the canonical entity name.

        Args:
            text: Text to analyze.

        Returns:
            Tuple of (entities, relationships).
        """
        entities: List[Entity] = []
        relationships: List[Relationship] = []
        if not text:
            return entities, relationships

        text_lower = text.lower()
        seen: Set[str] = set()  # lower-cased names/aliases already emitted

        # Pattern-based extraction for known entities
        for known, entity_type in (
            (self.KNOWN_PEOPLE, 'person'),
            (self.KNOWN_CONCEPTS, 'concept'),
        ):
            for name, info in known.items():
                terms = (name, *info['aliases'])
                if any(self._mentions_term(text_lower, term) for term in terms):
                    entities.append(Entity(
                        name=name.title(),
                        entity_type=entity_type,
                        # Copy: never hand out the class-level alias list itself.
                        aliases=list(info['aliases'])
                    ))
                    seen.update(terms)

        # Use spaCy for additional NER if available
        if self.nlp:
            doc = self.nlp(text[:10000])  # Limit for performance
            for ent in doc.ents:
                entity_type = self._SPACY_LABEL_TYPES.get(ent.label_)
                if entity_type is None:
                    continue
                ent_name = ent.text.strip()
                key = ent_name.lower()
                # Avoid duplicates (including aliases of known entities)
                if not key or key in seen:
                    continue
                seen.add(key)
                entities.append(Entity(name=ent_name, entity_type=entity_type))

        # Simple relationship extraction via patterns
        for pattern, rel_type in self._RELATIONSHIP_PATTERNS:
            for match in pattern.finditer(text_lower):
                source, target = match.groups()
                relationships.append(Relationship(
                    source=self._ALIAS_TO_CANONICAL.get(source, source.title()),
                    relationship=rel_type,
                    target=self._ALIAS_TO_CANONICAL.get(target, target.title())
                ))

        return entities, relationships

    def extract(self, text: str) -> Tuple[List[Entity], List[Relationship]]:
        """
        Extract entities and relationships from text.

        In cloud/local mode the LLM is tried first; when it yields nothing
        (or fails), statistical extraction is used as a fallback.

        Args:
            text: Document or chunk text

        Returns:
            Tuple of (entities, relationships)
        """
        # Nothing to extract from empty/blank text; also avoids an LLM call.
        if not text or not text.strip():
            return [], []

        if self.mode in ('cloud', 'local') and self.client:
            entities, relationships = self.extract_entities_llm(text)
            if entities or relationships:
                return entities, relationships

        # Fallback or statistical mode
        return self.extract_entities_statistical(text)

    def get_or_create_entity(self, entity: Entity) -> Optional[int]:
        """
        Get existing entity ID or create new one.

        Entities are identified by (lower-cased, stripped name, type_id).

        Returns:
            The entity_id, or None when the name is blank or the insert
            returned no row.
        """
        name_normalized = entity.name.lower().strip()
        if not name_normalized:
            return None

        # Get entity type ID
        type_result = execute_query(
            "SELECT type_id FROM entity_types WHERE name = %s",
            (entity.entity_type,),
            fetch='one'
        )
        type_id = type_result['type_id'] if type_result else self.DEFAULT_ENTITY_TYPE_ID

        # Check if entity exists
        existing = execute_query(
            "SELECT entity_id FROM entities WHERE name_normalized = %s AND type_id = %s",
            (name_normalized, type_id),
            fetch='one'
        )

        if existing:
            return existing['entity_id']

        # Create new entity
        result = execute_query(
            """
            INSERT INTO entities (name, name_normalized, type_id, description, aliases)
            VALUES (%s, %s, %s, %s, %s)
            RETURNING entity_id
            """,
            (entity.name, name_normalized, type_id, entity.description, entity.aliases),
            fetch='one'
        )

        return result['entity_id'] if result else None

    def get_or_create_relationship_type(self, rel_name: str) -> Optional[int]:
        """
        Get relationship type ID or create new one.

        The name is lower-cased before lookup/insert.

        Returns:
            The rel_type_id, or None when the insert returned no row.
        """
        rel_key = rel_name.lower()

        existing = execute_query(
            "SELECT rel_type_id FROM relationship_types WHERE name = %s",
            (rel_key,),
            fetch='one'
        )

        if existing:
            return existing['rel_type_id']

        # Create new relationship type
        result = execute_query(
            """
            INSERT INTO relationship_types (name, description)
            VALUES (%s, %s)
            RETURNING rel_type_id
            """,
            (rel_key, f"Auto-created relationship type: {rel_name}"),
            fetch='one'
        )

        return result['rel_type_id'] if result else None

    def store_extraction(
        self,
        entities: List[Entity],
        relationships: List[Relationship],
        document_id: str,
        chunk_id: Optional[str] = None
    ) -> Dict[str, int]:
        """
        Store extracted entities and relationships in the database.

        A relationship is only stored when both of its endpoints resolve to
        an entity of this call, either by name or by one of its aliases.
        Failures on single items are logged and do not abort the batch.

        Args:
            entities: Entities to get or create.
            relationships: Relationships between those entities.
            document_id: Document the extraction belongs to.
            chunk_id: Chunk the extraction came from; mentions are only
                recorded when it is given.

        Returns:
            Dict with counts of stored items
        """
        stored = {'entities': 0, 'relationships': 0, 'mentions': 0}
        entity_id_map: Dict[str, int] = {}

        # Store entities
        for entity in entities:
            try:
                entity_id = self.get_or_create_entity(entity)
                if entity_id is None:
                    continue

                # Same normalization as entities.name_normalized.
                entity_id_map[entity.name.lower().strip()] = entity_id
                # Aliases resolve too, but never shadow an exact entity name.
                for alias in entity.aliases or []:
                    if isinstance(alias, str) and alias.strip():
                        entity_id_map.setdefault(alias.lower().strip(), entity_id)
                stored['entities'] += 1

                # Store mention if chunk_id provided
                if chunk_id:
                    execute_query(
                        """
                        INSERT INTO entity_mentions (entity_id, chunk_id, document_id, mention_text, confidence)
                        VALUES (%s, %s, %s, %s, %s)
                        ON CONFLICT DO NOTHING
                        """,
                        (entity_id, chunk_id, document_id, entity.name, entity.confidence)
                    )
                    stored['mentions'] += 1

            except Exception as e:
                logger.warning(f"Failed to store entity {entity.name}: {e}")

        # Store relationships
        for rel in relationships:
            try:
                source_id = entity_id_map.get(rel.source.lower().strip())
                target_id = entity_id_map.get(rel.target.lower().strip())

                if source_id is None or target_id is None:
                    continue

                rel_type_id = self.get_or_create_relationship_type(rel.relationship)
                if rel_type_id is None:
                    continue

                execute_query(
                    """
                    INSERT INTO entity_relationships
                        (source_entity_id, target_entity_id, rel_type_id, confidence, evidence_text, source_chunk_id, source_document_id, extraction_method)
                    VALUES (%s, %s, %s, %s, %s, %s, %s, %s)
                    ON CONFLICT (source_entity_id, target_entity_id, rel_type_id) DO UPDATE SET
                        confidence = GREATEST(entity_relationships.confidence, EXCLUDED.confidence),
                        evidence_text = COALESCE(EXCLUDED.evidence_text, entity_relationships.evidence_text)
                    """,
                    # Empty evidence goes in as NULL: COALESCE only keeps the
                    # existing evidence_text when the new value is NULL, an
                    # empty string would overwrite it.
                    (source_id, target_id, rel_type_id, rel.confidence, rel.evidence or None,
                     chunk_id, document_id, self.mode)
                )
                stored['relationships'] += 1

            except Exception as e:
                logger.warning(f"Failed to store relationship {rel.source} -> {rel.target}: {e}")

        return stored

    def process_document(self, document_id: str) -> Dict[str, int]:
        """
        Process a document and extract its knowledge graph.

        Every chunk of the document is run through extract() in
        chunk_sequence order and the results are stored per chunk.

        Args:
            document_id: Document identifier

        Returns:
            Dict with extraction statistics
        """
        total_stats = {'entities': 0, 'relationships': 0, 'mentions': 0, 'chunks_processed': 0}

        # Get all chunks for the document
        chunks = execute_query(
            "SELECT chunk_id, chunk_text FROM chunks WHERE document_id = %s ORDER BY chunk_sequence",
            (document_id,),
            fetch='all'
        )

        if not chunks:
            logger.warning(f"No chunks found for document {document_id}")
            return total_stats

        for chunk in chunks:
            entities, relationships = self.extract(chunk['chunk_text'])

            if entities or relationships:
                stats = self.store_extraction(entities, relationships, document_id, chunk['chunk_id'])
                for key, count in stats.items():
                    total_stats[key] += count

            total_stats['chunks_processed'] += 1

        logger.info(f"Processed document {document_id}: {total_stats}")
        return total_stats


def get_related_entities(entity_name: str, max_depth: int = 2) -> List[Dict]:
    """
    Get entities related to a given entity via graph traversal.

    The name is normalized the same way entities are stored (lower-cased and
    stripped). The lookup is by name only, so if several entity types share
    the name, whichever row the database returns first is used.

    Args:
        entity_name: Name of the starting entity
        max_depth: Maximum relationship depth to traverse

    Returns:
        List of related entities with their relationship paths; empty when
        the name is blank or no such entity exists
    """
    name_normalized = entity_name.lower().strip() if entity_name else ""
    if not name_normalized:
        # Nothing to look up; skip the database round trip.
        return []

    # Find the entity ID
    entity = execute_query(
        "SELECT entity_id FROM entities WHERE name_normalized = %s",
        (name_normalized,),
        fetch='one'
    )

    if not entity:
        return []

    # Use the database function for graph traversal
    results = execute_query(
        "SELECT * FROM get_related_entities(%s, %s)",
        (entity['entity_id'], max_depth),
        fetch='all'
    )

    return results or []


def get_entity_cooccurrences(document_id: Optional[str] = None) -> List[Dict]:
    """
    Get entity co-occurrence data.

    With a document_id, the top 100 rows of entity_cooccurrence for that
    document are returned. Without one, counts are summed per entity pair
    across all documents and the top 100 pairs are returned.

    Args:
        document_id: Optional document to filter by

    Returns:
        List of entity pairs and their co-occurrence counts (never None)
    """
    if document_id:
        rows = execute_query(
            """
            SELECT * FROM entity_cooccurrence
            WHERE document_id = %s
            ORDER BY cooccurrence_count DESC
            LIMIT 100
            """,
            (document_id,),
            fetch='all'
        )
    else:
        rows = execute_query(
            """
            SELECT entity1_name, entity2_name, SUM(cooccurrence_count) as total_cooccurrence
            FROM entity_cooccurrence
            GROUP BY entity1_name, entity2_name
            ORDER BY total_cooccurrence DESC
            LIMIT 100
            """,
            fetch='all'
        )

    # Always honour the List return type, like get_related_entities() does.
    return rows or []


if __name__ == '__main__':
    # Command-line entry point: process one document (--document) or every
    # completed document (--rebuild); otherwise print a usage hint.
    import argparse

    parser = argparse.ArgumentParser(description='Knowledge Graph Extraction')
    parser.add_argument('--document', type=str, help='Process specific document')
    parser.add_argument('--rebuild', action='store_true', help='Rebuild entire graph')
    parser.add_argument('--mode', choices=['cloud', 'local', 'statistical'], help='Extraction mode')

    args = parser.parse_args()

    extractor = KnowledgeGraphExtractor(mode=args.mode)

    if args.document:
        stats = extractor.process_document(args.document)
        print(f"Extracted from {args.document}: {stats}")
    elif args.rebuild:
        # Get all documents. Existing graph rows are not deleted here; each
        # document is simply processed again.
        documents = execute_query(
            "SELECT document_id FROM documents WHERE processing_status = 'completed'",
            fetch='all'
        ) or []  # guard against a None result so the loop cannot raise TypeError

        if not documents:
            logger.warning("No completed documents found to process")

        for doc in documents:
            stats = extractor.process_document(doc['document_id'])
            print(f"Processed {doc['document_id']}: {stats}")
    else:
        print("Usage: python knowledge_graph.py --document DOC_001 or --rebuild")
