"""
Configuration module for the Research Development Framework pipeline.
Loads environment variables and project configuration.

Configuration sources (in order of precedence):
1. Environment variables (.env, .env.db)
2. Project configuration (config/project.yaml)
3. Default values
"""

import os
import sys
from pathlib import Path
from dotenv import load_dotenv

# Try to load YAML config
try:
    import yaml
except ImportError:
    # PyYAML is an optional dependency; HAS_YAML records whether it imported.
    HAS_YAML = False
else:
    HAS_YAML = True

# =============================================================================
# PATH SETUP
# =============================================================================
# Two levels up from this file: the directory that contains this module's
# parent package/folder.
BASE_DIR = Path(__file__).resolve().parent.parent

# Load environment variables from .env and .env.db located in BASE_DIR.
load_dotenv(BASE_DIR / '.env')
load_dotenv(BASE_DIR / '.env.db')

# =============================================================================
# LOAD PROJECT CONFIGURATION
# =============================================================================
PROJECT_CONFIG = {}
PROJECT_CONFIG_PATH = BASE_DIR / 'config' / 'project.yaml'

# project.yaml is only read when PyYAML imported successfully and the file
# exists; otherwise PROJECT_CONFIG stays an empty mapping.
if HAS_YAML and PROJECT_CONFIG_PATH.exists():
    # Decode explicitly as UTF-8 so parsing does not depend on the platform's
    # default locale encoding.
    with open(PROJECT_CONFIG_PATH, 'r', encoding='utf-8') as f:
        # An empty file parses to None; treat that as "no configuration".
        _loaded_project_config = yaml.safe_load(f) or {}

    # The rest of this module calls PROJECT_CONFIG.get(...), so anything other
    # than a mapping would fail later with an opaque AttributeError. Fail here
    # with a message that names the offending file instead.
    if not isinstance(_loaded_project_config, dict):
        raise TypeError(
            f"{PROJECT_CONFIG_PATH} must contain a YAML mapping at the top level, "
            f"got {type(_loaded_project_config).__name__}"
        )
    PROJECT_CONFIG = _loaded_project_config


def get_config(section: str, key: str, default=None):
    """Get a configuration value from the loaded project.yaml data.

    Args:
        section: Top-level key in PROJECT_CONFIG (e.g. 'search').
        key: Key inside that section.
        default: Value returned when the section or key is absent.

    Returns:
        The stored value (which may itself be None if the YAML says so), or
        ``default`` when the section is missing, is not a mapping (for
        example an empty ``section:`` entry, which YAML loads as None), or
        does not contain ``key``.
    """
    section_values = (
        PROJECT_CONFIG.get(section) if isinstance(PROJECT_CONFIG, dict) else None
    )
    # Only mappings can be looked up by key; None / scalars / lists fall back
    # to the default instead of raising TypeError.
    if isinstance(section_values, dict) and key in section_values:
        return section_values[key]
    return default


# =============================================================================
# DATABASE CONFIGURATION
# =============================================================================
# Database connection settings, read from DB_* environment variables with
# fallbacks for each field. The port is converted to an integer.
DB_CONFIG = dict(
    host=os.environ.get('DB_HOST', 'localhost'),
    port=int(os.environ.get('DB_PORT', 5432)),
    database=os.environ.get('DB_NAME', 'research_dev_db'),
    user=os.environ.get('DB_USER', 'research_user'),
    password=os.environ.get('DB_PASSWORD', ''),
)


def get_db_connection_string():
    """Return a PostgreSQL connection URI built from DB_CONFIG.

    The user name and password are percent-encoded so that characters with a
    meaning inside a URI (``@``, ``:``, ``/``, ``#``, spaces, ...) cannot
    corrupt the host/port/database parts. Credentials made only of
    unreserved characters are emitted unchanged.

    Returns:
        A string of the form ``postgresql://user:password@host:port/database``.
    """
    # Local import keeps this block self-contained.
    from urllib.parse import quote

    user = quote(str(DB_CONFIG['user']), safe='')
    password = quote(str(DB_CONFIG['password']), safe='')
    return (
        f"postgresql://{user}:{password}"
        f"@{DB_CONFIG['host']}:{DB_CONFIG['port']}/{DB_CONFIG['database']}"
    )


# =============================================================================
# INTELLIGENCE TIER CONFIGURATION (v2.0)
# =============================================================================
# The system supports three intelligence modes with graceful degradation,
# plus an "auto" setting that selects among them:
# - "cloud": Full AI via OpenAI API
# - "local": AI via local LLM (Ollama, LM Studio)
# - "statistical": Classical NLP, no AI required
# - "auto": Automatically select best available

# OpenAI Configuration
# A key counts as usable when it is non-empty after stripping whitespace and
# does not start with the 'your_' prefix.
OPENAI_API_KEY = os.environ.get('OPENAI_API_KEY', '').strip()
OPENAI_ENABLED = not (not OPENAI_API_KEY or OPENAI_API_KEY.startswith('your_'))

# Tavily Web Search Configuration (same usability rule as the OpenAI key)
TAVILY_API_KEY = os.environ.get('TAVILY_API_KEY', '').strip()
TAVILY_ENABLED = not (not TAVILY_API_KEY or TAVILY_API_KEY.startswith('your_'))
TAVILY_CONFIG = dict(
    search_depth='advanced',        # 'basic' or 'advanced'
    max_results=10,                 # Maximum results per search
    include_answer=True,            # Include AI-generated answer
    include_raw_content=True,       # Include full page content
    timeout=30,                     # Request timeout in seconds
    # Credit tracking settings
    credit_limit=int(os.environ.get('TAVILY_CREDIT_LIMIT', 1000)),  # Default 1000 credits
    credit_cost_basic=1,            # Credits per basic search
    credit_cost_advanced=2,         # Credits per advanced search
)

# Local LLM Configuration
# Environment variables win; otherwise the `intelligence.local` mapping from
# project.yaml is consulted, and finally the built-in defaults. A value for
# `intelligence.local` that is not a mapping is ignored.
_local_llm_cfg = get_config('intelligence', 'local', {})
if not isinstance(_local_llm_cfg, dict):
    _local_llm_cfg = {}

LOCAL_LLM_ENDPOINT = os.getenv(
    'LOCAL_LLM_ENDPOINT',
    _local_llm_cfg.get('endpoint', 'http://localhost:11434/v1'),
)
LOCAL_LLM_MODEL = os.getenv(
    'LOCAL_LLM_MODEL',
    _local_llm_cfg.get('model', 'llama3'),
)

# Check if local LLM is available.
# This is a best-effort probe performed at import time: it issues one GET
# with a 2 second timeout and treats HTTP 200 as "available". Any failure
# (requests not installed, connection refused, timeout, bad endpoint value)
# simply leaves LOCAL_LLM_ENABLED as False.
LOCAL_LLM_ENABLED = False
try:
    import requests

    # Derive the server root from the endpoint by dropping a trailing '/v1'.
    # str.rstrip('/v1') must not be used for this: it strips any trailing run
    # of the characters '/', 'v' and '1', so 'http://host:8081/v1' would be
    # cut down to 'http://host:808'.
    _local_llm_base_url = LOCAL_LLM_ENDPOINT.rstrip('/')
    if _local_llm_base_url.endswith('/v1'):
        _local_llm_base_url = _local_llm_base_url[:-len('/v1')]
    _local_llm_base_url = _local_llm_base_url.rstrip('/')

    # NOTE(review): '/api/tags' looks like the Ollama model-list route; other
    # local servers may not expose it — confirm.
    response = requests.get(f"{_local_llm_base_url}/api/tags", timeout=2)
    LOCAL_LLM_ENABLED = response.status_code == 200
except Exception:
    # Deliberately swallowed: the probe must never break module import.
    # `Exception` (rather than a bare except) lets KeyboardInterrupt and
    # SystemExit propagate.
    pass

# Determine intelligence mode: an explicit `intelligence.mode` in project.yaml
# is used as-is; 'auto' (the default) triggers detection.
_configured_mode = get_config('intelligence', 'mode', 'auto')

def _detect_intelligence_mode():
    """Resolve the intelligence mode to use.

    Returns:
        The configured mode unchanged when it is anything other than 'auto'.
        Otherwise 'cloud' if an OpenAI key is usable, else 'local' if the
        local LLM probe succeeded, else 'statistical'.
    """
    if _configured_mode == 'auto':
        if OPENAI_ENABLED:
            return 'cloud'
        return 'local' if LOCAL_LLM_ENABLED else 'statistical'
    return _configured_mode

INTELLIGENCE_MODE = _detect_intelligence_mode()

# Statistical NLP settings (for offline mode)
# Values come from the `intelligence.statistical` mapping in project.yaml;
# if that entry is missing or is not a mapping, every setting uses its default.
_statistical_cfg = get_config('intelligence', 'statistical', {})
if not isinstance(_statistical_cfg, dict):
    _statistical_cfg = {}

STATISTICAL_CONFIG = {
    'keyword_method': _statistical_cfg.get('keyword_method', 'rake'),
    'min_keyword_score': _statistical_cfg.get('min_keyword_score', 0.3),
    'max_keywords': _statistical_cfg.get('max_keywords', 10),
    'tfidf_max_features': _statistical_cfg.get('tfidf_max_features', 5000),
    # Always exposed as a tuple, whatever sequence type the config holds.
    'tfidf_ngram_range': tuple(_statistical_cfg.get('tfidf_ngram_range', [1, 2])),
}

# Cloud model settings
# Read from the `intelligence.cloud` mapping in project.yaml; a missing or
# non-mapping entry means every model name falls back to its default.
_cloud_cfg = get_config('intelligence', 'cloud', {})
if not isinstance(_cloud_cfg, dict):
    _cloud_cfg = {}

CLOUD_MODELS = {
    'classification': _cloud_cfg.get('classification_model', 'gpt-4o-mini'),
    'chat': _cloud_cfg.get('chat_model', 'gpt-4o'),
    'embedding': _cloud_cfg.get('embedding_model', 'text-embedding-3-small'),
}

# Legacy compatibility: flat names mirroring the embedding settings.
EMBEDDING_MODEL = CLOUD_MODELS['embedding']
# EMBEDDING_DIMENSIONS env var overrides `embeddings.dimensions` from
# project.yaml, which overrides the default of 1536.
_embedding_dimensions_default = get_config('embeddings', 'dimensions', 1536)
EMBEDDING_DIMENSIONS = int(
    os.getenv('EMBEDDING_DIMENSIONS', _embedding_dimensions_default)
)

# =============================================================================
# SEARCH CONFIGURATION
# =============================================================================
# The default search mode depends on whether an OpenAI key is usable: with a
# key, project.yaml decides (default 'hybrid'); without one, keyword mode is
# forced regardless of project.yaml.
DEFAULT_SEARCH_MODE = (
    get_config('search', 'default_mode', 'hybrid') if OPENAI_ENABLED else 'keyword'
)

SEARCH_CONFIG = dict(
    default_mode=DEFAULT_SEARCH_MODE,
    max_results=get_config('search', 'max_results', 100),
    default_limit=get_config('search', 'default_limit', 10),
)

# =============================================================================
# CHUNKING CONFIGURATION
# =============================================================================
# Token budgets come from the `chunking` section of project.yaml, each with
# the fallback listed here.
CHUNK_CONFIG = {
    setting: get_config('chunking', setting, fallback)
    for setting, fallback in (
        ('target_tokens', 750),
        ('overlap_tokens', 100),
        ('min_tokens', 100),
        ('max_tokens', 1000),
    )
}
# Not configurable: tiktoken encoding for GPT-4/embeddings
CHUNK_CONFIG['encoding'] = 'cl100k_base'

# =============================================================================
# PATH CONFIGURATION
# =============================================================================
# Well-known directories, all derived from BASE_DIR.
PATHS = {
    'base': BASE_DIR,
    'library': BASE_DIR / 'library',
    # Primary import folder (easy access in root directory)
    'new_docs': BASE_DIR / 'NEW_DOCS',
    # Legacy import folder (still supported) and its sibling stage folders,
    # each living at library/NEW_DOCS/<stage>.
    **{
        stage: BASE_DIR / 'library' / 'NEW_DOCS' / stage
        for stage in ('incoming', 'processing', 'completed', 'failed', 'low_quality')
    },
    'markdown_library': BASE_DIR / 'library' / 'MARKDOWN_LIBRARY',
    'organized': BASE_DIR / 'library' / 'ORGANIZED',
    'compiled': BASE_DIR / 'compiled',
    'config': BASE_DIR / 'config',
}

# =============================================================================
# PROCESSING CONFIGURATION
# =============================================================================
# File-processing settings. The first three keys read the `processing`
# section of project.yaml; the embedding/retry keys read the `embeddings`
# section; `batch_size` is fixed.
PROCESSING_CONFIG = dict(
    supported_extensions=get_config(
        'processing',
        'supported_formats',
        ['.pdf', '.docx', '.txt', '.md', '.epub', '.html'],
    ),
    max_file_size_mb=get_config('processing', 'max_file_size_mb', 150),
    pipeline_version=get_config('processing', 'pipeline_version', '1.0.0'),
    batch_size=10,
    embedding_batch_size=get_config('embeddings', 'batch_size', 100),
    retry_attempts=get_config('embeddings', 'retry_attempts', 3),
    retry_delay_seconds=get_config('embeddings', 'retry_delay', 5),
)

# =============================================================================
# QUALITY ASSESSMENT THRESHOLDS
# =============================================================================
# Each entry is (exposed name, key in the `quality` section of project.yaml,
# fallback). Note the first exposed name differs from its YAML key.
QUALITY_THRESHOLDS = {
    threshold: get_config('quality', yaml_key, fallback)
    for threshold, yaml_key, fallback in (
        ('gibberish_ratio_max', 'max_gibberish_ratio', 0.15),
        ('min_avg_word_length', 'min_avg_word_length', 3.0),
        ('max_avg_word_length', 'max_avg_word_length', 15.0),
        ('min_sentence_quality', 'min_sentence_quality', 0.5),
    )
}

# =============================================================================
# BOOK WORKFLOW CONFIGURATION
# =============================================================================
# Fixed (non-YAML) settings for the book workflow.
BOOK_WORKFLOW_CONFIG = dict(
    # Research settings
    default_max_iterations=5,           # Max iterations per subject
    default_tavily_budget=50,           # Default Tavily credit budget
    gap_priority_threshold=3,           # Min mentions for high priority gap
    use_graphrag=True,                  # Enable GraphRAG integration
    use_rerank=True,                    # Enable reranking
    checkpoint_interval=1,              # Save checkpoint after each subject

    # Project storage
    projects_dir=BASE_DIR.joinpath('book_projects'),

    # Draft generation
    draft_styles=['academic', 'narrative', 'reference'],
    default_draft_style='academic',

    # Gap analysis
    max_gaps_per_subject=10,            # Max gaps to collect per subject
    similarity_threshold=0.85,          # Threshold for gap deduplication

    # Output settings
    truncate_synthesis=2000,            # Max chars for synthesis in summary
)

# Import-time side effect: create the book projects directory (and any
# missing parents) if it is not there yet.
BOOK_WORKFLOW_CONFIG['projects_dir'].mkdir(parents=True, exist_ok=True)

# =============================================================================
# GENERATION CONFIGURATION (Opt-in Model)
# =============================================================================
# By default, Claude Code handles all prose generation.
# Internal LLMs are available for utility tasks or delegated heavy lifting.
# =============================================================================
# A `generation:` (or `generation.models:`) key that is present but empty in
# project.yaml loads as None, and a scalar is equally unusable; in both cases
# fall back to an empty mapping so every setting takes its default instead of
# raising AttributeError at import time.
_generation_config = PROJECT_CONFIG.get('generation')
if not isinstance(_generation_config, dict):
    _generation_config = {}
_generation_models = _generation_config.get('models')
if not isinstance(_generation_models, dict):
    _generation_models = {}

GENERATION_CONFIG = {
    # Default author: "agent" = Claude Code writes, "cli" = CLI writes internally
    'default_author': _generation_config.get('default_author', 'agent'),

    # Models for different purposes
    'primary_model': _generation_models.get('primary', 'gpt-4o'),
    'utility_model': _generation_models.get('utility', 'gpt-4o-mini'),

    # Cost tracking
    'cost_tracking': _generation_config.get('cost_tracking', True),
    'max_budget_usd': _generation_config.get('max_budget_usd', 10.00),

    # Generation parameters
    'temperature': _generation_config.get('temperature', 0.7),
}


def is_cli_generation_enabled() -> bool:
    """Tell whether the CLI is the default author of generated text.

    Returns:
        True when GENERATION_CONFIG['default_author'] is 'cli'; False for any
        other value, including the 'agent' default used when the key is absent.
    """
    default_author = GENERATION_CONFIG.get('default_author', 'agent')
    return default_author == 'cli'


def get_generation_model(purpose: str = 'primary') -> str:
    """Look up the model name configured for a given purpose.

    Args:
        purpose: 'utility' selects the utility model (analysis/sorting);
            every other value, including the default 'primary', selects the
            primary model (drafting/polishing).

    Returns:
        Model name string (e.g., 'gpt-4o') taken from GENERATION_CONFIG, or
        the built-in default for that purpose when the entry is absent.
    """
    if purpose == 'utility':
        config_key, fallback = 'utility_model', 'gpt-4o-mini'
    else:
        config_key, fallback = 'primary_model', 'gpt-4o'
    return GENERATION_CONFIG.get(config_key, fallback)


# =============================================================================
# ENTITY EXTRACTION CONFIGURATION (GraphRAG Enhancement)
# =============================================================================
# Load from project.yaml with environment variable overrides
# An `entity_extraction:` key that is present but empty in project.yaml loads
# as None (and a scalar is equally unusable); fall back to an empty mapping so
# every setting takes its default instead of raising AttributeError at import.
_entity_config = PROJECT_CONFIG.get('entity_extraction')
if not isinstance(_entity_config, dict):
    _entity_config = {}

ENTITY_EXTRACTION_CONFIG = {
    # Whether user has explicitly configured entity extraction
    # If False, commands should prompt for configuration
    'configured': _entity_config.get('configured', False),

    # Default extractor: 'gliner' (fast, CPU) or 'openai' or 'hybrid' (recommended)
    # The NER_EXTRACTOR environment variable overrides the YAML `provider`.
    'default_extractor': os.getenv('NER_EXTRACTOR', _entity_config.get('provider', 'gliner')),

    # Relation extraction backend for hybrid mode
    'relation_backend': _entity_config.get('relation_backend', 'openai'),

    # OpenAI model for extraction
    'openai_model': _entity_config.get('openai_model', 'gpt-4o-mini'),

    # GLiNER settings
    'gliner': {
        'model_size': _entity_config.get('gliner_model_size', 'medium'),
        'threshold': _entity_config.get('confidence_threshold', 0.4),
        'flat_ner': True,                 # Disable nested entities
    },

    # LLaMA settings (for users with GPU/resources)
    'llama': {
        'model_path': os.getenv('LLAMA_MODEL_PATH', None),  # Path to .gguf file
        'n_ctx': 2048,                    # Context window size
        'n_gpu_layers': -1,               # -1 = all layers on GPU, 0 = CPU only
        'temperature': 0.1,               # Lower = more deterministic
        'use_transformers': False,        # Use HuggingFace instead of llama-cpp
    },

    # Entity types to extract
    'entity_types': _entity_config.get('entity_types', [
        'Person',           # Authors, historical figures
        'Concept',          # Abstract ideas, theories
        'Work',             # Books, articles, lectures
        'Organization',     # Schools, societies, institutions
        'Location',         # Places mentioned
        'Event',            # Historical events, conferences
        'Term',             # Technical/domain-specific terms
    ]),

    # Extract relationships (S-P-O triples)
    'extract_relations': _entity_config.get('extract_relations', False),

    # Auto-add settings for new entities discovered by NER
    'auto_add': {
        'enabled': False,                 # Auto-add high-confidence entities
        'min_confidence': 0.7,            # Minimum confidence score
        'min_occurrences': 2,             # Minimum occurrences across chunks
    },
}

def is_entity_extraction_configured() -> bool:
    """Report the 'configured' flag of ENTITY_EXTRACTION_CONFIG.

    Returns:
        The stored flag, or False when the key is absent.
    """
    configured_flag = ENTITY_EXTRACTION_CONFIG.get('configured', False)
    return configured_flag

# =============================================================================
# V3 FEATURE CONFIGURATION
# =============================================================================
# Feature settings grouped by subsystem. Only two values are read from the
# environment (RESEARCH_BUDGET_USD and NER_EXTRACTOR); everything else is fixed.
V3_CONFIG = dict(
    # Cost controls for Research Agent
    research_agent=dict(
        default_cost_budget=float(os.environ.get('RESEARCH_BUDGET_USD', 1.00)),
        warn_at_budget_percent=0.80,
        track_costs=True,
        max_iterations=5,
    ),

    # Knowledge rules for self-improving glossaries
    knowledge_rules=dict(
        enabled=True,
        auto_create_alias_rules=True,
        apply_on_ingestion=True,
        import_merge_history=False,
    ),

    # GraphRAG / Entity Extraction
    graphrag=dict(
        default_extractor=os.environ.get('NER_EXTRACTOR', 'gliner'),
        hybrid_relation_backend='openai',       # 'openai' or 'llama' for hybrid mode
        hybrid_openai_model='gpt-4o-mini',      # Model for relation extraction
        extract_relations=False,                # Enable S-P-O triple extraction
        relationship_types=[
            'influences', 'supports', 'opposes', 'part_of',
            'derived_from', 'precedes', 'follows', 'contrasts_with',
        ],
    ),

    # Interactive agent mode (future)
    interactive_mode=dict(
        require_plan_approval=False,
        require_source_approval=False,
        allow_gap_pinning=True,
    ),

    # Citation management
    citations=dict(
        auto_freeze=False,
        format='[AuthorYear]',
        generate_on_ingest=True,
    ),

    # Metadata editing safety
    metadata=dict(
        warn_on_searchable_change=True,
        searchable_fields=['title', 'subtitle', 'author', 'primary_category', 'content_type'],
        auto_reembed_on_edit=False,
    ),

    # Document pinning for research prioritization
    document_pinning=dict(
        enabled=True,
        max_pinned_docs=50,
        pinned_context_percent=0.6,     # 60% of context budget for pinned docs
    ),

    # Smart context selection for synthesis
    context_selection=dict(
        max_tokens=8000,
        summarize_overflow=True,
        prioritize_pinned=True,
    ),
)

# =============================================================================
# LOGGING CONFIGURATION
# =============================================================================
# Logging settings; only the level is taken from the environment (LOG_LEVEL).
LOGGING_CONFIG = dict(
    level=os.environ.get('LOG_LEVEL', 'INFO'),
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    file=BASE_DIR.joinpath('logs', 'pipeline.log'),
)

# Import-time side effect: make sure the directory holding the log file exists.
LOGGING_CONFIG['file'].parent.mkdir(exist_ok=True)

# =============================================================================
# PROJECT INFO
# =============================================================================
# Descriptive metadata from the `project` section of project.yaml, each field
# paired with the fallback used when it is absent.
PROJECT_INFO = {
    field: get_config('project', field, fallback)
    for field, fallback in (
        ('name', 'Research Project'),
        ('description', ''),
        ('version', '1.0.0'),
        ('author', ''),
    )
}

# =============================================================================
# STATUS DISPLAY (for CLI tools)
# =============================================================================
def print_config_status():
    """Write a human-readable summary of the active configuration to stdout.

    The report covers project identity, database target, the active
    intelligence tier with the availability of each backend, the default
    search mode, and a handful of V3 feature switches. Nothing is returned.
    """
    rule = "=" * 60
    divider = "-" * 60

    mode_labels = {
        'cloud': 'Cloud AI (OpenAI)',
        'local': f'Local LLM ({LOCAL_LLM_MODEL})',
        'statistical': 'Statistical NLP (Offline)'
    }
    # Modes outside the three known ones are still shown, labelled 'Unknown'.
    active_mode_label = mode_labels.get(INTELLIGENCE_MODE, 'Unknown')

    openai_state = 'Available' if OPENAI_ENABLED else 'Not configured'
    local_llm_state = 'Available' if LOCAL_LLM_ENABLED else 'Not detected'

    agent_cfg = V3_CONFIG['research_agent']
    graphrag_cfg = V3_CONFIG['graphrag']
    cost_tracking_state = 'Enabled' if agent_cfg['track_costs'] else 'Disabled'
    rules_state = 'Enabled' if V3_CONFIG['knowledge_rules']['enabled'] else 'Disabled'
    relations_state = 'Enabled' if graphrag_cfg['extract_relations'] else 'Disabled'

    report = [
        "\n" + rule,
        "CONFIGURATION STATUS",
        rule,
        f"Project:           {PROJECT_INFO['name']}",
        f"Version:           {PROJECT_INFO.get('version', '2.0.0')}",
        f"Base Directory:    {BASE_DIR}",
        f"Database:          {DB_CONFIG['database']}@{DB_CONFIG['host']}",
        divider,
        "INTELLIGENCE TIER:",
        f"  Active Mode:     {INTELLIGENCE_MODE.upper()} - {active_mode_label}",
        f"  OpenAI API:      {openai_state}",
        f"  Local LLM:       {local_llm_state}",
        "  Statistical:     Always available (fallback)",
        divider,
        f"Search Mode:       {DEFAULT_SEARCH_MODE}",
        divider,
        "V3 FEATURES:",
        f"  Cost Tracking:   {cost_tracking_state}",
        f"  Default Budget:  ${agent_cfg['default_cost_budget']:.2f}",
        f"  Knowledge Rules: {rules_state}",
        f"  GraphRAG Mode:   {graphrag_cfg['default_extractor']}",
        f"  Relation Ext:    {relations_state}",
        rule + "\n",
    ]
    for line in report:
        print(line)


# When this module is imported (rather than run as a script) without a usable
# OpenAI key, leave an INFO-level log note explaining how to enable it.
if __name__ != '__main__' and not OPENAI_ENABLED:
    import logging
    logger = logging.getLogger(__name__)
    logger.info(
        "OpenAI API key not configured. Running in keyword-only mode. To enable semantic search, add your API key to .env file."
    )
