-- =============================================================================
-- MIGRATION 003: Knowledge Rules System
-- =============================================================================
-- Implements "Living Glossaries" - the system learns from user corrections
-- and applies rules automatically to future ingestions.
--
-- Features:
--   - Concept alias rules (merge history becomes permanent)
--   - Auto-tagging rules (pattern-based tag suggestions)
--   - Domain extraction hints (improve concept detection)
--   - Rule application tracking (measure effectiveness)
--
-- Run with:
--   (supply the password via the PGPASSWORD environment variable or ~/.pgpass;
--    never commit credentials to the repository)
--   psql -h localhost -U research_dev_user \
--     -d research_dev_db -f database/migrations/003_knowledge_rules.sql
-- =============================================================================

-- -----------------------------------------------------------------------------
-- Knowledge Rules Table
-- -----------------------------------------------------------------------------
-- Stores learned rules that are automatically applied during ingestion.
-- Rules can come from user actions (merging concepts), imports, or system.
-- -----------------------------------------------------------------------------
CREATE TABLE IF NOT EXISTS knowledge_rules (
    -- Surrogate key (SERIAL: auto-incrementing integer, implicitly NOT NULL).
    rule_id          SERIAL,

    -- ---- Classification ----------------------------------------------------
    -- Recognised rule_type values:
    --   'concept_alias'     map a variant spelling to a canonical name
    --   'concept_merge'     record of a merge operation
    --   'auto_tag'          pattern-based tag suggestion
    --   'extraction_hint'   domain-specific terms to look for
    --   'ignore_term'       terms to skip during extraction
    --   'relationship_hint' suggested relationship between concepts
    rule_type        VARCHAR(50)   NOT NULL,

    -- ---- Definition (free-form JSONB; shape depends on rule_type) ----------
    --   concept_alias:     {"alias": "R. Steiner", "canonical": "Rudolf Steiner"}
    --   auto_tag:          {"pattern": "*anthroposoph*", "tags": ["Anthroposophy"]}
    --   extraction_hint:   {"domain": "esoteric", "terms": ["etheric body", "astral body"]}
    --   ignore_term:       {"term": "the", "reason": "common word"}
    --   relationship_hint: {"source": "etheric body", "predicate": "supports", "target": "memory"}
    rule_definition  JSONB         NOT NULL,

    -- ---- Descriptive metadata ----------------------------------------------
    rule_name        VARCHAR(255),                      -- human-readable label
    description      TEXT,                              -- rationale for the rule
    source           VARCHAR(50)   DEFAULT 'user',      -- 'user', 'system', 'imported', 'inferred'
    confidence       NUMERIC(3, 2) DEFAULT 1.0,         -- intended range 0.0-1.0 (not enforced here)

    -- ---- Usage statistics --------------------------------------------------
    applied_count    INTEGER       DEFAULT 0,           -- number of times the rule fired
    last_applied_at  TIMESTAMP,                         -- most recent firing
    successful_count INTEGER       DEFAULT 0,           -- firings that improved results

    -- ---- Lifecycle ---------------------------------------------------------
    is_active        BOOLEAN       DEFAULT TRUE,
    priority         INTEGER       DEFAULT 100,         -- larger value wins when rules conflict

    -- ---- Audit trail -------------------------------------------------------
    created_at       TIMESTAMP     DEFAULT CURRENT_TIMESTAMP,
    updated_at       TIMESTAMP     DEFAULT CURRENT_TIMESTAMP,
    created_by       VARCHAR(100)  DEFAULT 'default',
    notes            TEXT,

    -- Same name PostgreSQL assigns to an inline PRIMARY KEY on this table.
    CONSTRAINT knowledge_rules_pkey PRIMARY KEY (rule_id)
);

-- Secondary indexes on knowledge_rules (all idempotent via IF NOT EXISTS).

-- Filter by rule classification.
CREATE INDEX IF NOT EXISTS idx_rules_type
    ON knowledge_rules USING btree (rule_type);

-- Partial index: only rows flagged active are indexed.
CREATE INDEX IF NOT EXISTS idx_rules_active
    ON knowledge_rules USING btree (is_active)
    WHERE is_active = TRUE;

-- Filter by rule origin.
CREATE INDEX IF NOT EXISTS idx_rules_source
    ON knowledge_rules USING btree (source);

-- Highest-priority-first ordering.
CREATE INDEX IF NOT EXISTS idx_rules_priority
    ON knowledge_rules USING btree (priority DESC);

-- GIN index over the JSONB payload, for searching inside rule definitions.
CREATE INDEX IF NOT EXISTS idx_rules_definition
    ON knowledge_rules USING gin (rule_definition);

-- Row-level BEFORE UPDATE trigger that calls update_updated_at().
-- NOTE(review): update_updated_at() is not defined in this file; presumably an
-- earlier migration creates it and it refreshes updated_at - confirm.
-- Dropping first keeps this pair of statements re-runnable.
DROP TRIGGER IF EXISTS tr_rules_updated_at
    ON knowledge_rules;

CREATE TRIGGER tr_rules_updated_at
    BEFORE UPDATE
    ON knowledge_rules
    FOR EACH ROW
    EXECUTE FUNCTION update_updated_at();

-- -----------------------------------------------------------------------------
-- Rule Application Log
-- -----------------------------------------------------------------------------
-- Tracks when rules are applied for auditing and effectiveness measurement.
-- -----------------------------------------------------------------------------
CREATE TABLE IF NOT EXISTS rule_applications (
    -- Surrogate key (SERIAL: auto-incrementing integer, implicitly NOT NULL).
    application_id SERIAL,

    -- Rule that fired; log rows are removed together with their rule.
    rule_id        INTEGER,

    -- Entities the rule touched. Each link is optional and is cleared (not
    -- cascaded) when the referenced row is deleted.
    document_id    VARCHAR(100),
    chunk_id       VARCHAR(100),
    concept_id     INTEGER,

    -- What happened.
    input_value    TEXT,                                -- value the rule was applied to
    output_value   TEXT,                                -- value after the rule ran
    context        TEXT,                                -- extra context, e.g. surrounding text

    -- Optional user feedback on this application.
    was_correct    BOOLEAN,                             -- NULL until feedback is given
    feedback_at    TIMESTAMP,
    feedback_by    VARCHAR(100),

    applied_at     TIMESTAMP DEFAULT CURRENT_TIMESTAMP,

    -- Constraint names match what PostgreSQL generates for inline clauses.
    CONSTRAINT rule_applications_pkey
        PRIMARY KEY (application_id),
    CONSTRAINT rule_applications_rule_id_fkey
        FOREIGN KEY (rule_id)
        REFERENCES knowledge_rules (rule_id)
        ON DELETE CASCADE,
    CONSTRAINT rule_applications_document_id_fkey
        FOREIGN KEY (document_id)
        REFERENCES documents (document_id)
        ON DELETE SET NULL,
    CONSTRAINT rule_applications_chunk_id_fkey
        FOREIGN KEY (chunk_id)
        REFERENCES chunks (chunk_id)
        ON DELETE SET NULL,
    CONSTRAINT rule_applications_concept_id_fkey
        FOREIGN KEY (concept_id)
        REFERENCES concepts (concept_id)
        ON DELETE SET NULL
);

-- Secondary indexes on rule_applications (idempotent via IF NOT EXISTS).

-- All applications of a given rule.
CREATE INDEX IF NOT EXISTS idx_applications_rule
    ON rule_applications USING btree (rule_id);

-- All rule applications that touched a given document.
CREATE INDEX IF NOT EXISTS idx_applications_document
    ON rule_applications USING btree (document_id);

-- Newest-first browsing of the application log.
CREATE INDEX IF NOT EXISTS idx_applications_date
    ON rule_applications USING btree (applied_at DESC);

-- -----------------------------------------------------------------------------
-- Helper Functions
-- -----------------------------------------------------------------------------

-- Function: Get canonical name for a concept (applies alias rules)
--
-- Parameters:
--   p_name  Concept name as encountered; compared case-insensitively with the
--           "alias" key of active 'concept_alias' rules.
-- Returns:
--   The "canonical" value of the winning rule, or p_name unchanged when no
--   rule matches (or the winning rule has no "canonical" value). A NULL
--   p_name matches nothing and is returned as NULL.
-- Side effects:
--   Increments applied_count and sets last_applied_at on the rule that was
--   used, so the function writes to knowledge_rules on every hit.
CREATE OR REPLACE FUNCTION get_canonical_concept_name(p_name VARCHAR)
RETURNS VARCHAR AS $$
DECLARE
    v_canonical VARCHAR;
    v_rule_id   INTEGER;
BEGIN
    -- Pick the single best matching alias rule.
    -- NULLS LAST: priority and confidence are nullable, and a plain DESC sort
    -- would rank a NULL above every real value. rule_id breaks remaining ties
    -- so the same rule wins on every call.
    SELECT
        kr.rule_definition->>'canonical',
        kr.rule_id
    INTO v_canonical, v_rule_id
    FROM knowledge_rules AS kr
    WHERE kr.rule_type = 'concept_alias'
      AND kr.is_active = TRUE
      AND LOWER(kr.rule_definition->>'alias') = LOWER(p_name)
    ORDER BY
        kr.priority DESC NULLS LAST,
        kr.confidence DESC NULLS LAST,
        kr.rule_id
    LIMIT 1;

    -- Rule found: record the application and return the canonical name.
    IF v_canonical IS NOT NULL THEN
        UPDATE knowledge_rules
        SET applied_count = COALESCE(applied_count, 0) + 1,  -- NULL + 1 would stay NULL
            last_applied_at = CURRENT_TIMESTAMP
        WHERE rule_id = v_rule_id;

        RETURN v_canonical;
    END IF;

    -- No usable rule: hand the input back untouched.
    RETURN p_name;
END;
$$ LANGUAGE plpgsql;

-- Function: Create alias rule from merge operation
--
-- Parameters:
--   p_alias       Variant spelling to be mapped.
--   p_canonical   Canonical concept name the alias resolves to.
--   p_created_by  Stored in created_by (defaults to 'user').
-- Returns:
--   rule_id of the alias rule: the already-existing active rule when an
--   equivalent one is present, otherwise the newly inserted one.
--   NULL when p_alias or p_canonical is NULL (no rule is created, because a
--   rule with a NULL alias could never match a lookup).
--
-- knowledge_rules has no unique constraint besides its serial primary key, so
-- ON CONFLICT DO NOTHING alone never suppresses a duplicate. The explicit
-- existence check below is what makes repeated merges idempotent.
-- NOTE(review): the check-then-insert is not safe against two concurrent
-- sessions creating the same alias; a unique index would be needed for that.
CREATE OR REPLACE FUNCTION create_alias_rule(
    p_alias VARCHAR,
    p_canonical VARCHAR,
    p_created_by VARCHAR DEFAULT 'user'
) RETURNS INTEGER AS $$
DECLARE
    existing_rule_id INTEGER;
    new_rule_id INTEGER;
BEGIN
    IF p_alias IS NULL OR p_canonical IS NULL THEN
        RETURN NULL;
    END IF;

    -- Alias is compared case-insensitively, the same way
    -- get_canonical_concept_name() looks it up.
    SELECT kr.rule_id
    INTO existing_rule_id
    FROM knowledge_rules AS kr
    WHERE kr.rule_type = 'concept_alias'
      AND kr.is_active = TRUE
      AND LOWER(kr.rule_definition->>'alias') = LOWER(p_alias)
      AND kr.rule_definition->>'canonical' = p_canonical
    ORDER BY kr.priority DESC NULLS LAST, kr.rule_id
    LIMIT 1;

    IF FOUND THEN
        RETURN existing_rule_id;
    END IF;

    INSERT INTO knowledge_rules (
        rule_type,
        rule_definition,
        rule_name,
        description,
        source,
        created_by
    ) VALUES (
        'concept_alias',
        jsonb_build_object('alias', p_alias, 'canonical', p_canonical),
        p_alias || ' → ' || p_canonical,
        'Auto-created from concept merge',
        'user',
        p_created_by
    )
    -- Kept so the insert stays quiet if a unique index is added later.
    ON CONFLICT DO NOTHING
    RETURNING rule_id INTO new_rule_id;

    RETURN new_rule_id;
END;
$$ LANGUAGE plpgsql;

-- Function: Get domain hints for extraction
--
-- Parameters:
--   p_domain  Optional filter; when NULL (the default) hints from every
--             domain are returned, otherwise only rules whose "domain" key
--             equals p_domain.
-- Returns:
--   One row per entry of the "terms" array of each active 'extraction_hint'
--   rule, paired with that rule's "domain". Rows are ordered by rule priority
--   (highest first, NULL priority last), then rule_id, then the term's
--   position inside its array, so the output order is deterministic.
--
-- A rule whose "terms" value is missing or is not a JSON array contributes no
-- rows instead of aborting the whole call.
CREATE OR REPLACE FUNCTION get_extraction_hints(p_domain VARCHAR DEFAULT NULL)
RETURNS TABLE(term VARCHAR, domain VARCHAR) AS $$
BEGIN
    -- Every column reference is table-qualified: the OUT columns "term" and
    -- "domain" are PL/pgSQL variables inside this body.
    RETURN QUERY
    SELECT
        hint.term_text::VARCHAR,
        (kr.rule_definition->>'domain')::VARCHAR
    FROM knowledge_rules AS kr
    CROSS JOIN LATERAL jsonb_array_elements_text(
        CASE
            WHEN jsonb_typeof(kr.rule_definition->'terms') = 'array'
                THEN kr.rule_definition->'terms'
            ELSE '[]'::jsonb  -- malformed or missing: expand to nothing
        END
    ) WITH ORDINALITY AS hint(term_text, ord)
    WHERE kr.rule_type = 'extraction_hint'
      AND kr.is_active = TRUE
      AND (p_domain IS NULL OR kr.rule_definition->>'domain' = p_domain)
    ORDER BY
        kr.priority DESC NULLS LAST,
        kr.rule_id,
        hint.ord;
END;
$$ LANGUAGE plpgsql;

-- Function: Get auto-tag suggestions for text
--
-- Parameters:
--   p_text  Text to test against the "pattern" of every active 'auto_tag'
--           rule. Matching is case-insensitive and unanchored (the pattern
--           may occur anywhere in p_text).
-- Returns:
--   One row per entry of the "tags" array of each matching rule, with the
--   rule's raw pattern and confidence. Ordered by priority, then confidence
--   (both highest first, NULLs last), then rule_id and tag position.
--
-- Pattern syntax: '*' is the only wildcard (any run of characters). The
-- characters '%', '_' and '\' in a stored pattern are matched literally; they
-- are escaped before the pattern is handed to ILIKE, so "self_care" no longer
-- matches "selfXcare" and a trailing backslash no longer raises an error.
-- A rule whose "tags" value is missing or not a JSON array yields no rows.
CREATE OR REPLACE FUNCTION get_auto_tags(p_text TEXT)
RETURNS TABLE(tag VARCHAR, pattern VARCHAR, confidence DECIMAL) AS $$
BEGIN
    -- Column references are table-qualified: "tag", "pattern" and
    -- "confidence" are OUT variables inside this body.
    RETURN QUERY
    SELECT
        tag_elem.tag_text::VARCHAR,
        (kr.rule_definition->>'pattern')::VARCHAR,
        kr.confidence
    FROM knowledge_rules AS kr
    CROSS JOIN LATERAL jsonb_array_elements_text(
        CASE
            WHEN jsonb_typeof(kr.rule_definition->'tags') = 'array'
                THEN kr.rule_definition->'tags'
            ELSE '[]'::jsonb  -- malformed or missing: expand to nothing
        END
    ) WITH ORDINALITY AS tag_elem(tag_text, ord)
    WHERE kr.rule_type = 'auto_tag'
      AND kr.is_active = TRUE
      -- '!' is the escape character: escape it first, then the LIKE
      -- metacharacters, and only then turn the glob '*' into '%'.
      AND p_text ILIKE (
              '%'
              || REPLACE(
                     REPLACE(
                         REPLACE(
                             REPLACE(kr.rule_definition->>'pattern', '!', '!!'),
                             '%', '!%'),
                         '_', '!_'),
                     '*', '%')
              || '%'
          ) ESCAPE '!'
    ORDER BY
        kr.priority DESC NULLS LAST,
        kr.confidence DESC NULLS LAST,
        kr.rule_id,
        tag_elem.ord;
END;
$$ LANGUAGE plpgsql;

-- Function: Check if term should be ignored
--
-- Parameters:
--   p_term  Candidate term; compared case-insensitively with the "term" key
--           of active 'ignore_term' rules.
-- Returns:
--   TRUE when at least one such rule exists, FALSE otherwise (including when
--   p_term is NULL, since a NULL comparison matches no row).
CREATE OR REPLACE FUNCTION should_ignore_term(p_term VARCHAR)
RETURNS BOOLEAN AS $$
DECLARE
    v_is_ignored BOOLEAN;
BEGIN
    SELECT EXISTS (
        SELECT 1
        FROM knowledge_rules AS kr
        WHERE kr.is_active = TRUE
          AND kr.rule_type = 'ignore_term'
          AND LOWER(kr.rule_definition->>'term') = LOWER(p_term)
    )
    INTO v_is_ignored;

    RETURN v_is_ignored;
END;
$$ LANGUAGE plpgsql;

-- Function: Record rule application
--
-- Parameters:
--   p_rule_id      Rule that was applied (rule_applications.rule_id references
--                  knowledge_rules, so an unknown id fails the insert).
--   p_input        Value the rule was applied to.
--   p_output       Value produced by the rule.
--   p_document_id  Optional affected document.
--   p_chunk_id     Optional affected chunk.
--   p_concept_id   Optional affected concept.
-- Returns:
--   application_id of the new rule_applications row.
-- Side effects:
--   Inserts one log row, then increments applied_count and sets
--   last_applied_at on the rule itself.
CREATE OR REPLACE FUNCTION record_rule_application(
    p_rule_id INTEGER,
    p_input VARCHAR,
    p_output VARCHAR,
    p_document_id VARCHAR DEFAULT NULL,
    p_chunk_id VARCHAR DEFAULT NULL,
    p_concept_id INTEGER DEFAULT NULL
) RETURNS INTEGER AS $$
DECLARE
    app_id INTEGER;
BEGIN
    INSERT INTO rule_applications (
        rule_id, input_value, output_value,
        document_id, chunk_id, concept_id
    ) VALUES (
        p_rule_id, p_input, p_output,
        p_document_id, p_chunk_id, p_concept_id
    )
    RETURNING application_id INTO app_id;

    -- Update rule stats. applied_count is nullable, and NULL + 1 is NULL, so
    -- treat a NULL counter as zero instead of leaving it stuck at NULL.
    UPDATE knowledge_rules
    SET applied_count = COALESCE(applied_count, 0) + 1,
        last_applied_at = CURRENT_TIMESTAMP
    WHERE rule_id = p_rule_id;

    RETURN app_id;
END;
$$ LANGUAGE plpgsql;

-- -----------------------------------------------------------------------------
-- Views
-- -----------------------------------------------------------------------------

-- View: Active rules summary
-- One row per rule_type, counting active rules only: how many rules exist,
-- how often they fired in total, their mean confidence, and the latest time
-- any of them was applied. Largest groups come first.
CREATE OR REPLACE VIEW v_rules_summary AS
SELECT
    kr.rule_type,
    COUNT(*)                AS rule_count,
    SUM(kr.applied_count)   AS total_applications,
    AVG(kr.confidence)      AS avg_confidence,
    MAX(kr.last_applied_at) AS most_recent_application
FROM knowledge_rules AS kr
WHERE kr.is_active = TRUE
GROUP BY kr.rule_type
ORDER BY COUNT(*) DESC;

-- View: Most used rules
-- The 50 active rules with the highest applied_count.
-- NULLS LAST: applied_count is nullable and a plain DESC sort would list
-- NULL counters ahead of every real value. rule_id breaks ties so the
-- 50-row cut-off is deterministic.
CREATE OR REPLACE VIEW v_top_rules AS
SELECT
    rule_id,
    rule_type,
    rule_name,
    rule_definition,
    applied_count,
    confidence,
    last_applied_at
FROM knowledge_rules
WHERE is_active = TRUE
ORDER BY
    applied_count DESC NULLS LAST,
    rule_id
LIMIT 50;

-- View: Concept aliases (human-readable)
-- Flattens active 'concept_alias' rules into alias / canonical text columns,
-- most-applied first.
-- NULLS LAST: applied_count is nullable and a plain DESC sort would list
-- NULL counters ahead of every real value. rule_id makes the order stable.
CREATE OR REPLACE VIEW v_concept_aliases AS
SELECT
    rule_id,
    rule_definition->>'alias' AS alias,
    rule_definition->>'canonical' AS canonical,
    applied_count,
    confidence,
    created_at
FROM knowledge_rules
WHERE rule_type = 'concept_alias'
  AND is_active = TRUE
ORDER BY
    applied_count DESC NULLS LAST,
    rule_id;

-- -----------------------------------------------------------------------------
-- Seed Data: Common Ignore Terms
-- -----------------------------------------------------------------------------
-- Seed the common stop-words as system 'ignore_term' rules.
-- knowledge_rules has no unique constraint besides its serial primary key, so
-- ON CONFLICT DO NOTHING alone never skips a row. The NOT EXISTS filter is
-- what keeps this migration re-runnable without duplicating the seed rows.
-- The check ignores is_active on purpose: a seed term that was deactivated
-- later is not re-created.
INSERT INTO knowledge_rules (rule_type, rule_definition, rule_name, source, description)
SELECT
    'ignore_term',
    jsonb_build_object('term', seed.term),
    'Ignore: ' || seed.term,
    'system',
    seed.description
FROM (
    VALUES
        ('the',  'Common article'),
        ('and',  'Common conjunction'),
        ('is',   'Common verb'),
        ('of',   'Common preposition'),
        ('to',   'Common preposition'),
        ('in',   'Common preposition'),
        ('for',  'Common preposition'),
        ('it',   'Common pronoun'),
        ('that', 'Common pronoun'),
        ('this', 'Common demonstrative')
) AS seed (term, description)
WHERE NOT EXISTS (
    SELECT 1
    FROM knowledge_rules AS kr
    WHERE kr.rule_type = 'ignore_term'
      AND LOWER(kr.rule_definition->>'term') = LOWER(seed.term)
)
ON CONFLICT DO NOTHING;

-- =============================================================================
-- MIGRATION COMPLETE
-- =============================================================================
-- Verify with:
--   \dt knowledge_rules
--   \dt rule_applications
--   SELECT * FROM v_rules_summary;
--   SELECT get_canonical_concept_name('R. Steiner');
-- =============================================================================
