-- =============================================================================
-- RESEARCH DEVELOPMENT FRAMEWORK - Schema Updates v2.0
-- =============================================================================
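-- Optimization updates for intelligent document organization.
-- Run this after the base schema (schema.sql) has been applied: the script
-- alters existing tables and uses a trigger function it does not create itself.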
-- =============================================================================

-- =============================================================================
-- PHASE 1: AUTO-CLASSIFICATION SUPPORT
-- =============================================================================

-- Classification metadata on documents, added in one multi-action ALTER TABLE.
-- IF NOT EXISTS on every action keeps the statement safe to re-run.
ALTER TABLE documents
    -- How the document metadata was determined.
    -- Values: 'filename', 'llm_classification', 'manual', 'hybrid'
    ADD COLUMN IF NOT EXISTS metadata_source VARCHAR(50) DEFAULT 'filename',
    -- LLM classification confidence
    ADD COLUMN IF NOT EXISTS classification_confidence FLOAT,
    -- Set when the document needs human review
    ADD COLUMN IF NOT EXISTS needs_review BOOLEAN DEFAULT FALSE,
    -- Primary category for logical organization
    ADD COLUMN IF NOT EXISTS primary_category VARCHAR(100),
    -- Content type classification.
    -- Values: 'lecture', 'essay', 'book_chapter', 'academic_paper', 'manual', 'letter', 'interview', 'other'
    ADD COLUMN IF NOT EXISTS content_type VARCHAR(50),
    -- Difficulty level.
    -- Values: 'introductory', 'intermediate', 'advanced', 'expert'
    ADD COLUMN IF NOT EXISTS difficulty_level VARCHAR(20);

-- Review flag and self-referencing hierarchy for topics.
ALTER TABLE topics
    -- TRUE for topics that were created automatically (kept for later review)
    ADD COLUMN IF NOT EXISTS auto_created BOOLEAN DEFAULT FALSE,
    -- Optional parent topic; references another row of this same table
    ADD COLUMN IF NOT EXISTS parent_topic_id INTEGER REFERENCES topics (topic_id),
    -- Depth indicator; defaults to 0
    ADD COLUMN IF NOT EXISTS hierarchy_level INTEGER DEFAULT 0;

-- Review flag and self-referencing hierarchy for concepts.
ALTER TABLE concepts
    -- TRUE for concepts that were created automatically (kept for later review)
    ADD COLUMN IF NOT EXISTS auto_created BOOLEAN DEFAULT FALSE,
    -- Optional parent concept; references another row of this same table
    ADD COLUMN IF NOT EXISTS parent_concept_id INTEGER REFERENCES concepts (concept_id),
    -- Depth indicator; defaults to 0
    ADD COLUMN IF NOT EXISTS hierarchy_level INTEGER DEFAULT 0;

-- =============================================================================
-- PHASE 2: SEMANTIC CLUSTERING
-- =============================================================================

-- Semantic clusters discovered through embedding analysis.
-- Clusters can nest: parent_cluster_id points at another row of this table.
CREATE TABLE IF NOT EXISTS document_clusters (
    cluster_id        SERIAL,
    -- AI-generated or manual name
    name              VARCHAR(255),
    description       TEXT,
    -- Mathematical center of cluster
    centroid          vector(1536),
    document_count    INTEGER   DEFAULT 0,
    -- How tight the cluster is (0-1)
    coherence_score   FLOAT,
    auto_generated    BOOLEAN   DEFAULT TRUE,
    reviewed          BOOLEAN   DEFAULT FALSE,
    parent_cluster_id INTEGER,
    created_at        TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
    updated_at        TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
    PRIMARY KEY (cluster_id),
    FOREIGN KEY (parent_cluster_id) REFERENCES document_clusters (cluster_id)
);

-- Trigger for updated_at: runs update_updated_at() before every row update on
-- document_clusters. That function is not created in this file (presumably it
-- comes from the base schema this script is applied after).
--
-- CREATE TRIGGER has no IF NOT EXISTS form, so any earlier copy is dropped
-- first. Without the DROP, running this script a second time aborts here with
-- "trigger ... already exists".
DROP TRIGGER IF EXISTS tr_document_clusters_updated_at ON document_clusters;

CREATE TRIGGER tr_document_clusters_updated_at
    BEFORE UPDATE ON document_clusters
    FOR EACH ROW
    EXECUTE FUNCTION update_updated_at();

-- IVFFlat index over cluster centroids for cosine-distance similarity searches.
-- NOTE(review): pgvector advises building IVFFlat indexes once the table holds
-- data; on a fresh install this runs against an empty table — confirm whether
-- the index should be rebuilt after clusters are loaded.
CREATE INDEX IF NOT EXISTS idx_cluster_centroid
    ON document_clusters
    USING ivfflat (centroid vector_cosine_ops)
    WITH (lists = 50);

-- Many-to-many link between documents and clusters: a document can belong to
-- several clusters. Rows are removed automatically when either side is deleted.
CREATE TABLE IF NOT EXISTS document_cluster_membership (
    document_id            VARCHAR(100),
    cluster_id             INTEGER,
    -- How close to cluster center
    similarity_to_centroid FLOAT,
    is_primary_cluster     BOOLEAN   DEFAULT FALSE,
    assigned_at            TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
    PRIMARY KEY (document_id, cluster_id),
    FOREIGN KEY (document_id)
        REFERENCES documents (document_id) ON DELETE CASCADE,
    FOREIGN KEY (cluster_id)
        REFERENCES document_clusters (cluster_id) ON DELETE CASCADE
);

-- Supports "all members of cluster X" lookups; the primary key leads with
-- document_id, so cluster_id needs its own index.
CREATE INDEX IF NOT EXISTS idx_cluster_membership_cluster
    ON document_cluster_membership (cluster_id);

-- Partial index covering only the rows flagged as a primary cluster.
CREATE INDEX IF NOT EXISTS idx_cluster_membership_primary
    ON document_cluster_membership (document_id)
    WHERE is_primary_cluster = TRUE;

-- =============================================================================
-- PHASE 3: HIERARCHICAL CHUNKING
-- =============================================================================

-- Hierarchical chunking columns on chunks, added in one multi-action ALTER.
ALTER TABLE chunks
    -- Parent-child relationship: references another row of this same table
    ADD COLUMN IF NOT EXISTS parent_chunk_id VARCHAR(100) REFERENCES chunks (chunk_id),
    -- Chunk level indicator.
    -- Values: 'parent', 'child', 'standard'
    ADD COLUMN IF NOT EXISTS chunk_level VARCHAR(20) DEFAULT 'standard',
    -- Section/chapter title for context
    ADD COLUMN IF NOT EXISTS section_title VARCHAR(255),
    -- Heading level if the chunk comes from a heading
    ADD COLUMN IF NOT EXISTS heading_level INTEGER;

-- Parent-child lookups; partial so chunks without a parent are not indexed.
CREATE INDEX IF NOT EXISTS idx_chunks_parent
    ON chunks (parent_chunk_id)
    WHERE parent_chunk_id IS NOT NULL;

-- Filtering by chunk level.
CREATE INDEX IF NOT EXISTS idx_chunks_level
    ON chunks (chunk_level);

-- =============================================================================
-- PHASE 4: CROSS-DOCUMENT CONNECTIONS
-- =============================================================================

-- Discovered semantic connections between chunks across documents.
-- A connection disappears automatically when either chunk is deleted.
-- NOTE(review): the unique constraint is on the ordered pair, so (a, b) and
-- (b, a) can both be stored — confirm writers normalise the pair order.
CREATE TABLE IF NOT EXISTS chunk_connections (
    connection_id    SERIAL,
    chunk_a_id       VARCHAR(100),
    chunk_b_id       VARCHAR(100),
    similarity_score FLOAT NOT NULL,
    -- 'semantic', 'citation', 'concept', 'manual'
    connection_type  VARCHAR(50) DEFAULT 'semantic',
    discovered_at    TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
    PRIMARY KEY (connection_id),
    FOREIGN KEY (chunk_a_id) REFERENCES chunks (chunk_id) ON DELETE CASCADE,
    FOREIGN KEY (chunk_b_id) REFERENCES chunks (chunk_id) ON DELETE CASCADE,
    UNIQUE (chunk_a_id, chunk_b_id)
);

-- Lookups from either endpoint of a connection.
CREATE INDEX IF NOT EXISTS idx_chunk_connections_a
    ON chunk_connections (chunk_a_id);

CREATE INDEX IF NOT EXISTS idx_chunk_connections_b
    ON chunk_connections (chunk_b_id);

-- Strongest-first ordering by similarity score.
CREATE INDEX IF NOT EXISTS idx_chunk_connections_score
    ON chunk_connections (similarity_score DESC);

-- =============================================================================
-- PHASE 5: RAG CONVERSATION HISTORY
-- =============================================================================

-- RAG chat sessions, stored for context and analytics.
CREATE TABLE IF NOT EXISTS chat_sessions (
    session_id      SERIAL,
    -- Optional user tracking
    user_identifier VARCHAR(255),
    started_at      TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
    last_activity   TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
    message_count   INTEGER   DEFAULT 0,
    PRIMARY KEY (session_id)
);

-- Individual messages of a chat session; deleted together with their session.
CREATE TABLE IF NOT EXISTS chat_messages (
    message_id   SERIAL,
    session_id   INTEGER,
    -- 'user', 'assistant', 'system'
    role         VARCHAR(20) NOT NULL,
    content      TEXT NOT NULL,
    -- Array of chunk_ids used for response
    sources_used JSONB,
    model_used   VARCHAR(100),
    tokens_used  INTEGER,
    created_at   TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
    PRIMARY KEY (message_id),
    FOREIGN KEY (session_id)
        REFERENCES chat_sessions (session_id) ON DELETE CASCADE
);

-- Fetching the messages of one session.
CREATE INDEX IF NOT EXISTS idx_chat_messages_session
    ON chat_messages (session_id);

-- =============================================================================
-- HELPER VIEWS
-- =============================================================================

-- Review list of automatically created taxonomy entries.
-- Topics and concepts are stacked into one result, each with the number of
-- link rows that reference it, most-used first.
CREATE OR REPLACE VIEW v_auto_taxonomy_review AS
SELECT
    'topic'     AS type,
    t.topic_id  AS id,
    t.name,
    t.auto_created,
    (
        SELECT COUNT(*)
        FROM document_topics dt
        WHERE dt.topic_id = t.topic_id
    )           AS usage_count,
    t.created_at
FROM topics t
WHERE t.auto_created = TRUE

UNION ALL

SELECT
    'concept'    AS type,
    c.concept_id AS id,
    c.name,
    c.auto_created,
    (
        SELECT COUNT(*)
        FROM document_concepts dcon
        WHERE dcon.concept_id = c.concept_id
    )            AS usage_count,
    c.created_at
FROM concepts c
WHERE c.auto_created = TRUE

ORDER BY usage_count DESC;

-- Cluster overview: one row per cluster, pairing the stored document_count
-- with the count actually present in the membership table, plus the average
-- similarity of its members. Clusters without members are kept (LEFT JOIN).
CREATE OR REPLACE VIEW v_cluster_overview AS
SELECT
    cl.cluster_id,
    cl.name,
    cl.document_count,
    cl.coherence_score,
    cl.auto_generated,
    cl.reviewed,
    COUNT(DISTINCT m.document_id) AS actual_document_count,
    AVG(m.similarity_to_centroid) AS avg_similarity,
    cl.created_at
FROM document_clusters AS cl
LEFT JOIN document_cluster_membership AS m
    ON m.cluster_id = cl.cluster_id
GROUP BY
    cl.cluster_id,
    cl.name,
    cl.document_count,
    cl.coherence_score,
    cl.auto_generated,
    cl.reviewed,
    cl.created_at
ORDER BY cl.document_count DESC;

-- View for documents with full classification info: one row per document with
-- its author, classification columns, primary cluster and the names of its
-- topics and concepts (as arrays; NULL when the document has none).
--
-- Nothing in the schema limits a document to a single membership row with
-- is_primary_cluster = TRUE, so a plain join on that flag can return the same
-- document several times. The LATERAL subquery picks at most one primary
-- cluster per document: highest similarity_to_centroid first, lowest
-- cluster_id as the tie-breaker.
CREATE OR REPLACE VIEW v_documents_classified AS
SELECT
    d.document_id,
    d.title,
    a.name AS author_name,
    d.publication_year,
    d.primary_category,
    d.content_type,
    d.difficulty_level,
    d.metadata_source,
    d.classification_confidence,
    d.needs_review,
    d.processing_status,
    dc.cluster_id AS primary_cluster_id,
    cl.name AS cluster_name,
    -- ORDER BY inside array_agg keeps the array order stable between queries
    (SELECT array_agg(t.name ORDER BY t.name) FROM document_topics dt
     JOIN topics t ON dt.topic_id = t.topic_id
     WHERE dt.document_id = d.document_id) AS topics,
    (SELECT array_agg(c.name ORDER BY c.name) FROM document_concepts dcon
     JOIN concepts c ON dcon.concept_id = c.concept_id
     WHERE dcon.document_id = d.document_id) AS concepts,
    d.created_at
FROM documents d
LEFT JOIN authors a ON d.author_id = a.author_id
LEFT JOIN LATERAL (
    SELECT dcm.cluster_id
    FROM document_cluster_membership dcm
    WHERE dcm.document_id = d.document_id
      AND dcm.is_primary_cluster = TRUE
    ORDER BY dcm.similarity_to_centroid DESC NULLS LAST, dcm.cluster_id
    LIMIT 1
) dc ON TRUE
LEFT JOIN document_clusters cl ON dc.cluster_id = cl.cluster_id;

-- =============================================================================
-- HELPER FUNCTIONS
-- =============================================================================

-- Returns the parent chunk of a child chunk together with the child's offset
-- among its siblings.
--
-- Parameters:
--   p_child_chunk_id  chunk_id of the child chunk.
-- Returns (no rows when the chunk is unknown or has no parent):
--   parent_id       chunk_id of the parent chunk
--   parent_text     chunk_text of the parent chunk
--   section_title   section_title of the parent chunk
--   child_position  the child's chunk_sequence minus the smallest
--                   chunk_sequence among chunks sharing that parent
CREATE OR REPLACE FUNCTION get_parent_context(p_child_chunk_id VARCHAR)
RETURNS TABLE (
    parent_id VARCHAR,
    parent_text TEXT,
    section_title VARCHAR,
    child_position INTEGER
)
LANGUAGE plpgsql
AS $$
BEGIN
    RETURN QUERY
    SELECT
        parent_chunk.chunk_id,
        parent_chunk.chunk_text,
        parent_chunk.section_title,
        child_chunk.chunk_sequence - first_sibling.min_sequence
    FROM chunks AS child_chunk
    INNER JOIN chunks AS parent_chunk
        ON parent_chunk.chunk_id = child_chunk.parent_chunk_id
    -- An aggregate without GROUP BY always yields exactly one row, so this
    -- lateral join never adds or removes result rows.
    CROSS JOIN LATERAL (
        SELECT MIN(sibling.chunk_sequence) AS min_sequence
        FROM chunks AS sibling
        WHERE sibling.parent_chunk_id = parent_chunk.chunk_id
    ) AS first_sibling
    WHERE child_chunk.chunk_id = p_child_chunk_id;
END;
$$;

-- Finds chunks in OTHER documents whose embedding is similar to the given
-- chunk's embedding.
--
-- Parameters:
--   p_chunk_id              chunk_id of the source chunk.
--   p_similarity_threshold  minimum similarity to include (default 0.8).
--   p_limit                 maximum number of rows returned (default 10).
-- Returns, most similar first:
--   related_chunk_id, document_id, document_title, similarity, chunk_text
--   where similarity = 1 - (embedding <=> source embedding).
-- Raises:
--   'Chunk % not found'         when no chunk has the given id
--   'Chunk % has no embedding'  when the chunk exists but its embedding is NULL
--
-- Declared STABLE because the function only reads data.
CREATE OR REPLACE FUNCTION find_related_chunks(
    p_chunk_id VARCHAR,
    p_similarity_threshold FLOAT DEFAULT 0.8,
    p_limit INTEGER DEFAULT 10
)
RETURNS TABLE (
    related_chunk_id VARCHAR,
    document_id VARCHAR,
    document_title VARCHAR,
    similarity FLOAT,
    chunk_text TEXT
) AS $$
DECLARE
    v_embedding vector(1536);
    v_document_id VARCHAR;
BEGIN
    -- Get the embedding and document_id of the source chunk
    SELECT c.embedding, c.document_id INTO v_embedding, v_document_id
    FROM chunks c WHERE c.chunk_id = p_chunk_id;

    -- A missing row also leaves v_embedding NULL, so test FOUND first to keep
    -- the two failure modes distinguishable for the caller.
    IF NOT FOUND THEN
        RAISE EXCEPTION 'Chunk % not found', p_chunk_id;
    END IF;

    IF v_embedding IS NULL THEN
        RAISE EXCEPTION 'Chunk % has no embedding', p_chunk_id;
    END IF;

    RETURN QUERY
    SELECT
        c.chunk_id AS related_chunk_id,
        c.document_id,
        d.title AS document_title,
        1 - (c.embedding <=> v_embedding) AS similarity,
        c.chunk_text
    FROM chunks c
    JOIN documents d ON c.document_id = d.document_id
    WHERE c.chunk_id != p_chunk_id
      AND c.document_id != v_document_id  -- Different document
      AND c.embedding IS NOT NULL
      AND 1 - (c.embedding <=> v_embedding) >= p_similarity_threshold
    ORDER BY c.embedding <=> v_embedding
    LIMIT p_limit;
END;
$$ LANGUAGE plpgsql STABLE;

-- Lists the documents of one cluster, ranked by similarity to the centroid.
--
-- Parameters:
--   p_cluster_id  cluster whose members are listed.
--   p_limit       maximum number of rows returned (default 50).
-- Returns, most similar first:
--   document_id, title, author_name (NULL when the document has no matching
--   author row), similarity_to_centroid, primary_category
--
-- Ordering notes:
--   * similarity_to_centroid is nullable and PostgreSQL sorts NULLs first
--     under DESC, so NULLS LAST is required to stop unscored members from
--     outranking scored ones and using up the LIMIT.
--   * document_id is the tie-breaker so equal scores come back in a stable
--     order.
-- Declared STABLE because the function only reads data.
CREATE OR REPLACE FUNCTION get_cluster_documents(
    p_cluster_id INTEGER,
    p_limit INTEGER DEFAULT 50
)
RETURNS TABLE (
    document_id VARCHAR,
    title VARCHAR,
    author_name VARCHAR,
    similarity_to_centroid FLOAT,
    primary_category VARCHAR
) AS $$
BEGIN
    RETURN QUERY
    SELECT
        d.document_id,
        d.title,
        a.name AS author_name,
        dcm.similarity_to_centroid,
        d.primary_category
    FROM document_cluster_membership dcm
    JOIN documents d ON dcm.document_id = d.document_id
    LEFT JOIN authors a ON d.author_id = a.author_id
    WHERE dcm.cluster_id = p_cluster_id
    ORDER BY dcm.similarity_to_centroid DESC NULLS LAST, d.document_id
    LIMIT p_limit;
END;
$$ LANGUAGE plpgsql STABLE;

-- =============================================================================
-- UPDATE SEARCH HISTORY FOR RAG TRACKING
-- =============================================================================

-- RAG tracking columns on search_history, added in one multi-action ALTER.
ALTER TABLE search_history
    -- TRUE when the search was issued as a RAG query
    ADD COLUMN IF NOT EXISTS is_rag_query BOOLEAN DEFAULT FALSE,
    -- Token count of the RAG response
    ADD COLUMN IF NOT EXISTS rag_response_tokens INTEGER;

-- =============================================================================
-- ADDITIONAL INDEXES FOR PERFORMANCE
-- =============================================================================

-- Filtering by metadata source.
CREATE INDEX IF NOT EXISTS idx_documents_metadata_source
    ON documents (metadata_source);

-- Partial index: only documents currently flagged for review are indexed.
CREATE INDEX IF NOT EXISTS idx_documents_needs_review
    ON documents (needs_review)
    WHERE needs_review = TRUE;

-- Filtering by primary category.
CREATE INDEX IF NOT EXISTS idx_documents_primary_category
    ON documents (primary_category);

-- Filtering by content type.
CREATE INDEX IF NOT EXISTS idx_documents_content_type
    ON documents (content_type);

-- Composite index for the category + content type filter combination.
CREATE INDEX IF NOT EXISTS idx_documents_category_type
    ON documents (primary_category, content_type);

-- =============================================================================
-- COMMENTS
-- =============================================================================

-- Catalog descriptions for the tables created by this script.
COMMENT ON TABLE document_clusters
    IS 'Semantic clusters discovered through embedding analysis';
COMMENT ON TABLE document_cluster_membership
    IS 'Links documents to semantic clusters with similarity scores';
COMMENT ON TABLE chunk_connections
    IS 'Discovered semantic connections between chunks across documents';
COMMENT ON TABLE chat_sessions
    IS 'RAG chat session tracking';
COMMENT ON TABLE chat_messages
    IS 'Individual messages in RAG chat sessions';

-- Catalog descriptions for selected columns added by this script.
COMMENT ON COLUMN documents.metadata_source
    IS 'How metadata was determined: filename, llm_classification, manual, hybrid';
COMMENT ON COLUMN documents.classification_confidence
    IS 'LLM confidence in classification (0.0-1.0)';
COMMENT ON COLUMN documents.needs_review
    IS 'Flag for documents requiring human review of classification';
COMMENT ON COLUMN chunks.parent_chunk_id
    IS 'Reference to parent chunk for hierarchical chunking';
COMMENT ON COLUMN chunks.chunk_level
    IS 'Chunk hierarchy level: parent, child, or standard';
