-- =============================================================================
-- RESEARCH DEVELOPMENT FRAMEWORK v4.0 - OCR Workflow Enhancements
-- =============================================================================
-- Adds re-OCR attempt tracking and archived status for CLI workflow
-- =============================================================================

-- Add 'archived' to the quality status enum.
-- Archived = document kept for metadata only, text unavailable/unusable.
--
-- ADD VALUE IF NOT EXISTS keeps this step re-runnable without a manual
-- pg_enum lookup. It is issued as a top-level statement because PostgreSQL
-- releases before 12 reject ALTER TYPE ... ADD VALUE when it is executed from
-- a function or DO block.
--
-- NOTE: when an enum value is added inside a transaction block it cannot be
-- used until that transaction has committed. The views defined later in this
-- file compare against 'archived', so this statement must be committed before
-- they are created (i.e. do not run the whole file as one transaction).
ALTER TYPE ocr_quality_status ADD VALUE IF NOT EXISTS 'archived' AFTER 'unusable';

-- Add re-OCR attempt tracking columns.
--
-- ADD COLUMN IF NOT EXISTS makes each addition idempotent and resolves
-- "documents" through the search_path. An information_schema.columns lookup
-- keyed only on table_name would also match a same-named table in any other
-- schema and could wrongly skip the column here.
--
-- NOTE: reocr_attempts defaults to 0 but remains nullable; consumers should
-- treat NULL as "no attempts yet".
ALTER TABLE documents
    -- Number of re-OCR attempts made so far
    ADD COLUMN IF NOT EXISTS reocr_attempts INTEGER DEFAULT 0,
    -- When the most recent re-OCR attempt happened
    ADD COLUMN IF NOT EXISTS reocr_last_attempt TIMESTAMP,
    -- Which OCR method the most recent attempt used
    ADD COLUMN IF NOT EXISTS reocr_last_method VARCHAR(50);

-- Index supporting lookups of documents by OCR quality status.
CREATE INDEX IF NOT EXISTS idx_documents_quality_status
    ON documents (quality_status);

-- Partial index on the attempt counter, limited to rows whose quality_status
-- is 'poor' or 'unusable'.
CREATE INDEX IF NOT EXISTS idx_documents_reocr_attempts
    ON documents (reocr_attempts)
    WHERE quality_status IN ('poor', 'unusable');

-- Attach catalog descriptions to the re-OCR tracking columns.
COMMENT ON COLUMN documents.reocr_attempts
    IS 'Number of re-OCR attempts made on this document';

COMMENT ON COLUMN documents.reocr_last_attempt
    IS 'Timestamp of most recent re-OCR attempt';

COMMENT ON COLUMN documents.reocr_last_method
    IS 'OCR method used in last attempt: tesseract_best, tesseract_fast, easyocr';

-- =============================================================================
-- HELPFUL VIEWS
-- =============================================================================

-- View for OCR workflow monitoring.
--
-- Lists every document whose quality_status is 'poor', 'unusable' or
-- 'archived', together with a derived workflow_status:
--   'archived'      - quality_status is 'archived'
--   'max_attempts'  - poor/unusable with 3 or more re-OCR attempts
--   'pending_reocr' - poor/unusable with fewer than 3 attempts
--
-- Rows are ordered unusable -> poor -> archived, then by fewest attempts,
-- then oldest first.
CREATE OR REPLACE VIEW v_ocr_queue AS
SELECT
    d.document_id,
    d.title,
    d.quality_status,
    d.quality_score,
    d.reocr_attempts,
    d.reocr_last_attempt,
    d.reocr_last_method,
    d.source_file,
    d.file_path,
    d.created_at,
    CASE
        WHEN d.quality_status = 'archived' THEN 'archived'
        -- A NULL attempt counter is treated as zero attempts.
        WHEN d.quality_status IN ('poor', 'unusable')
             AND COALESCE(d.reocr_attempts, 0) >= 3 THEN 'max_attempts'
        WHEN d.quality_status IN ('poor', 'unusable') THEN 'pending_reocr'
        -- Not reachable under the WHERE filter below; kept as a defensive
        -- default in case the filter is widened.
        ELSE 'ok'
    END AS workflow_status
FROM documents AS d
WHERE d.quality_status IN ('poor', 'unusable', 'archived')
ORDER BY
    CASE d.quality_status
        WHEN 'unusable' THEN 1
        WHEN 'poor' THEN 2
        WHEN 'archived' THEN 3
    END,
    -- Ascending sorts place NULLs last by default; coalescing keeps rows
    -- with a NULL counter alongside the other never-attempted rows.
    COALESCE(d.reocr_attempts, 0) ASC,
    d.created_at ASC,
    -- Final tiebreaker so the ordering is deterministic.
    d.document_id ASC;

-- Summary stats for CLI.
--
-- Single-row view with per-status document counts and, for poor/unusable
-- documents, a breakdown by re-OCR progress:
--   never_attempted      - 0 attempts
--   in_progress          - 1 or 2 attempts
--   max_attempts_reached - 3 or more attempts
--
-- A NULL reocr_attempts is counted as 0 so that the three progress buckets
-- always add up to poor_count + unusable_count.
CREATE OR REPLACE VIEW v_ocr_stats AS
SELECT
    COUNT(*) FILTER (WHERE quality_status = 'poor') AS poor_count,
    COUNT(*) FILTER (WHERE quality_status = 'unusable') AS unusable_count,
    COUNT(*) FILTER (WHERE quality_status = 'archived') AS archived_count,
    COUNT(*) FILTER (
        WHERE quality_status IN ('poor', 'unusable')
          AND COALESCE(reocr_attempts, 0) = 0
    ) AS never_attempted,
    COUNT(*) FILTER (
        WHERE quality_status IN ('poor', 'unusable')
          AND COALESCE(reocr_attempts, 0) > 0
          AND COALESCE(reocr_attempts, 0) < 3
    ) AS in_progress,
    COUNT(*) FILTER (
        WHERE quality_status IN ('poor', 'unusable')
          AND COALESCE(reocr_attempts, 0) >= 3
    ) AS max_attempts_reached
FROM documents;
