#!/bin/bash
# =============================================================================
# Research Library Processing Script
# =============================================================================
# Complete workflow for processing documents with optional adaptive tuning.
#
# Workflow:
#   1. Ingest Documents  - Parse PDFs, extract text, classify
#   2. Chunk Documents   - Split into searchable blocks
#   3. Auto-Glossary     - (Optional) Generate domain-specific glossary
#   4. Extract Concepts  - Find and weight concepts using glossary
#   5. Generate Keys     - Create BibTeX citation keys
#   6. Embeddings        - (Optional) Generate OpenAI embeddings
#
# Usage:
#   ./process_library.sh                           # Standard processing
#   ./process_library.sh --theme "Anthroposophy"   # With adaptive tuning
#   ./process_library.sh --full                    # Include embeddings
#   RESEARCH_THEME="Alchemy" ./process_library.sh  # Via environment
#
# =============================================================================

# Stop at the first unguarded command that exits non-zero.
set -o errexit

# Location of this script; everything below uses paths relative to it,
# so switch there regardless of where the caller invoked us from.
SCRIPT_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
cd "$SCRIPT_DIR"

# ANSI colour codes, kept as literal backslash sequences; the logging
# helpers expand them at print time.
NC='\033[0m'          # reset / no colour
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'

# Option defaults (overridden by the command-line parser below).
# The theme may be pre-seeded through the RESEARCH_THEME environment variable.
THEME=${RESEARCH_THEME:-}
GLOSSARY_FILE=''
GENERATE_EMBEDDINGS=false
AUTO_REPAIR=false
SKIP_INGEST=false
SKIP_CHUNK=false
VERBOSE=false
# Parse arguments
#######################################
# Print the help text to stdout.
# Outputs: usage, option list, environment variables and examples.
#######################################
show_usage() {
    cat <<EOF
Usage: $0 [OPTIONS]

Options:
  --theme, -t THEME    Research theme for adaptive glossary generation
  --glossary, -g FILE  Use existing glossary file instead of generating
  --full, -f           Include OpenAI embedding generation (requires API key)
  --auto-repair, -r    Auto-OCR PDFs with poor text quality
  --skip-ingest        Skip document ingestion (reprocess existing)
  --skip-chunk         Skip chunking (only run concept extraction)
  --verbose, -v        Show detailed output
  --help, -h           Show this help message

Environment Variables:
  RESEARCH_THEME       Alternative to --theme flag

Examples:
  # Standard processing
  ./process_library.sh

  # With Anthroposophy glossary tuning
  ./process_library.sh --theme 'Anthroposophy and Rudolf Steiner'

  # Use pre-made glossary
  ./process_library.sh --glossary glossaries/anthroposophy_sample.txt

  # Full processing with embeddings
  ./process_library.sh --theme 'Freemasonry' --full

  # Reprocess concepts only
  ./process_library.sh --skip-ingest --skip-chunk --theme 'Alchemy'
EOF
}

#######################################
# Exit with a diagnostic unless an option is followed by a value.
# Without this check, `shift 2` on a trailing option fails and the script
# dies under errexit without telling the user why.
# Arguments: $1 - option as typed, $2 - number of arguments still unparsed
#######################################
require_option_value() {
    if [[ $2 -lt 2 ]]; then
        echo "Error: option $1 requires a value (see --help)" >&2
        exit 1
    fi
}

#######################################
# Parse the command line into the option globals.
# Globals:   THEME, GLOSSARY_FILE, GENERATE_EMBEDDINGS, SKIP_INGEST,
#            SKIP_CHUNK, VERBOSE, AUTO_REPAIR (written)
# Arguments: the script's command-line arguments
# Exits:     0 after printing help; 1 on an unknown option or a missing
#            option value (diagnostic on stderr)
#######################################
parse_args() {
    while [[ $# -gt 0 ]]; do
        case "$1" in
            --theme|-t)
                require_option_value "$1" "$#"
                THEME="$2"
                shift 2
                ;;
            --glossary|-g)
                require_option_value "$1" "$#"
                GLOSSARY_FILE="$2"
                shift 2
                ;;
            --full|-f)
                GENERATE_EMBEDDINGS=true
                shift
                ;;
            --skip-ingest)
                SKIP_INGEST=true
                shift
                ;;
            --skip-chunk)
                SKIP_CHUNK=true
                shift
                ;;
            --verbose|-v)
                VERBOSE=true
                shift
                ;;
            --auto-repair|-r)
                AUTO_REPAIR=true
                shift
                ;;
            --help|-h)
                show_usage
                exit 0
                ;;
            *)
                echo "Unknown option: $1" >&2
                exit 1
                ;;
        esac
    done
}

parse_args "$@"

# Helper functions
# Print a highlighted step heading ("==> <text>") to stdout.
# Globals:   BLUE, GREEN, NC (read) - colour escape sequences
# Arguments: $1 - heading text, printed verbatim
log_step() {
    # %b expands the escape sequences held in the colour variables, while the
    # heading itself goes through %s so any backslashes in it are left alone.
    printf '%b==>%b %b%s%b\n' "$BLUE" "$NC" "$GREEN" "$1" "$NC"
}

# Print an informational line to stdout, indented by four spaces.
# Arguments: $1 - message text, printed verbatim
log_info() {
    # %s keeps backslashes in the message (e.g. in a theme or file path)
    # literal instead of expanding them as escape sequences.
    printf '    %s\n' "$1"
}

# Print a "Warning:" line to stderr so it stays out of captured stdout.
# Globals:   YELLOW, NC (read) - colour escape sequences
# Arguments: $1 - message text, printed verbatim
log_warn() {
    # Colours go through %b (escape expansion); the message goes through %s
    # so backslashes in it are not interpreted.
    printf '%bWarning:%b %s\n' "$YELLOW" "$NC" "$1" >&2
}

# Print an "Error:" line to stderr so it stays out of captured stdout.
# Globals:   RED, NC (read) - colour escape sequences
# Arguments: $1 - message text, printed verbatim
log_error() {
    # Colours go through %b (escape expansion); the message goes through %s
    # so backslashes in it (e.g. in a file path) are not interpreted.
    printf '%bError:%b %s\n' "$RED" "$NC" "$1" >&2
}

# Check for new documents
# Count the documents waiting directly inside NEW_DOCS/ (no recursion).
# Only regular files named *.pdf, *.docx, *.txt or *.md are counted.
# Outputs: the count as a bare integer on stdout; 0 if NEW_DOCS/ is missing
check_new_docs() {
    local count

    if [[ ! -d NEW_DOCS ]]; then
        echo 0
        return 0
    fi

    # Emit one NUL per match and count the NULs, so a filename containing a
    # newline is still counted once.
    count=$(find NEW_DOCS -maxdepth 1 -type f \
        \( -name "*.pdf" -o -name "*.docx" -o -name "*.txt" -o -name "*.md" \) \
        -print0 2>/dev/null | tr -cd '\0' | wc -c)

    # Arithmetic expansion strips the padding some wc implementations add.
    echo "$((count))"
}

# =============================================================================
# MAIN WORKFLOW
# =============================================================================

# Opening banner, surrounded by blank lines.
printf '%s\n' \
    "" \
    "==============================================" \
    "  RESEARCH LIBRARY PROCESSOR" \
    "==============================================" \
    ""

# Report only the options that were actually switched on or supplied.
if [[ -n "$THEME" ]]; then log_info "Theme: $THEME"; fi
if [[ -n "$GLOSSARY_FILE" ]]; then log_info "Glossary: $GLOSSARY_FILE"; fi
if [[ "$GENERATE_EMBEDDINGS" == true ]]; then log_info "Embeddings: Enabled"; fi
if [[ "$AUTO_REPAIR" == true ]]; then log_info "Auto-repair: Enabled (OCR fallback for poor PDFs)"; fi
printf '\n'

# -----------------------------------------------------------------------------
# Step 1: Document Ingestion
# -----------------------------------------------------------------------------
# Ingest the files waiting in NEW_DOCS/ unless --skip-ingest was given.
if [ "$SKIP_INGEST" = false ]; then
    log_step "Step 1: Document Ingestion"

    NEW_DOCS=$(check_new_docs)
    if [ "$NEW_DOCS" -gt 0 ]; then
        log_info "Found $NEW_DOCS new document(s) to process"

        # Optional flags live in an array so each is passed as exactly one
        # argument, without relying on word splitting of an unquoted string.
        INGEST_OPTS=()
        if [ "$AUTO_REPAIR" = true ]; then
            INGEST_OPTS+=(--auto-repair)
        fi

        if [ "$VERBOSE" = true ]; then
            python3 pipeline/ingest_documents.py "${INGEST_OPTS[@]}"
        else
            # Quiet mode shows only the last lines of output. A pipeline
            # reports the status of its final command (tail), so the
            # ingester's own status is read from PIPESTATUS; otherwise a
            # failed run would be announced as complete.
            python3 pipeline/ingest_documents.py "${INGEST_OPTS[@]}" 2>&1 | tail -10
            INGEST_STATUS=${PIPESTATUS[0]}
            if [ "$INGEST_STATUS" -ne 0 ]; then
                log_error "Document ingestion failed (exit status $INGEST_STATUS); re-run with --verbose for full output"
                exit "$INGEST_STATUS"
            fi
        fi

        log_info "Ingestion complete"
    else
        log_info "No new documents in NEW_DOCS/"
    fi
    echo ""
else
    log_info "Skipping ingestion (--skip-ingest)"
    echo ""
fi

# -----------------------------------------------------------------------------
# Step 2: Document Chunking
# -----------------------------------------------------------------------------
# Split ingested documents into chunks unless --skip-chunk was given.
if [ "$SKIP_CHUNK" = false ]; then
    log_step "Step 2: Document Chunking"

    if [ "$VERBOSE" = true ]; then
        python3 pipeline/chunk_documents.py
    else
        # Quiet mode shows only the last lines of output. The pipeline's own
        # status is tail's, so the chunker's status is taken from PIPESTATUS
        # to keep a failure from being reported as success.
        python3 pipeline/chunk_documents.py 2>&1 | tail -5
        CHUNK_STATUS=${PIPESTATUS[0]}
        if [ "$CHUNK_STATUS" -ne 0 ]; then
            log_error "Document chunking failed (exit status $CHUNK_STATUS); re-run with --verbose for full output"
            exit "$CHUNK_STATUS"
        fi
    fi

    log_info "Chunking complete"
    echo ""
else
    log_info "Skipping chunking (--skip-chunk)"
    echo ""
fi

# -----------------------------------------------------------------------------
# Step 3: Adaptive Glossary Generation (Optional)
# -----------------------------------------------------------------------------
# Count the entries in a glossary file: every line that is neither empty
# nor starts with '#'.
# Arguments: $1 - path to the glossary file
# Outputs:   the count on stdout
count_glossary_terms() {
    # grep -c exits 1 when the count is zero; that is not an error here.
    grep -Ecv '^(#|$)' -- "$1" || true
}

# Three cases:
#   theme given, no glossary file -> generate a glossary for the theme
#   glossary file given           -> validate and use it as-is
#   neither                       -> nothing to do
if [ -n "$THEME" ] && [ -z "$GLOSSARY_FILE" ]; then
    log_step "Step 3: Generating Adaptive Glossary"
    log_info "Theme: $THEME"

    # Derive the filename from the theme: lowercase, every character outside
    # a-z0-9 replaced by '_', runs of '_' collapsed to one.
    SAFE_THEME=$(printf '%s\n' "$THEME" | tr '[:upper:]' '[:lower:]' | sed 's/[^a-z0-9]/_/g' | sed 's/__*/_/g')
    GLOSSARY_FILE="glossaries/${SAFE_THEME}_auto.txt"

    # Create glossaries directory
    mkdir -p glossaries

    # The generator runs as an `if` condition so that a non-zero exit does not
    # trigger errexit: the fallback below ("continuing without") can then
    # actually run. Requiring a zero exit as well as the file also keeps a
    # stale file from an earlier run from being mistaken for fresh output.
    if python3 pipeline/auto_glossary.py \
            --theme "$THEME" \
            --output "$GLOSSARY_FILE" \
            --samples 50 \
            --terms 25 \
        && [ -f "$GLOSSARY_FILE" ]; then
        TERM_COUNT=$(count_glossary_terms "$GLOSSARY_FILE")
        log_info "Generated $TERM_COUNT glossary terms -> $GLOSSARY_FILE"
    else
        log_warn "Glossary generation failed, continuing without"
        GLOSSARY_FILE=""
    fi
    echo ""
elif [ -n "$GLOSSARY_FILE" ]; then
    log_step "Step 3: Using Existing Glossary"
    if [ -f "$GLOSSARY_FILE" ]; then
        TERM_COUNT=$(count_glossary_terms "$GLOSSARY_FILE")
        log_info "Loaded $TERM_COUNT terms from $GLOSSARY_FILE"
    else
        # An explicitly requested glossary that does not exist is fatal.
        log_error "Glossary file not found: $GLOSSARY_FILE"
        exit 1
    fi
    echo ""
else
    log_info "Step 3: Skipping glossary generation (no theme specified)"
    echo ""
fi

# -----------------------------------------------------------------------------
# Step 4: Concept Extraction
# -----------------------------------------------------------------------------
# Run concept extraction, passing the glossary along when one is available.
log_step "Step 4: Concept Extraction"

# Arguments are built once in an array so the verbose and quiet invocations
# cannot drift apart and the glossary path is passed as a single argument.
EXTRACT_ARGS=()
if [ -n "$GLOSSARY_FILE" ] && [ -f "$GLOSSARY_FILE" ]; then
    log_info "Using glossary: $GLOSSARY_FILE"
    EXTRACT_ARGS+=(--glossary "$GLOSSARY_FILE")
else
    log_info "Standard extraction (no glossary)"
fi

if [ "$VERBOSE" = true ]; then
    python3 pipeline/extract_concepts.py "${EXTRACT_ARGS[@]}"
else
    # Quiet mode shows only the last lines of output. The pipeline's status
    # is tail's, so the extractor's own status is read from PIPESTATUS to
    # keep a failure from being reported as success.
    python3 pipeline/extract_concepts.py "${EXTRACT_ARGS[@]}" 2>&1 | tail -10
    EXTRACT_STATUS=${PIPESTATUS[0]}
    if [ "$EXTRACT_STATUS" -ne 0 ]; then
        log_error "Concept extraction failed (exit status $EXTRACT_STATUS); re-run with --verbose for full output"
        exit "$EXTRACT_STATUS"
    fi
fi

log_info "Concept extraction complete"
echo ""

# -----------------------------------------------------------------------------
# Step 5: Generate Citation Keys
# -----------------------------------------------------------------------------
# Generate citation keys; only the last three lines of output are shown.
log_step "Step 5: Generating Citation Keys"

python3 pipeline/edit_metadata.py --generate-keys 2>&1 | tail -3
# The pipeline's status is tail's, so a failing key generator would go
# unnoticed. The step stays best-effort (no abort) but the failure is reported.
KEYS_STATUS=${PIPESTATUS[0]}
if [ "$KEYS_STATUS" -ne 0 ]; then
    log_warn "Citation key generation exited with status $KEYS_STATUS"
fi

echo ""

# -----------------------------------------------------------------------------
# Step 6: Embeddings (Optional)
# -----------------------------------------------------------------------------
# Generate embeddings only when --full was given and an API key is available.
if [ "$GENERATE_EMBEDDINGS" = true ]; then
    log_step "Step 6: Generating Embeddings"

    # A key counts as present when OPENAI_API_KEY is non-empty in the
    # environment or the string OPENAI_API_KEY appears anywhere in .env
    # (a missing .env is treated as "not present").
    if [ -z "${OPENAI_API_KEY:-}" ] && ! grep -q "OPENAI_API_KEY" .env 2>/dev/null; then
        log_warn "No OpenAI API key found, skipping embeddings"
    else
        if [ "$VERBOSE" = true ]; then
            python3 pipeline/generate_embeddings.py
        else
            # Quiet mode shows only the last lines of output. The pipeline's
            # status is tail's, so the generator's own status is read from
            # PIPESTATUS to keep a failure from being reported as complete.
            python3 pipeline/generate_embeddings.py 2>&1 | tail -5
            EMBED_STATUS=${PIPESTATUS[0]}
            if [ "$EMBED_STATUS" -ne 0 ]; then
                log_error "Embedding generation failed (exit status $EMBED_STATUS); re-run with --verbose for full output"
                exit "$EMBED_STATUS"
            fi
        fi
        log_info "Embedding generation complete"
    fi
    echo ""
else
    log_info "Step 6: Skipping embeddings (use --full to enable)"
    echo ""
fi

# -----------------------------------------------------------------------------
# Summary
# -----------------------------------------------------------------------------
# Closing banner followed by one blank line.
cat <<'EOF'
==============================================
  PROCESSING COMPLETE
==============================================

EOF

# Row counts come from the project's database helper; if the query cannot
# run for any reason a "?" placeholder is appended in place of the number.
CHUNK_QUERY="from pipeline.db_utils import execute_query; print(len(execute_query('SELECT 1 FROM chunks', fetch='all')))"
DOC_QUERY="from pipeline.db_utils import execute_query; print(len(execute_query('SELECT 1 FROM documents', fetch='all')))"
DOC_COUNT=$(python3 -c "$DOC_QUERY" 2>/dev/null || echo "?")
CHUNK_COUNT=$(python3 -c "$CHUNK_QUERY" 2>/dev/null || echo "?")

log_info "Total Documents: $DOC_COUNT"
log_info "Total Chunks: $CHUNK_COUNT"

# Mention the glossary only if one was in effect and still exists on disk.
if [[ -n "$GLOSSARY_FILE" && -f "$GLOSSARY_FILE" ]]; then
    log_info "Glossary: $GLOSSARY_FILE"
fi

# Pointers for what to do next, framed by blank lines.
cat <<'EOF'

Next steps:
  - Search: python3 pipeline/search_export.py 'your query'
  - Chat:   curl -X POST localhost:5000/api/chat -d '{"message":"..."}'

EOF
