#!/usr/bin/env python3
"""
Semantic Clustering Pipeline for automatic document organization.

This module clusters documents based on their embedding similarity, enabling:
- Automatic discovery of document groups/themes
- Visual library organization
- Filtered search within clusters
- Cross-document relationship discovery

Supports two clustering modes:
1. Embedding-based: Uses OpenAI embeddings (requires API key)
2. TF-IDF-based: Uses term frequency vectors (fully offline)

Usage:
    python cluster_documents.py                    # Cluster all documents
    python cluster_documents.py --n-clusters 20   # Specify number of clusters
    python cluster_documents.py --method dbscan   # Use DBSCAN instead of K-means
    python cluster_documents.py --tfidf           # Use TF-IDF vectors instead of embeddings
    python cluster_documents.py --keep-existing   # Keep existing clusters instead of replacing them
"""

import logging
import argparse
from typing import List, Dict, Tuple, Optional, Any
from datetime import datetime
from collections import Counter

import numpy as np

# Try to import sklearn for clustering and TF-IDF
try:
    from sklearn.cluster import KMeans, DBSCAN
    from sklearn.metrics import silhouette_score
    from sklearn.feature_extraction.text import TfidfVectorizer
    HAS_SKLEARN = True
except ImportError:
    HAS_SKLEARN = False

from config import (
    OPENAI_API_KEY, OPENAI_ENABLED, LOGGING_CONFIG,
    INTELLIGENCE_MODE, LOCAL_LLM_ENDPOINT, LOCAL_LLM_MODEL,
    STATISTICAL_CONFIG, CLOUD_MODELS
)
from db_utils import execute_query, get_db_connection

# Try to import OpenAI for cluster naming
try:
    from openai import OpenAI
    HAS_OPENAI = True
except ImportError:
    HAS_OPENAI = False

# Setup logging
# The root logger is configured from the project's LOGGING_CONFIG: the 'level'
# entry is resolved as an attribute of the logging module, and the 'format'
# entry is passed through as the record format string.
logging.basicConfig(
    level=getattr(logging, LOGGING_CONFIG['level']),
    format=LOGGING_CONFIG['format']
)
# Module-level logger used by every function and method in this file.
logger = logging.getLogger(__name__)


class SemanticClusterer:
    """
    Automatically cluster documents based on embedding or TF-IDF similarity.

    Supports two modes:
    - Embedding-based: Uses OpenAI/local embeddings for semantic clustering
    - TF-IDF-based: Uses term frequency vectors for offline clustering

    Uses K-means or DBSCAN clustering to discover natural groupings in the
    library. Can optionally generate descriptive names using an LLM.
    """

    def __init__(self, min_cluster_size: int = 5, mode: str = None):
        """
        Store clustering settings and, where possible, create an LLM client for naming.

        Args:
            min_cluster_size: Smallest number of documents a cluster may contain.
            mode: Intelligence mode override ('cloud', 'local', 'statistical').
                A falsy value selects INTELLIGENCE_MODE from the project config.

        Raises:
            ImportError: If scikit-learn could not be imported.
        """
        if not HAS_SKLEARN:
            raise ImportError("scikit-learn is required. Run: pip install scikit-learn")

        self.min_cluster_size = min_cluster_size
        self.mode = mode if mode else INTELLIGENCE_MODE
        self.client = None
        self.local_client = None
        self.tfidf = None

        # Decide up front which naming client (if any) should be attempted.
        cloud_requested = self.mode == 'cloud' and OPENAI_ENABLED and HAS_OPENAI
        local_requested = self.mode == 'local' and HAS_OPENAI

        if cloud_requested:
            try:
                self.client = OpenAI(api_key=OPENAI_API_KEY)
                logger.info("SemanticClusterer initialized in CLOUD mode (embedding-based)")
            except Exception as e:
                # A client that cannot be built downgrades the whole instance
                # to statistical mode.
                logger.warning(f"Failed to initialize OpenAI client: {e}")
                self.mode = 'statistical'
        elif local_requested:
            try:
                # The local endpoint is reached through the OpenAI client class
                # with a placeholder API key.
                self.local_client = OpenAI(
                    base_url=LOCAL_LLM_ENDPOINT,
                    api_key="not-needed"
                )
                logger.info("SemanticClusterer initialized in LOCAL mode")
            except Exception as e:
                logger.warning(f"Failed to initialize local LLM: {e}")
                self.mode = 'statistical'

        # Reached both for an explicit statistical mode and after a fallback above.
        if self.mode == 'statistical':
            logger.info("SemanticClusterer initialized in STATISTICAL mode (TF-IDF based)")
            self._init_tfidf()

    def _init_tfidf(self):
        """
        Create the TF-IDF vectorizer used for statistical clustering.

        ``max_features`` and ``ngram_range`` come from STATISTICAL_CONFIG
        (keys 'tfidf_max_features' and 'tfidf_ngram_range'), defaulting to
        5000 and (1, 2). The vectorizer is stored on ``self.tfidf``.
        """
        vectorizer_options = {
            'max_features': STATISTICAL_CONFIG.get('tfidf_max_features', 5000),
            'ngram_range': STATISTICAL_CONFIG.get('tfidf_ngram_range', (1, 2)),
            'stop_words': 'english',
            'min_df': 2,     # skip terms that occur in fewer than 2 documents
            'max_df': 0.95,  # skip terms that occur in more than 95% of documents
        }
        self.tfidf = TfidfVectorizer(**vectorizer_options)
        logger.debug("TF-IDF vectorizer initialized for clustering")

    def load_document_embeddings(self) -> Tuple[List[str], np.ndarray]:
        """
        Load one embedding per document: the element-wise mean of its chunk embeddings.

        The primary query averages inside the database. Each chunk embedding is
        unnested into (value, position) rows, the values are averaged per
        document and position, and the averages are reassembled in position
        order. Aggregating the unnested values directly (without the inner
        AVG) would concatenate all chunks' values instead of averaging them.

        If the primary query returns no rows, a simpler query fetches the raw
        chunk embeddings as text (without joining ``documents``) and the mean
        is computed with NumPy.

        Returns:
            Tuple of (document_ids, embedding_matrix) with one row per document.

        Raises:
            ValueError: If neither query returns any embeddings.
        """
        logger.info("Loading document embeddings...")

        # Two-level aggregation: AVG per (document, dimension), then rebuild the vector.
        results = execute_query("""
            SELECT
                per_dim.document_id,
                array_agg(per_dim.avg_val ORDER BY per_dim.idx) AS doc_embedding
            FROM (
                SELECT
                    c.document_id,
                    e.idx,
                    AVG(e.val) AS avg_val
                FROM chunks c
                JOIN documents d ON c.document_id = d.document_id
                CROSS JOIN LATERAL unnest(c.embedding::real[]) WITH ORDINALITY AS e(val, idx)
                WHERE c.embedding IS NOT NULL
                GROUP BY c.document_id, e.idx
            ) per_dim
            GROUP BY per_dim.document_id
            ORDER BY per_dim.document_id
        """, fetch='all')

        if results:
            doc_ids = [row['document_id'] for row in results]
            embeddings = np.array([row['doc_embedding'] for row in results], dtype=float)
        else:
            # Fallback: simpler query, averaging done client-side.
            chunk_rows = execute_query("""
                SELECT
                    document_id,
                    embedding::text as embedding_str
                FROM chunks
                WHERE embedding IS NOT NULL
            """, fetch='all')

            if not chunk_rows:
                raise ValueError("No documents with embeddings found")

            # Group the parsed chunk vectors by document, keeping first-seen order.
            chunk_vectors: Dict[Any, List[List[float]]] = {}
            for row in chunk_rows:
                # The text form looks like "[v1,v2,...]"; strip brackets and split.
                values = [float(x) for x in row['embedding_str'].strip('[]').split(',')]
                chunk_vectors.setdefault(row['document_id'], []).append(values)

            doc_ids = list(chunk_vectors)
            embeddings = np.array(
                [np.mean(vectors, axis=0) for vectors in chunk_vectors.values()]
            )

        logger.info(f"Loaded embeddings for {len(doc_ids)} documents")
        return doc_ids, embeddings

    def load_document_tfidf(self) -> Tuple[List[str], np.ndarray, List[str]]:
        """
        Build a dense TF-IDF matrix from the concatenated chunk text of each document.

        Chunk contents are joined per document (ordered by chunk number) in SQL,
        then the vectorizer on ``self.tfidf`` is fitted on those texts. The
        vectorizer is created on demand if it does not exist yet.

        Returns:
            Tuple of (document_ids, tfidf_matrix, feature_names), where
            feature_names is whatever ``get_feature_names_out()`` returns.

        Raises:
            ValueError: If the query returns no rows.
        """
        logger.info("Loading documents for TF-IDF clustering...")

        # One row per document, with all chunk text joined in chunk order.
        rows = execute_query("""
            SELECT
                c.document_id,
                d.title,
                STRING_AGG(c.content, ' ' ORDER BY c.chunk_number) as full_text
            FROM chunks c
            JOIN documents d ON c.document_id = d.document_id
            GROUP BY c.document_id, d.title
            ORDER BY c.document_id
        """, fetch='all')

        if not rows:
            raise ValueError("No documents with chunks found")

        doc_ids = []
        texts = []
        for row in rows:
            doc_ids.append(row['document_id'])
            texts.append(row['full_text'])

        # The vectorizer only exists already when the instance started in statistical mode.
        if self.tfidf is None:
            self._init_tfidf()

        tfidf_matrix = self.tfidf.fit_transform(texts)
        feature_names = self.tfidf.get_feature_names_out()

        n_docs, n_features = tfidf_matrix.shape[0], tfidf_matrix.shape[1]
        logger.info(f"Created TF-IDF matrix: {n_docs} docs x {n_features} features")
        return doc_ids, tfidf_matrix.toarray(), feature_names

    def load_document_vectors(self, use_tfidf: bool = None) -> Tuple[List[str], np.ndarray]:
        """
        Return document ids and their vectors from the TF-IDF or embedding loader.

        Args:
            use_tfidf: True for TF-IDF vectors, False for embeddings. When None,
                TF-IDF is chosen exactly when the instance mode is 'statistical'.

        Returns:
            Tuple of (document_ids, vector_matrix)
        """
        tfidf_wanted = (self.mode == 'statistical') if use_tfidf is None else use_tfidf

        if not tfidf_wanted:
            return self.load_document_embeddings()

        # The TF-IDF loader also returns feature names, which are not needed here.
        doc_ids, matrix, _feature_names = self.load_document_tfidf()
        return doc_ids, matrix

    def find_optimal_clusters(
        self,
        embeddings: np.ndarray,
        min_k: int = 3,
        max_k: int = 30
    ) -> Tuple[int, float]:
        """
        Pick the K-means cluster count with the best cosine silhouette score.

        The upper bound is clamped so that the average cluster could still hold
        ``min_cluster_size`` documents and so that k never exceeds
        ``n_samples - 1``, the largest label count silhouette scoring accepts.

        Args:
            embeddings: Document vector matrix, one row per document
            min_k: Minimum number of clusters to try
            max_k: Maximum number of clusters to try

        Returns:
            Tuple of (optimal_k, best_silhouette_score). When there are too few
            documents to test any k, returns (min_k capped at the number of
            samples, 0.0). When every candidate k fails, the score is -1.
        """
        n_samples = len(embeddings)
        # Guard against a zero/negative minimum size, which would divide by zero.
        min_size = max(1, self.min_cluster_size)
        max_k = min(max_k, n_samples // min_size, n_samples - 1)

        if max_k < min_k:
            logger.warning(f"Not enough documents for clustering (need at least {min_k * self.min_cluster_size})")
            # Never suggest more clusters than there are samples; K-means would reject it.
            return max(1, min(min_k, n_samples)), 0.0

        logger.info(f"Finding optimal cluster count (testing {min_k}-{max_k})...")

        best_k = min_k
        best_score = -1.0

        for k in range(min_k, max_k + 1):
            try:
                kmeans = KMeans(n_clusters=k, random_state=42, n_init=10, max_iter=300)
                labels = kmeans.fit_predict(embeddings)

                # Silhouette score measures how well separated the clusters are.
                score = silhouette_score(embeddings, labels, metric='cosine')
            except Exception as e:
                # Best-effort search: a failing k is skipped, not fatal.
                logger.warning(f"Error testing k={k}: {e}")
                continue

            logger.debug(f"k={k}: silhouette={score:.4f}")
            if score > best_score:
                best_score = score
                best_k = k

        logger.info(f"Optimal clusters: {best_k} (silhouette score: {best_score:.4f})")
        return best_k, best_score

    def cluster_documents(
        self,
        n_clusters: Optional[int] = None,
        method: str = 'kmeans',
        clear_existing: bool = True,
        use_tfidf: bool = None
    ) -> Dict[int, List[str]]:
        """
        Cluster all documents and save results to database.

        Existing cluster data is cleared only after the clustering computation
        has succeeded, so a failure while loading vectors or fitting the model
        leaves previously stored clusters intact.

        Args:
            n_clusters: Number of clusters for K-means (auto-detected if None);
                not used by DBSCAN
            method: 'kmeans' or 'dbscan'
            clear_existing: Whether to clear existing cluster assignments
            use_tfidf: Force TF-IDF mode. If None, auto-detect based on mode.

        Returns:
            Dict mapping cluster_id to list of document_ids. Clusters smaller
            than ``min_cluster_size`` and clusters that fail to save are omitted.

        Raises:
            ValueError: If there are fewer documents than ``min_cluster_size``
                or ``method`` is not recognised.
        """
        # Auto-select TF-IDF if in statistical mode
        if use_tfidf is None:
            use_tfidf = self.mode == 'statistical'

        vector_type = "TF-IDF" if use_tfidf else "embedding"
        logger.info(f"Loading document vectors ({vector_type} mode)...")

        doc_ids, embeddings = self.load_document_vectors(use_tfidf=use_tfidf)

        if len(embeddings) < self.min_cluster_size:
            raise ValueError(f"Need at least {self.min_cluster_size} documents with embeddings")

        # Reject bad input before any stored data is touched.
        if method not in ('kmeans', 'dbscan'):
            raise ValueError(f"Unknown clustering method: {method}")

        # Determine number of clusters
        if n_clusters is None and method == 'kmeans':
            n_clusters, _ = self.find_optimal_clusters(embeddings)

        logger.info(f"Clustering {len(embeddings)} documents using {method}...")

        if method == 'kmeans':
            model = KMeans(n_clusters=n_clusters, random_state=42, n_init=10, max_iter=300)
            labels = model.fit_predict(embeddings)
            centroids = model.cluster_centers_
        else:
            # DBSCAN: density-based, labels noise points as -1 and has no centroids.
            model = DBSCAN(eps=0.3, min_samples=self.min_cluster_size, metric='cosine')
            raw_labels = model.fit_predict(embeddings)

            unique_labels = set(raw_labels)
            unique_labels.discard(-1)  # noise is not a cluster
            ordered_labels = sorted(unique_labels)

            # Remap labels to consecutive integers and compute mean vectors as centroids.
            label_mapping = {label: i for i, label in enumerate(ordered_labels)}
            centroid_rows = [embeddings[raw_labels == label].mean(axis=0) for label in ordered_labels]
            labels = np.array([label_mapping.get(label, -1) for label in raw_labels])
            centroids = np.array(centroid_rows) if centroid_rows else np.array([])

        # Clustering succeeded; it is now safe to replace stored results.
        if clear_existing:
            self._clear_existing_clusters()

        clusters = {}
        for cluster_idx, centroid in enumerate(centroids):
            member_idx = np.flatnonzero(labels == cluster_idx)

            if len(member_idx) < self.min_cluster_size:
                logger.debug(f"Skipping cluster {cluster_idx} (only {len(member_idx)} docs)")
                continue

            cluster_docs = [doc_ids[i] for i in member_idx]

            # Coherence is the average cosine similarity of members to the centroid.
            similarities = self._cosine_similarity(embeddings[member_idx], centroid)
            coherence = float(np.mean(similarities))

            # Name the cluster from (at most) its first ten documents.
            cluster_name = self._generate_cluster_name(cluster_docs[:10])

            cluster_id = self._save_cluster(
                name=cluster_name,
                centroid=centroid.tolist(),
                document_count=len(cluster_docs),
                coherence_score=coherence
            )
            if cluster_id is None:
                # _save_cluster already logged the failure.
                continue

            clusters[cluster_id] = cluster_docs

            # cluster_docs and similarities share the same order (both follow
            # member_idx), so they can be paired directly.
            for doc_id, similarity in zip(cluster_docs, similarities):
                self._link_document_to_cluster(doc_id, cluster_id, float(similarity))

            logger.info(f"Created cluster '{cluster_name}' with {len(cluster_docs)} documents")

        # Handle noise points (DBSCAN)
        if method == 'dbscan':
            noise_count = int(np.sum(labels == -1))
            if noise_count:
                logger.info(f"{noise_count} documents not assigned to any cluster")

        logger.info(f"Clustering complete: {len(clusters)} clusters created")
        return clusters

    def _cosine_similarity(self, embeddings: np.ndarray, centroid: np.ndarray) -> np.ndarray:
        """Calculate cosine similarity between embeddings and a centroid."""
        # Normalize
        embeddings_norm = embeddings / np.linalg.norm(embeddings, axis=1, keepdims=True)
        centroid_norm = centroid / np.linalg.norm(centroid)

        # Dot product gives cosine similarity for normalized vectors
        return np.dot(embeddings_norm, centroid_norm)

    def _generate_cluster_name(self, sample_doc_ids: List[str]) -> str:
        """
        Generate a descriptive name for a cluster from sample document titles.

        An LLM (cloud client first, else local client) is asked for a short
        name. If no client is configured, the call fails, or the reply is
        empty, the name is built from the most common title words instead.

        Args:
            sample_doc_ids: Ids of documents whose titles represent the cluster

        Returns:
            The cluster name. A timestamped ``Cluster_...`` placeholder is
            returned when no ids are given or no matching rows are found.
        """
        # An empty id list would produce invalid "IN ()" SQL, so skip the query.
        if not sample_doc_ids:
            return f"Cluster_{datetime.now().strftime('%Y%m%d_%H%M%S')}"

        # One bound parameter per id keeps the query parameterized.
        placeholders = ','.join(['%s'] * len(sample_doc_ids))
        titles = execute_query(
            f"SELECT title FROM documents WHERE document_id IN ({placeholders})",
            tuple(sample_doc_ids),
            fetch='all'
        )

        if not titles:
            return f"Cluster_{datetime.now().strftime('%Y%m%d_%H%M%S')}"

        title_list = [t['title'] for t in titles if t['title']]

        # Prefer the cloud client; fall back to the local one.
        client = self.client or self.local_client

        if client and title_list:
            model = CLOUD_MODELS['classification'] if self.client else LOCAL_LLM_MODEL
            bullet_list = '\n'.join('- ' + t for t in title_list[:10])
            prompt = (
                "These documents are semantically clustered together:\n"
                f"{bullet_list}\n"
                "\n"
                "Suggest a short (2-5 word) descriptive name for this cluster that captures the common theme.\n"
                "Respond with ONLY the name, nothing else."
            )
            try:
                response = client.chat.completions.create(
                    model=model,
                    messages=[{"role": "user", "content": prompt}],
                    max_tokens=30,
                    temperature=0.3
                )
                raw_name = response.choices[0].message.content or ''
                # Drop surrounding whitespace and quotes the model may add.
                name = raw_name.strip().strip('"\'').strip()
                if name:
                    return name[:100]
                logger.warning("LLM naming failed: empty response")
            except Exception as e:
                logger.warning(f"LLM naming failed: {e}")

        # Fallback: use most common words from titles
        return self._generate_name_from_titles(title_list)

    def _generate_name_from_titles(self, titles: List[str]) -> str:
        """
        Build a cluster name from the three most frequent words in the titles.

        Only alphabetic words of three or more letters that are not stop words
        are counted. When no such word exists, the name is
        ``Cluster_<number of titles>_docs``.
        """
        import re
        from collections import Counter

        stop_words = {'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for', 'of', 'with', 'by', 'from', 'as', 'is', 'was', 'are', 'were', 'been', 'be', 'have', 'has', 'had', 'do', 'does', 'did', 'will', 'would', 'could', 'should', 'may', 'might', 'must', 'shall', 'can', 'need', 'dare', 'ought', 'used', 'that', 'this', 'these', 'those', 'i', 'you', 'he', 'she', 'it', 'we', 'they'}
        word_pattern = re.compile(r'\b[A-Za-z]{3,}\b')

        # Count words in encounter order so ties resolve to the earliest word.
        word_counts = Counter(
            word
            for title in titles
            for word in word_pattern.findall(title.lower())
            if word not in stop_words
        )

        if not word_counts:
            return f"Cluster_{len(titles)}_docs"

        return ' '.join(word.title() for word, _count in word_counts.most_common(3))

    def _clear_existing_clusters(self) -> None:
        """
        Delete every row from the cluster membership and cluster tables.

        Whether the deletes are committed depends on what the
        ``get_db_connection()`` context manager does on exit.
        """
        logger.info("Clearing existing cluster data...")
        # Membership rows go first; presumably they reference document_clusters
        # (verify against the schema).
        delete_statements = (
            "DELETE FROM document_cluster_membership",
            "DELETE FROM document_clusters",
        )
        with get_db_connection() as conn, conn.cursor() as cur:
            for statement in delete_statements:
                cur.execute(statement)

    def _save_cluster(
        self,
        name: str,
        centroid: List[float],
        document_count: int,
        coherence_score: float
    ) -> Optional[int]:
        """
        Insert a cluster row marked as auto-generated and return its id.

        Returns:
            The new ``cluster_id``, or None when the insert returns no row or
            raises (the error is logged, not propagated).
        """
        insert_sql = (
            """INSERT INTO document_clusters
                   (name, centroid, document_count, coherence_score, auto_generated)
                   VALUES (%s, %s::vector, %s, %s, TRUE)
                   RETURNING cluster_id"""
        )
        params = (name, centroid, document_count, coherence_score)

        try:
            row = execute_query(insert_sql, params, fetch='one')
            if not row:
                return None
            return row['cluster_id']
        except Exception as e:
            logger.error(f"Failed to save cluster '{name}': {e}")
            return None

    def _link_document_to_cluster(
        self,
        document_id: str,
        cluster_id: int,
        similarity: float,
        is_primary: bool = True
    ) -> None:
        """
        Record that a document belongs to a cluster.

        If the (document, cluster) pair already exists, only its similarity is
        updated. Failures are logged as warnings and not raised.
        """
        upsert_sql = (
            """INSERT INTO document_cluster_membership
                   (document_id, cluster_id, similarity_to_centroid, is_primary_cluster)
                   VALUES (%s, %s, %s, %s)
                   ON CONFLICT (document_id, cluster_id) DO UPDATE
                   SET similarity_to_centroid = EXCLUDED.similarity_to_centroid"""
        )
        params = (document_id, cluster_id, similarity, is_primary)

        try:
            execute_query(upsert_sql, params, fetch='none')
        except Exception as e:
            logger.warning(f"Failed to link document {document_id} to cluster {cluster_id}: {e}")

    def find_cross_document_connections(
        self,
        similarity_threshold: float = 0.85,
        limit: int = 1000
    ) -> List[Dict[str, Any]]:
        """
        Find highly similar chunks across different documents.

        This enables "see also" and related passage discovery. The query pairs
        every embedded chunk with every embedded chunk of a different document
        (a self-join on ``chunks``). For each chunk it keeps the three closest
        partners with a higher chunk id, then filters by the threshold.

        Args:
            similarity_threshold: Minimum similarity score (1 - cosine distance)
            limit: Maximum connections to return

        Returns:
            List of connection dicts with keys chunk_a, doc_a, title_a,
            chunk_b, doc_b, title_b and similarity, most similar first.
            An empty list is returned when the query yields nothing.
        """
        logger.info(f"Finding cross-document connections (threshold: {similarity_threshold})...")

        # This query finds chunk pairs with high similarity across different documents
        connections = execute_query("""
            WITH ranked_pairs AS (
                SELECT
                    c1.chunk_id as chunk_a,
                    c1.document_id as doc_a,
                    d1.title as title_a,
                    c2.chunk_id as chunk_b,
                    c2.document_id as doc_b,
                    d2.title as title_b,
                    1 - (c1.embedding <=> c2.embedding) as similarity,
                    ROW_NUMBER() OVER (PARTITION BY c1.chunk_id ORDER BY c1.embedding <=> c2.embedding) as rn
                FROM chunks c1
                JOIN documents d1 ON c1.document_id = d1.document_id
                JOIN chunks c2 ON c1.document_id != c2.document_id
                JOIN documents d2 ON c2.document_id = d2.document_id
                WHERE c1.embedding IS NOT NULL
                  AND c2.embedding IS NOT NULL
                  AND c1.chunk_id < c2.chunk_id  -- Avoid duplicates
            )
            SELECT chunk_a, doc_a, title_a, chunk_b, doc_b, title_b, similarity
            FROM ranked_pairs
            WHERE rn <= 3 AND similarity >= %s
            ORDER BY similarity DESC
            LIMIT %s
        """, (similarity_threshold, limit), fetch='all') or []
        # "or []" normalises an empty/None result so len() below and callers stay safe.

        logger.info(f"Found {len(connections)} cross-document connections")
        return connections

    def save_connections_to_database(self, connections: List[Dict[str, Any]]) -> int:
        """
        Upsert discovered chunk pairs into the chunk_connections table.

        Each pair is written on its own; a failing pair is logged and skipped
        so the rest are still attempted.

        Args:
            connections: List of connection dicts from find_cross_document_connections

        Returns:
            Number of connections saved
        """
        upsert_sql = (
            """INSERT INTO chunk_connections
                       (chunk_a_id, chunk_b_id, similarity_score, connection_type)
                       VALUES (%s, %s, %s, 'semantic')
                       ON CONFLICT (chunk_a_id, chunk_b_id) DO UPDATE
                       SET similarity_score = EXCLUDED.similarity_score"""
        )

        saved = 0
        for link in connections:
            try:
                execute_query(
                    upsert_sql,
                    (link['chunk_a'], link['chunk_b'], link['similarity']),
                    fetch='none'
                )
            except Exception as e:
                logger.warning(f"Failed to save connection: {e}")
            else:
                saved += 1

        logger.info(f"Saved {saved} connections to database")
        return saved

    def get_cluster_summary(self) -> List[Dict[str, Any]]:
        """
        Return one row per stored cluster, largest document_count first.

        Each row carries the cluster's columns plus ``sample_titles``, the
        distinct non-null titles of its member documents in sorted order.
        An empty list is returned when the query yields nothing.
        """
        summary_rows = execute_query("""
            SELECT
                c.cluster_id,
                c.name,
                c.document_count,
                c.coherence_score,
                c.auto_generated,
                c.reviewed,
                c.created_at,
                array_agg(DISTINCT d.title ORDER BY d.title) FILTER (WHERE d.title IS NOT NULL) as sample_titles
            FROM document_clusters c
            LEFT JOIN document_cluster_membership dcm ON c.cluster_id = dcm.cluster_id
            LEFT JOIN documents d ON dcm.document_id = d.document_id
            GROUP BY c.cluster_id, c.name, c.document_count, c.coherence_score,
                     c.auto_generated, c.reviewed, c.created_at
            ORDER BY c.document_count DESC
        """, fetch='all')

        if summary_rows:
            return summary_rows
        return []


def main():
    """
    Command-line interface for document clustering.

    Parses the arguments, clusters the documents, prints a summary of the
    stored clusters and, with ``--find-connections``, also discovers and saves
    cross-document chunk connections. Any exception is logged and re-raised.
    """
    parser = argparse.ArgumentParser(
        description='Cluster documents based on semantic or TF-IDF similarity'
    )
    parser.add_argument(
        '--n-clusters',
        type=int,
        help='Number of clusters (auto-detected if not specified)'
    )
    parser.add_argument(
        '--method',
        choices=['kmeans', 'dbscan'],
        default='kmeans',
        help='Clustering method (default: kmeans)'
    )
    parser.add_argument(
        '--min-size',
        type=int,
        default=5,
        help='Minimum documents per cluster (default: 5)'
    )
    parser.add_argument(
        '--keep-existing',
        action='store_true',
        help='Keep existing clusters (default: replace all)'
    )
    parser.add_argument(
        '--find-connections',
        action='store_true',
        help='Also find cross-document connections'
    )
    parser.add_argument(
        '--connection-threshold',
        type=float,
        default=0.85,
        help='Similarity threshold for connections (default: 0.85)'
    )
    parser.add_argument(
        '--tfidf',
        action='store_true',
        help='Use TF-IDF vectors instead of embeddings (offline mode)'
    )
    parser.add_argument(
        '--mode',
        choices=['cloud', 'local', 'statistical'],
        help='Intelligence mode override (default: auto-detect)'
    )

    args = parser.parse_args()

    # --tfidf takes precedence over --mode.
    mode = 'statistical' if args.tfidf else args.mode

    banner = "=" * 60

    try:
        clusterer = SemanticClusterer(min_cluster_size=args.min_size, mode=mode)

        # Perform clustering
        print("\n" + banner)
        print("DOCUMENT CLUSTERING")
        print(banner)
        print(f"Mode: {clusterer.mode.upper()} ({'TF-IDF vectors' if clusterer.mode == 'statistical' else 'Embeddings'})")
        print("-" * 60)

        clusters = clusterer.cluster_documents(
            n_clusters=args.n_clusters,
            method=args.method,
            clear_existing=not args.keep_existing
        )

        # Print summary
        print("\n" + banner)
        print("CLUSTER SUMMARY")
        print(banner)

        for cluster in clusterer.get_cluster_summary():
            print(f"\n[{cluster['cluster_id']}] {cluster['name']}")
            print(f"    Documents: {cluster['document_count']}")
            # A stored score may be NULL; formatting None with :.3f would raise.
            coherence = cluster['coherence_score']
            coherence_text = f"{coherence:.3f}" if coherence is not None else "n/a"
            print(f"    Coherence: {coherence_text}")
            if cluster.get('sample_titles'):
                print(f"    Sample: {cluster['sample_titles'][:2]}")

        print("\n" + banner)
        print(f"Total clusters: {len(clusters)}")
        print(f"Total documents clustered: {sum(len(docs) for docs in clusters.values())}")
        print(banner)

        # Find connections if requested
        if args.find_connections:
            print("\n" + banner)
            print("FINDING CROSS-DOCUMENT CONNECTIONS")
            print(banner)

            connections = clusterer.find_cross_document_connections(
                similarity_threshold=args.connection_threshold
            )

            if connections:
                saved = clusterer.save_connections_to_database(connections)
                print(f"Found and saved {saved} semantic connections")

                # Show top connections
                print("\nTop 10 connections:")
                for conn in connections[:10]:
                    # Titles may be NULL in the database; slicing None would raise.
                    title_a = (conn['title_a'] or '(untitled)')[:30]
                    title_b = (conn['title_b'] or '(untitled)')[:30]
                    print(f"  [{conn['similarity']:.3f}] {title_a}... <-> {title_b}...")
            else:
                print("No connections found above the similarity threshold")

    except Exception as e:
        logger.error(f"Clustering failed: {e}")
        raise


# Run the command-line interface only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    main()
