"""
Searcher module for Obsidian RAG Backend.

This module handles semantic search operations on the indexed ChromaDB collection.
"""

from dataclasses import dataclass
from functools import lru_cache
from typing import List

import chromadb
from chromadb.config import Settings
from sentence_transformers import SentenceTransformer

from indexer import EMBEDDING_MODEL


@dataclass
class SearchResult:
    """
    Represents a single search result with metadata and relevance score.
    """

    # Copied from the hit's "file_path" metadata entry.
    file_path: str
    # Copied from the hit's "section_title" metadata entry.
    section_title: str
    # Copied from the "line_start" / "line_end" metadata entries
    # (presumably the chunk's line range in the note; verify against the indexer).
    line_start: int
    line_end: int
    # Similarity score computed as 1 - distance (higher = more similar).
    score: float
    # The stored document text of the matching chunk.
    text: str


@lru_cache(maxsize=4)
def _load_embedding_model(model_name: str) -> SentenceTransformer:
    """Load a SentenceTransformer once per model name and reuse it across searches."""
    return SentenceTransformer(model_name)


def search_vault(
    query: str,
    chroma_db_path: str,
    collection_name: str = "obsidian_vault",
    embedding_model: str = EMBEDDING_MODEL,
    limit: int = 5,
    min_score: float = 0.0,
) -> List[SearchResult]:
    """
    Search the indexed vault for semantically similar content.

    Args:
        query: Search query string
        chroma_db_path: Path to ChromaDB data directory
        collection_name: Name of the ChromaDB collection to search
        embedding_model: Model used for embeddings (must match indexing model)
        limit: Maximum number of results requested from ChromaDB; fewer may be
            returned once min_score filtering is applied
        min_score: Minimum similarity score threshold (0.0 to 1.0)

    Returns:
        List of SearchResult objects, in the order ChromaDB returned them
        (nearest first, i.e. highest score first)

    Raises:
        ValueError: If the collection cannot be opened or query is empty
    """
    if not query or not query.strip():
        raise ValueError("Query cannot be empty")

    # Initialize ChromaDB client
    chroma_client = chromadb.PersistentClient(
        path=chroma_db_path,
        settings=Settings(anonymized_telemetry=False),
    )

    # Get collection (will raise if it doesn't exist).
    # NOTE(review): the broad except also relabels unrelated failures as
    # "not found"; narrow it once the exception type raised by the installed
    # chromadb version is confirmed. The original cause is kept via `from e`.
    try:
        collection = chroma_client.get_collection(name=collection_name)
    except Exception as e:
        raise ValueError(
            f"Collection '{collection_name}' not found. "
            "Please index your vault first using the index command."
        ) from e

    # Reuse the already-loaded model (same as used during indexing) instead of
    # re-instantiating it on every search call.
    model = _load_embedding_model(embedding_model)

    # Generate query embedding
    query_embedding = model.encode(query, show_progress_bar=False)

    # Perform search
    results = collection.query(
        query_embeddings=[query_embedding.tolist()],
        n_results=limit,
    )

    # Parse and format results
    return _parse_search_results(results, min_score)


def _first_query_batch(raw_results: dict, key: str) -> list:
    """Return the first query's entries for *key*, or [] if absent, None or empty."""
    batches = raw_results.get(key)
    if not batches:
        return []
    return batches[0] or []


def _parse_search_results(
    raw_results: dict,
    min_score: float,
) -> List[SearchResult]:
    """
    Parse ChromaDB query results into SearchResult objects.

    ChromaDB returns distances (lower = more similar). We convert to
    similarity scores (higher = more similar) using: score = 1 - distance

    NOTE(review): this is a cosine similarity only if the collection was
    created with cosine distance; confirm the distance space used by the
    indexer, otherwise scores and min_score are not on a 0.0-1.0 scale.

    Args:
        raw_results: Raw results dictionary from ChromaDB query
        min_score: Minimum similarity score to include

    Returns:
        List of SearchResult objects filtered by min_score, in input order

    Raises:
        KeyError: If a hit's metadata lacks one of "file_path",
            "section_title", "line_start" or "line_end"
    """
    # ChromaDB returns results as lists of lists (one list per query).
    # We only have one query, so we take the first element; a missing, None
    # or empty field is treated as "no hits" instead of raising.
    documents = _first_query_batch(raw_results, "documents")
    metadatas = _first_query_batch(raw_results, "metadatas")
    distances = _first_query_batch(raw_results, "distances")

    search_results = []
    for doc, metadata, distance in zip(documents, metadatas, distances):
        # Convert distance to similarity score, normalised to a plain float.
        score = 1.0 - float(distance)

        # Filter by minimum score
        if score < min_score:
            continue

        search_results.append(
            SearchResult(
                file_path=metadata["file_path"],
                section_title=metadata["section_title"],
                line_start=metadata["line_start"],
                line_end=metadata["line_end"],
                score=score,
                text=doc,
            )
        )

    return search_results