131 lines
3.6 KiB
Python
131 lines
3.6 KiB
Python
"""
|
|
Searcher module for Obsidian RAG Backend.
|
|
|
|
This module handles semantic search operations on the indexed ChromaDB collection.
|
|
"""
|
|
from dataclasses import dataclass
|
|
from typing import List
|
|
import chromadb
|
|
from chromadb.config import Settings
|
|
from sentence_transformers import SentenceTransformer
|
|
|
|
from indexer import EMBEDDING_MODEL
|
|
|
|
|
|
@dataclass
class SearchResult:
    """
    Represents a single search result with metadata and relevance score.
    """

    # Path of the vault file the matched chunk came from.
    file_path: str
    # Title of the section (heading) containing the matched chunk.
    section_title: str
    # Line range of the chunk within the source file — assumed inclusive;
    # TODO(review): confirm 0- vs 1-based convention against the indexer.
    line_start: int
    line_end: int
    # Similarity score, computed as 1 - cosine distance (higher = more similar).
    score: float
    # Raw text content of the matched chunk.
    text: str
|
|
|
|
|
|
def search_vault(
    query: str,
    chroma_db_path: str,
    collection_name: str = "obsidian_vault",
    embedding_model: str = EMBEDDING_MODEL,
    limit: int = 5,
    min_score: float = 0.0,
) -> List[SearchResult]:
    """
    Run a semantic search against the indexed vault.

    Args:
        query: Search query string (must be non-empty).
        chroma_db_path: Path to the ChromaDB data directory.
        collection_name: ChromaDB collection to query.
        embedding_model: Embedding model name; must match the one used
            at indexing time so query/document vectors are comparable.
        limit: Maximum number of results to return.
        min_score: Minimum similarity score (0.0 to 1.0) a hit must reach.

    Returns:
        SearchResult objects ordered by relevance, best match first.

    Raises:
        ValueError: On an empty query, or when the collection is missing.
    """
    # Reject empty / whitespace-only queries up front.
    if not query or not query.strip():
        raise ValueError("Query cannot be empty")

    client = chromadb.PersistentClient(
        path=chroma_db_path,
        settings=Settings(anonymized_telemetry=False),
    )

    # Translate any lookup failure into an actionable ValueError for callers.
    try:
        target = client.get_collection(name=collection_name)
    except Exception as exc:
        message = (
            f"Collection '{collection_name}' not found. "
            f"Please index your vault first using the index command."
        )
        raise ValueError(message) from exc

    # Embed the query with the same model used during indexing.
    encoder = SentenceTransformer(embedding_model)
    query_vector = encoder.encode(query, show_progress_bar=False)

    raw = target.query(
        query_embeddings=[query_vector.tolist()],
        n_results=limit,
    )

    # Convert ChromaDB's raw payload into typed, score-filtered results.
    return _parse_search_results(raw, min_score)
|
|
|
|
|
|
def _parse_search_results(
    raw_results: dict,
    min_score: float,
) -> List[SearchResult]:
    """
    Turn a raw ChromaDB query payload into SearchResult objects.

    ChromaDB reports distances (lower = more similar); these are mapped to
    similarity scores (higher = more similar) via ``score = 1 - distance``.

    Args:
        raw_results: Raw results dictionary from a ChromaDB ``query`` call.
        min_score: Minimum similarity score a hit must reach to be kept.

    Returns:
        SearchResult objects whose score is at least ``min_score``.
    """
    # ChromaDB nests every field one level deep — one inner list per query.
    # This module issues exactly one query, so index 0 selects its results.
    docs = raw_results.get("documents", [[]])[0]
    metas = raw_results.get("metadatas", [[]])[0]
    dists = raw_results.get("distances", [[]])[0]

    hits: List[SearchResult] = []
    for text, meta, dist in zip(docs, metas, dists):
        # Cosine distance -> cosine similarity.
        similarity = 1.0 - dist
        if similarity >= min_score:
            hits.append(
                SearchResult(
                    file_path=meta["file_path"],
                    section_title=meta["section_title"],
                    line_start=meta["line_start"],
                    line_end=meta["line_end"],
                    score=similarity,
                    text=text,
                )
            )
    return hits