Files
MyObsidianAI/obsidian_rag/searcher.py
Kodjo Sossouvi d4925f7969 Initial commit
2025-12-12 11:31:44 +01:00

131 lines
3.6 KiB
Python

"""
Searcher module for Obsidian RAG Backend.
This module handles semantic search operations on the indexed ChromaDB collection.
"""
from dataclasses import dataclass
from typing import List
import chromadb
from chromadb.config import Settings
from sentence_transformers import SentenceTransformer
from indexer import EMBEDDING_MODEL
@dataclass
class SearchResult:
    """
    Represents a single search result with metadata and relevance score.

    Attributes:
        file_path: "file_path" metadata stored with the matched chunk.
        section_title: "section_title" metadata stored with the matched chunk.
        line_start: "line_start" metadata stored with the matched chunk.
        line_end: "line_end" metadata stored with the matched chunk.
        score: Similarity score, computed as 1 - distance (higher = more similar).
        text: Document text of the matched chunk.
    """

    # NOTE(review): presumably file_path is relative to the vault root and the
    # line numbers delimit the chunk inside that file -- confirm against the
    # indexer, which is what writes this metadata.
    file_path: str
    section_title: str
    line_start: int
    line_end: int
    score: float
    text: str
def search_vault(
    query: str,
    chroma_db_path: str,
    collection_name: str = "obsidian_vault",
    embedding_model: str = EMBEDDING_MODEL,
    limit: int = 5,
    min_score: float = 0.0,
) -> List[SearchResult]:
    """
    Search the indexed vault for semantically similar content.

    Args:
        query: Search query string
        chroma_db_path: Path to ChromaDB data directory
        collection_name: Name of the ChromaDB collection to search
        embedding_model: Model used for embeddings (must match indexing model)
        limit: Maximum number of results to return (must be at least 1)
        min_score: Minimum similarity score threshold; results whose score
            (1 - distance) is below it are dropped

    Returns:
        List of SearchResult objects, in the order ChromaDB returned them
        (expected to be most similar first -- see the ChromaDB query docs)

    Raises:
        ValueError: If the query is empty, limit is less than 1, or the
            collection cannot be opened
    """
    # Validate cheap arguments first so bad input never opens the database
    # or loads the embedding model.
    if not query or not query.strip():
        raise ValueError("Query cannot be empty")
    if limit < 1:
        raise ValueError("Limit must be a positive integer")

    # Initialize ChromaDB client
    chroma_client = chromadb.PersistentClient(
        path=chroma_db_path,
        settings=Settings(anonymized_telemetry=False),
    )

    # Get collection (will raise if it doesn't exist).
    # NOTE(review): the exception type raised for a missing collection appears
    # to differ between ChromaDB releases, hence the broad catch; the original
    # error stays attached as __cause__.
    try:
        collection = chroma_client.get_collection(name=collection_name)
    except Exception as e:
        raise ValueError(
            f"Collection '{collection_name}' not found. "
            "Please index your vault first using the index command."
        ) from e

    # Initialize embedding model (same as used during indexing)
    model = SentenceTransformer(embedding_model)

    # Generate query embedding
    query_embedding = model.encode(query, show_progress_bar=False)

    # Perform search; ChromaDB expects one embedding per query, so wrap the
    # single vector in a list.
    results = collection.query(
        query_embeddings=[query_embedding.tolist()],
        n_results=limit,
    )

    # Parse and format results
    return _parse_search_results(results, min_score)
def _parse_search_results(
raw_results: dict,
min_score: float,
) -> List[SearchResult]:
"""
Parse ChromaDB query results into SearchResult objects.
ChromaDB returns distances (lower = more similar). We convert to
similarity scores (higher = more similar) using: score = 1 - distance
Args:
raw_results: Raw results dictionary from ChromaDB query
min_score: Minimum similarity score to include
Returns:
List of SearchResult objects filtered by min_score
"""
search_results = []
# ChromaDB returns results as lists of lists (one list per query)
# We only have one query, so we take the first element
documents = raw_results.get("documents", [[]])[0]
metadatas = raw_results.get("metadatas", [[]])[0]
distances = raw_results.get("distances", [[]])[0]
for doc, metadata, distance in zip(documents, metadatas, distances):
# Convert distance to similarity score (cosine distance -> cosine similarity)
score = 1.0 - distance
# Filter by minimum score
if score < min_score:
continue
search_results.append(SearchResult(
file_path=metadata["file_path"],
section_title=metadata["section_title"],
line_start=metadata["line_start"],
line_end=metadata["line_end"],
score=score,
text=doc,
))
return search_results