131 lines
3.6 KiB
Python
131 lines
3.6 KiB
Python
"""
|
|
Searcher module for Obsidian RAG Backend.
|
|
|
|
This module handles semantic search operations on the indexed ChromaDB collection.
|
|
"""
|
|
from dataclasses import dataclass
|
|
from typing import List
|
|
import chromadb
|
|
from chromadb.config import Settings
|
|
from sentence_transformers import SentenceTransformer
|
|
|
|
from indexer import EMBEDDING_MODEL
|
|
|
|
|
|
@dataclass
class SearchResult:
    """
    Represents a single search result with metadata and relevance score.
    """

    # Path of the vault file the matched chunk came from.
    file_path: str
    # Title of the section (heading) containing the matched chunk.
    section_title: str
    # Line range of the chunk within the source file — assumed inclusive;
    # TODO(review): confirm 0- vs 1-based convention against the indexer.
    line_start: int
    line_end: int
    # Similarity score, computed as 1 - cosine distance (higher = more similar).
    score: float
    # Raw text content of the matched chunk.
    text: str
|
|
|
|
|
|
def search_vault(
    query: str,
    chroma_db_path: str,
    collection_name: str = "obsidian_vault",
    embedding_model: str = EMBEDDING_MODEL,
    limit: int = 5,
    min_score: float = 0.0,
) -> List[SearchResult]:
    """
    Run a semantic search against the indexed vault.

    Args:
        query: Search query string (must be non-empty).
        chroma_db_path: Path to the ChromaDB data directory.
        collection_name: ChromaDB collection to query.
        embedding_model: Embedding model name; must match the one used
            at indexing time so query/document vectors are comparable.
        limit: Maximum number of results to return.
        min_score: Minimum similarity score (0.0 to 1.0) a hit must reach.

    Returns:
        SearchResult objects ordered by relevance, best match first.

    Raises:
        ValueError: On an empty query, or when the collection is missing.
    """
    # Reject empty / whitespace-only queries up front.
    if not query or not query.strip():
        raise ValueError("Query cannot be empty")

    client = chromadb.PersistentClient(
        path=chroma_db_path,
        settings=Settings(anonymized_telemetry=False),
    )

    # Translate any lookup failure into an actionable ValueError for callers.
    try:
        target = client.get_collection(name=collection_name)
    except Exception as exc:
        message = (
            f"Collection '{collection_name}' not found. "
            f"Please index your vault first using the index command."
        )
        raise ValueError(message) from exc

    # Embed the query with the same model used during indexing.
    encoder = SentenceTransformer(embedding_model)
    query_vector = encoder.encode(query, show_progress_bar=False)

    raw = target.query(
        query_embeddings=[query_vector.tolist()],
        n_results=limit,
    )

    # Convert ChromaDB's raw payload into typed, score-filtered results.
    return _parse_search_results(raw, min_score)
|
|
|
|
|
|
def _parse_search_results(
    raw_results: dict,
    min_score: float,
) -> List[SearchResult]:
    """
    Turn a raw ChromaDB query payload into SearchResult objects.

    ChromaDB reports distances (lower = more similar); these are mapped to
    similarity scores (higher = more similar) via ``score = 1 - distance``.

    Args:
        raw_results: Raw results dictionary from a ChromaDB ``query`` call.
        min_score: Minimum similarity score a hit must reach to be kept.

    Returns:
        SearchResult objects whose score is at least ``min_score``.
    """
    # ChromaDB nests every field one level deep — one inner list per query.
    # This module issues exactly one query, so index 0 selects its results.
    docs = raw_results.get("documents", [[]])[0]
    metas = raw_results.get("metadatas", [[]])[0]
    dists = raw_results.get("distances", [[]])[0]

    hits: List[SearchResult] = []
    for text, meta, dist in zip(docs, metas, dists):
        # Cosine distance -> cosine similarity.
        similarity = 1.0 - dist
        if similarity >= min_score:
            hits.append(
                SearchResult(
                    file_path=meta["file_path"],
                    section_title=meta["section_title"],
                    line_start=meta["line_start"],
                    line_end=meta["line_end"],
                    score=similarity,
                    text=text,
                )
            )
    return hits