Initial commit

Kodjo Sossouvi
2025-12-12 11:31:44 +01:00
commit d4925f7969
21 changed files with 2957 additions and 0 deletions

0
obsidian_rag/__init__.py Normal file

403
obsidian_rag/cli.py Normal file

@@ -0,0 +1,403 @@
"""
CLI module for Obsidian RAG Backend.
Provides command-line interface for indexing and searching the Obsidian vault.
"""
import os
from pathlib import Path
from typing import Optional
import typer
from rich.console import Console
from rich.panel import Panel
from rich.progress import Progress, SpinnerColumn, TextColumn, BarColumn, TaskProgressColumn
from indexer import index_vault
from rag_chain import RAGChain
from searcher import search_vault, SearchResult
app = typer.Typer(
name="obsidian-rag",
help="Local semantic search backend for Obsidian markdown files",
add_completion=False,
)
console = Console()
# Default ChromaDB path
DEFAULT_CHROMA_PATH = Path.home() / ".obsidian_rag" / "chroma_db"
def _truncate_path(path: str, max_len: int = 60) -> str:
"""Return a truncated version of the file path if too long."""
if len(path) <= max_len:
return path
return "..." + path[-(max_len - 3):]
@app.command()
def index(
vault_path: str = typer.Argument(
...,
help="Path to the Obsidian vault directory",
),
chroma_path: Optional[str] = typer.Option(
None,
"--chroma-path",
"-c",
help=f"Path to ChromaDB storage (default: {DEFAULT_CHROMA_PATH})",
),
collection_name: str = typer.Option(
"obsidian_vault",
"--collection",
help="Name of the ChromaDB collection",
),
max_chunk_tokens: int = typer.Option(
200,
"--max-tokens",
help="Maximum tokens per chunk",
),
overlap_tokens: int = typer.Option(
30,
"--overlap",
help="Number of overlapping tokens between chunks",
),
):
"""
Index all markdown files from the Obsidian vault into ChromaDB.
"""
vault_path_obj = Path(vault_path)
chroma_path_obj = Path(chroma_path) if chroma_path else DEFAULT_CHROMA_PATH
if not vault_path_obj.exists():
console.print(f"[red]✗ Error:[/red] Vault path does not exist: {vault_path}")
raise typer.Exit(code=1)
if not vault_path_obj.is_dir():
console.print(f"[red]✗ Error:[/red] Vault path is not a directory: {vault_path}")
raise typer.Exit(code=1)
chroma_path_obj.mkdir(parents=True, exist_ok=True)
md_files = list(vault_path_obj.rglob("*.md"))
total_files = len(md_files)
if total_files == 0:
console.print(f"[yellow]⚠ Warning:[/yellow] No markdown files found in {vault_path}")
raise typer.Exit(code=0)
console.print(f"\n[cyan]Found {total_files} markdown files to index[/cyan]\n")
# One single stable progress bar
with Progress(
SpinnerColumn(),
TextColumn("[progress.description]{task.description}"),
BarColumn(),
TaskProgressColumn(),
console=console,
) as progress:
main_task = progress.add_task("[cyan]Indexing vault...", total=total_files)
# Show the file currently being processed via the task description
# (nesting a console.status inside an active Progress would raise a LiveError)
def progress_callback(current_file: str, files_processed: int, total: int):
"""Update the progress bar with the file currently being processed."""
short_file = _truncate_path(current_file)
progress.update(main_task, completed=files_processed, description=f"[cyan]Indexing:[/cyan] [dim]{short_file}[/dim]")
try:
stats = index_vault(
vault_path=str(vault_path_obj),
chroma_db_path=str(chroma_path_obj),
collection_name=collection_name,
max_chunk_tokens=max_chunk_tokens,
overlap_tokens=overlap_tokens,
progress_callback=progress_callback,
)
progress.update(main_task, completed=total_files, description="[green]✓ Indexing completed")
except Exception as e:
console.print(f"\n[red]✗ Error during indexing:[/red] {str(e)}")
raise typer.Exit(code=1)
console.print()
_display_index_results(stats)
@app.command()
def search(
query: str = typer.Argument(
...,
help="Search query",
),
chroma_path: Optional[str] = typer.Option(
None,
"--chroma-path",
"-c",
help=f"Path to ChromaDB storage (default: {DEFAULT_CHROMA_PATH})",
),
collection_name: str = typer.Option(
"obsidian_vault",
"--collection",
help="Name of the ChromaDB collection",
),
limit: int = typer.Option(
5,
"--limit",
"-l",
help="Maximum number of results to return",
),
min_score: float = typer.Option(
0.0,
"--min-score",
"-s",
help="Minimum similarity score (0.0 to 1.0)",
),
format: str = typer.Option(
"compact",
"--format",
"-f",
help="Output format: compact (default), panel, table",
),
):
"""
Search the indexed vault for semantically similar content.
Returns relevant sections from your Obsidian notes based on
semantic similarity to the query.
"""
# Resolve paths
chroma_path_obj = Path(chroma_path) if chroma_path else DEFAULT_CHROMA_PATH
# Validate chroma path exists
if not chroma_path_obj.exists():
console.print(
f"[red]✗ Error:[/red] ChromaDB not found at {chroma_path_obj}\n"
f"Please run 'obsidian-rag index <vault_path>' first to create the index."
)
raise typer.Exit(code=1)
# Validate format
valid_formats = ["compact", "panel", "table"]
if format not in valid_formats:
console.print(f"[red]✗ Error:[/red] Invalid format '{format}'. Valid options: {', '.join(valid_formats)}")
raise typer.Exit(code=1)
# Perform search
try:
with console.status("[cyan]Searching...", spinner="dots"):
results = search_vault(
query=query,
chroma_db_path=str(chroma_path_obj),
collection_name=collection_name,
limit=limit,
min_score=min_score,
)
except ValueError as e:
console.print(f"[red]✗ Error:[/red] {str(e)}")
raise typer.Exit(code=1)
except Exception as e:
console.print(f"[red]✗ Unexpected error:[/red] {str(e)}")
raise typer.Exit(code=1)
# Display results
if not results:
console.print(f"\n[yellow]No results found for query:[/yellow] '{query}'")
if min_score > 0:
console.print(f"[dim]Try lowering --min-score (currently {min_score})[/dim]")
raise typer.Exit(code=0)
console.print(f"\n[cyan]Found {len(results)} result(s) for:[/cyan] '{query}'\n")
# Display with selected format
if format == "compact":
_display_results_compact(results)
elif format == "panel":
_display_results_panel(results)
elif format == "table":
_display_results_table(results)
@app.command()
def ask(
query: str = typer.Argument(
...,
help="Question to ask the LLM based on your Obsidian notes."
),
chroma_path: Optional[str] = typer.Option(
None,
"--chroma-path",
"-c",
help=f"Path to ChromaDB storage (default: {DEFAULT_CHROMA_PATH})",
),
collection_name: str = typer.Option(
"obsidian_vault",
"--collection",
help="Name of the ChromaDB collection",
),
top_k: int = typer.Option(
5,
"--top-k",
"-k",
help="Number of top chunks to use for context",
),
min_score: float = typer.Option(
0.0,
"--min-score",
"-s",
help="Minimum similarity score for chunks",
),
api_key: Optional[str] = typer.Option(
None,
"--api-key",
help="Clovis API key (or set CLOVIS_API_KEY environment variable)",
),
base_url: Optional[str] = typer.Option(
None,
"--base-url",
help="Clovis base URL (or set CLOVIS_BASE_URL environment variable)",
),
):
"""
Ask a question to the LLM using RAG over your Obsidian vault.
"""
# Resolve ChromaDB path
chroma_path_obj = Path(chroma_path) if chroma_path else DEFAULT_CHROMA_PATH
if not chroma_path_obj.exists():
console.print(
f"[red]✗ Error:[/red] ChromaDB not found at {chroma_path_obj}\n"
f"Please run 'obsidian-rag index <vault_path>' first to create the index."
)
raise typer.Exit(code=1)
# Resolve API key and base URL
api_key = api_key or os.getenv("CLOVIS_API_KEY")
base_url = base_url or os.getenv("CLOVIS_BASE_URL")
if not api_key or not base_url:
console.print(
"[red]✗ Error:[/red] API key or base URL not provided.\n"
"Set them via --api-key / --base-url or environment variables CLOVIS_API_KEY and CLOVIS_BASE_URL."
)
raise typer.Exit(code=1)
# Instantiate RAGChain
rag = RAGChain(
chroma_db_path=str(chroma_path_obj),
collection_name=collection_name,
top_k=top_k,
min_score=min_score,
api_key=api_key,
base_url=base_url,
)
# Get answer from RAG
try:
with console.status("[cyan]Querying LLM...", spinner="dots"):
answer, used_chunks = rag.answer_query(query)
except Exception as e:
console.print(f"[red]✗ Error:[/red] {str(e)}")
raise typer.Exit(code=1)
# Display answer
console.print("\n[bold green]Answer:[/bold green]\n")
console.print(answer + "\n")
# Display sources used
if used_chunks:
sources = ", ".join(f"{c.file_path}#L{c.line_start}-L{c.line_end}" for c in used_chunks)
console.print(f"[bold cyan]Sources:[/bold cyan] {sources}\n")
else:
console.print("[bold cyan]Sources:[/bold cyan] None\n")
def _display_index_results(stats: dict):
"""
Display indexing results with rich formatting.
Args:
stats: Statistics dictionary from index_vault
"""
files_processed = stats["files_processed"]
chunks_created = stats["chunks_created"]
errors = stats["errors"]
# Success summary
console.print(Panel(
f"[green]✓[/green] Indexing completed\n\n"
f"Files processed: [cyan]{files_processed}[/cyan]\n"
f"Chunks created: [cyan]{chunks_created}[/cyan]\n"
f"Collection: [cyan]{stats['collection_name']}[/cyan]",
title="[bold]Indexing Results[/bold]",
border_style="green",
))
# Display errors if any
if errors:
console.print(f"\n[yellow]⚠ {len(errors)} file(s) skipped due to errors:[/yellow]\n")
for error in errors:
console.print(f" [red]•[/red] {error['file']}: [dim]{error['error']}[/dim]")
def _display_results_compact(results: list[SearchResult]):
"""
Display search results in compact format.
Args:
results: List of SearchResult objects
"""
for idx, result in enumerate(results, 1):
# Format score as stars (0-5 scale)
stars = "" * int(result.score * 5)
console.print(f"[bold cyan]{idx}.[/bold cyan] {result.file_path} [dim](score: {result.score:.2f} {stars})[/dim]")
console.print(
f" Section: [yellow]{result.section_title}[/yellow] | Lines: [dim]{result.line_start}-{result.line_end}[/dim]")
# Truncate text if too long
text = result.text
if len(text) > 200:
text = text[:200] + "..."
console.print(f" {text}\n")
def _display_results_panel(results: list[SearchResult]):
"""
Display search results in panel format (rich boxes).
Args:
results: List of SearchResult objects
"""
# TODO: Implement panel format in future
console.print("[yellow]Panel format not yet implemented. Using compact format.[/yellow]\n")
_display_results_compact(results)
def _display_results_table(results: list[SearchResult]):
"""
Display search results in table format.
Args:
results: List of SearchResult objects
"""
# TODO: Implement table format in future
console.print("[yellow]Table format not yet implemented. Using compact format.[/yellow]\n")
_display_results_compact(results)
def main():
"""
Entry point for the CLI application.
"""
app()
if __name__ == "__main__":
main()
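
Because the commands are registered on a Typer app, they can also be exercised programmatically, for example in a smoke test. A minimal sketch using Typer's CliRunner (the import path is an assumption and depends on how the package ends up on sys.path):

# Sketch: drive the CLI programmatically instead of via the console script.
from typer.testing import CliRunner

from cli import app  # assumption: adjust to the actual package layout

runner = CliRunner()
result = runner.invoke(app, ["--help"])  # lists index / search / ask and their options
print(result.output)
assert result.exit_code == 0

# A real run would look like this (commented out because it triggers full indexing):
# runner.invoke(app, ["index", "/path/to/vault", "--max-tokens", "200"])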

284
obsidian_rag/indexer.py Normal file

@@ -0,0 +1,284 @@
"""
Indexer module for Obsidian RAG Backend.
This module handles the indexing of markdown files into a ChromaDB vector store
using local embeddings from sentence-transformers.
"""
from dataclasses import dataclass, asdict
from pathlib import Path
from typing import Dict, List, Optional, Callable
import chromadb
from chromadb.config import Settings
from sentence_transformers import SentenceTransformer
from markdown_parser import ParsedDocument
from markdown_parser import parse_markdown_file
# EMBEDDING_MODEL = "all-MiniLM-L6-v2"
EMBEDDING_MODEL = "all-MiniLM-L12-v2"
@dataclass
class ChunkMetadata:
file_path: str
section_title: str
line_start: int
line_end: int
@dataclass
class Chunk:
id: str
text: str
metadata: ChunkMetadata
def index_vault(
vault_path: str,
chroma_db_path: str,
collection_name: str = "obsidian_vault",
embedding_model: str = EMBEDDING_MODEL,
max_chunk_tokens: int = 200,
overlap_tokens: int = 30,
progress_callback: Optional[Callable[[str, int, int], None]] = None,
) -> Dict:
"""
Index all markdown files from vault into ChromaDB.
Args:
vault_path: Path to the Obsidian vault directory
chroma_db_path: Path where ChromaDB will store its data
collection_name: Name of the ChromaDB collection
embedding_model: Name of the sentence-transformers model to use
max_chunk_tokens: Maximum tokens per chunk
overlap_tokens: Number of overlapping tokens between chunks
progress_callback: Optional callback function called for each file processed.
Signature: callback(current_file: str, files_processed: int, total_files: int)
Returns:
Dictionary with indexing statistics:
- files_processed: Number of files successfully processed
- chunks_created: Total number of chunks created
- errors: List of errors encountered (file path and error message)
- collection_name: Name of the collection used
"""
vault_path_obj = Path(vault_path)
if not vault_path_obj.exists():
raise ValueError(f"Vault path does not exist: {vault_path}")
# Initialize embedding model and tokenizer
model = SentenceTransformer(embedding_model)
tokenizer = model.tokenizer
# Initialize ChromaDB client and collection
chroma_client = chromadb.PersistentClient(
path=chroma_db_path,
settings=Settings(anonymized_telemetry=False)
)
collection = _get_or_create_collection(chroma_client, collection_name)
# Find all markdown files
md_files = list(vault_path_obj.rglob("*.md"))
total_files = len(md_files)
# Statistics tracking
stats = {
"files_processed": 0,
"chunks_created": 0,
"errors": [],
"collection_name": collection_name,
}
# Process each file
for md_file in md_files:
# Get relative path for display
relative_path = md_file.relative_to(vault_path_obj)
# Notify callback that we're starting this file
if progress_callback:
progress_callback(str(relative_path), stats["files_processed"], total_files)
try:
# Parse markdown file
parsed_doc = parse_markdown_file(md_file)
# Create chunks from document
chunks = _create_chunks_from_document(
parsed_doc,
tokenizer,
max_chunk_tokens,
overlap_tokens,
vault_path_obj,
)
if chunks:
# Extract data for ChromaDB
documents = [chunk.text for chunk in chunks]
metadatas = [asdict(chunk.metadata) for chunk in chunks]
ids = [chunk.id for chunk in chunks]
# Generate embeddings and add to collection
embeddings = model.encode(documents, show_progress_bar=False)
collection.add(
documents=documents,
metadatas=metadatas,
ids=ids,
embeddings=embeddings.tolist(),
)
stats["chunks_created"] += len(chunks)
stats["files_processed"] += 1
except Exception as e:
# Log error but continue processing
stats["errors"].append({
"file": str(relative_path),
"error": str(e),
})
return stats
def _get_or_create_collection(
chroma_client: chromadb.PersistentClient,
collection_name: str,
) -> chromadb.Collection:
"""
Get or create a ChromaDB collection, resetting it if it already exists.
Args:
chroma_client: ChromaDB client instance
collection_name: Name of the collection
Returns:
ChromaDB collection instance
"""
try:
# Try to delete existing collection
chroma_client.delete_collection(name=collection_name)
except Exception:
# Collection doesn't exist, that's fine
pass
# Create fresh collection
collection = chroma_client.create_collection(
name=collection_name,
metadata={"hnsw:space": "cosine"} # Use cosine similarity
)
return collection
def _create_chunks_from_document(
parsed_doc: ParsedDocument,
tokenizer,
max_chunk_tokens: int,
overlap_tokens: int,
vault_path: Path,
) -> List[Chunk]:
"""
Transform a parsed document into chunks with metadata.
Implements hybrid chunking strategy:
- Short sections (≤max_chunk_tokens): one chunk per section
- Long sections (>max_chunk_tokens): split with sliding window
Args:
parsed_doc: Parsed document from markdown_parser
tokenizer: Tokenizer from sentence-transformers model
max_chunk_tokens: Maximum tokens per chunk
overlap_tokens: Number of overlapping tokens between chunks
vault_path: Path to vault root (for relative path calculation)
Returns:
List of Chunk objects (each carrying an id, text, and metadata)
"""
chunks = []
file_path = parsed_doc.file_path
relative_path = file_path.relative_to(vault_path)
for section in parsed_doc.sections:
section_text = f"{parsed_doc.title} {section.title} {section.content}"
section_title = section.title
line_start = section.start_line
line_end = section.end_line
# Tokenize section to check length
tokens = tokenizer.encode(section_text, add_special_tokens=False)
if len(tokens) <= max_chunk_tokens:
# Short section: create single chunk
chunk_id = f"{relative_path}::{section_title}::{line_start}-{line_end}"
chunks.append(Chunk(chunk_id, section_text, ChunkMetadata(str(relative_path),
section_title,
line_start,
line_end,
)))
else:
# Long section: split with sliding window
sub_chunks = _chunk_section(
section_text,
tokenizer,
max_chunk_tokens,
overlap_tokens,
)
# Create chunk for each sub-chunk
for idx, sub_chunk_text in enumerate(sub_chunks):
chunk_id = f"{relative_path}::{section_title}::{line_start}-{line_end}::chunk{idx}"
chunks.append(Chunk(chunk_id, sub_chunk_text, ChunkMetadata(str(relative_path),
section_title,
line_start,
line_end,
)))
return chunks
def _chunk_section(
section_text: str,
tokenizer,
max_chunk_tokens: int,
overlap_tokens: int,
) -> List[str]:
"""
Split a section into overlapping chunks using sliding window.
Args:
section_text: Text content to chunk
tokenizer: Tokenizer from sentence-transformers model
max_chunk_tokens: Maximum tokens per chunk
overlap_tokens: Number of overlapping tokens between chunks
Returns:
List of text chunks
"""
# Apply safety margin to prevent decode/encode inconsistencies
# from exceeding the max token limit
max_chunk_tokens_to_use = int(max_chunk_tokens * 0.98)
# Tokenize the full text
tokens = tokenizer.encode(section_text, add_special_tokens=False)
chunks = []
start_idx = 0
while start_idx < len(tokens):
# Extract chunk tokens
end_idx = start_idx + max_chunk_tokens_to_use
chunk_tokens = tokens[start_idx:end_idx]
# Decode back to text
chunk_text = tokenizer.decode(chunk_tokens, skip_special_tokens=True)
chunks.append(chunk_text)
# Move the window forward (with overlap)
step = max_chunk_tokens_to_use - overlap_tokens
# Avoid an infinite loop when overlap >= max_chunk_tokens
if step <= 0:
break
start_idx += step
return chunks
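
The sliding-window logic in _chunk_section is easier to follow with concrete numbers. A standalone sketch of the same window arithmetic, using integer counts instead of a real tokenizer (illustration only, not part of the module):

# Sketch of the sliding-window arithmetic, with integers standing in for tokens.
def window_spans(n_tokens: int, max_chunk_tokens: int, overlap_tokens: int):
    """Yield (start, end) token-index pairs for each chunk."""
    window = int(max_chunk_tokens * 0.98)  # same 2% safety margin as above
    step = window - overlap_tokens
    start = 0
    while start < n_tokens:
        yield start, min(start + window, n_tokens)
        if step <= 0:  # overlap >= window would otherwise loop forever
            break
        start += step

# A 450-token section with 200-token chunks and 30-token overlap:
print(list(window_spans(450, 200, 30)))
# -> [(0, 196), (166, 362), (332, 450)]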

74
obsidian_rag/llm_client.py Normal file

@@ -0,0 +1,74 @@
from typing import Dict
import openai
class LLMClient:
"""
Minimalist client for interacting with Clovis LLM via OpenAI SDK.
Attributes:
api_key (str): API key for Clovis.
base_url (str): Base URL for Clovis LLM gateway.
model (str): Model name to use. Defaults to 'ClovisLLM'.
"""
def __init__(self, api_key: str, base_url: str, model: str = "ClovisLLM") -> None:
if not api_key:
raise ValueError("API key is required for LLMClient.")
if not base_url:
raise ValueError("Base URL is required for LLMClient.")
self.api_key = api_key
self.base_url = base_url
self.model = model
self.client = openai.OpenAI(api_key=self.api_key, base_url=self.base_url)
def generate(self, system_prompt: str, user_prompt: str, context: str) -> Dict[str, object]:
"""
Generate a response from the LLM given a system prompt, user prompt, and context.
Args:
system_prompt (str): Instructions for the assistant.
user_prompt (str): The user's query.
context (str): Concatenated chunks from RAG search.
Returns:
Dict[str, object]: Contains:
- "answer" (str): Text generated by the LLM.
- "usage" (int): Total tokens used in the completion.
"""
# Construct user message with explicit CONTEXT / QUESTION separation
user_message_content = f"CONTEXT:\n{context}\n\nQUESTION:\n{user_prompt}"
try:
response = self.client.chat.completions.create(model=self.model,
messages=[
{"role": "system", "content": system_prompt},
{"role": "user", "content": user_message_content}
],
temperature=0.7,
max_tokens=2000,
top_p=1.0,
n=1,
# stream=False,
# presence_penalty=0.0,
# frequency_penalty=0.0,
# stop=None,
# logit_bias={},
user="obsidian_rag",
)
except Exception:
# For now (C1 minimal), re-raise and let the caller handle it
raise
# Extract text and usage
try:
answer_text = response.choices[0].message.content
total_tokens = response.usage.total_tokens
except AttributeError:
# Fallback if response structure is unexpected
answer_text = ""
total_tokens = 0
return {"answer": answer_text, "usage": total_tokens}

213
obsidian_rag/markdown_parser.py Normal file

@@ -0,0 +1,213 @@
"""Markdown parser for Obsidian vault files.
This module provides functionality to parse markdown files and extract
their structure (sections, line numbers) for semantic search indexing.
"""
import re
from dataclasses import dataclass
from pathlib import Path
from typing import List, Optional
@dataclass
class MarkdownSection:
"""Represents a section in a markdown document.
Attributes:
level: Header level (0 for no header, 1 for #, 2 for ##, etc.)
title: Section title (empty string if level=0)
content: Text content without the header line
parents: Titles of the enclosing parent sections (outermost first)
start_line: Line number where section starts (1-indexed)
end_line: Line number where section ends (1-indexed, inclusive)
"""
level: int
title: str
content: str
parents: list[str]
start_line: int
end_line: int
@dataclass
class ParsedDocument:
"""Represents a parsed markdown document.
Attributes:
file_path: Path to the markdown file
title: Document title derived from the file name (or from the vault-relative path)
sections: List of sections extracted from the document
raw_content: Full file content as string
"""
file_path: Path
title: str
sections: List[MarkdownSection]
raw_content: str
def _compute_parents(current_parents, previous_level, previous_title, current_level):
"""Compute the parent section titles for a section at ``current_level``.
Placeholder: currently returns ``current_parents`` unchanged.
"""
return current_parents
def parse_markdown_file(file_path: Path, vault_path=None) -> ParsedDocument:
"""Parse a markdown file and extract its structure.
This function reads a markdown file, identifies all header sections,
and extracts their content with precise line number tracking.
Files without headers are treated as a single section with level 0.
Args:
file_path: Path to the markdown file to parse
vault_path: Optional path to the vault root directory; when given, the title is built from the file's path relative to it.
Returns:
ParsedDocument containing the file structure and content
Raises:
FileNotFoundError: If the file does not exist
Example:
>>> doc = parse_markdown_file(Path("notes/example.md"))
>>> print(f"Found {len(doc.sections)} sections")
>>> print(doc.sections[0].title)
"""
if not file_path.exists():
raise FileNotFoundError(f"File not found: {file_path}")
if vault_path:
title = str(file_path.relative_to(vault_path)).replace(".md", "")
title = title.replace("\\", " ").replace("/", " ")
else:
title = file_path.stem
raw_content = file_path.read_text(encoding="utf-8")
lines = raw_content.splitlines()
sections: List[MarkdownSection] = []
current_section_start = 1
current_level = 0
current_title = ""
current_parents = []
current_content_lines: List[str] = []
header_pattern = re.compile(r"^(#{1,6})\s+(.+)$")
for line_num, line in enumerate(lines, start=1):
match = header_pattern.match(line)
if match:
# Save the previous section only if it actually has content.
if current_content_lines:
content = "\n".join(current_content_lines)
sections.append(
MarkdownSection(
level=current_level,
title=current_title,
content=content,
parents=current_parents,
start_line=current_section_start,
end_line=line_num - 1,
)
)
# Start a new section with the detected header.
previous_level = current_level
previous_title = current_title
current_level = len(match.group(1))
current_title = match.group(2).strip()
current_section_start = line_num
current_parents = _compute_parents(current_parents, previous_level, previous_title, current_level)
current_content_lines = []
else:
current_content_lines.append(line)
# Handle the final section (or whole file if no headers were found).
if lines:
content = "\n".join(current_content_lines)
end_line = len(lines)
# Case 1: no header was ever found.
if not sections and current_level == 0:
sections.append(
MarkdownSection(
level=0,
title="",
content=content,
parents=current_parents,
start_line=1,
end_line=end_line,
)
)
# Case 2: a single header was found (sections is still empty but we have a title).
elif not sections:
sections.append(
MarkdownSection(
level=current_level,
title=current_title,
content=content,
parents=current_parents,
start_line=current_section_start,
end_line=end_line,
)
)
# Case 3: multiple headers were found (sections already contains the earlier ones).
else:
sections.append(
MarkdownSection(
level=current_level,
title=current_title,
content=content,
parents=current_parents,
start_line=current_section_start,
end_line=end_line,
)
)
else:
# Empty file: create a single empty level-0 section.
sections.append(
MarkdownSection(
level=0,
title="",
content="",
parents=[],
start_line=1,
end_line=1,
)
)
return ParsedDocument(
file_path=file_path,
title=title,
sections=sections,
raw_content=raw_content,
)
def find_section_at_line(
document: ParsedDocument,
line_number: int,
) -> Optional[MarkdownSection]:
"""Find which section contains a given line number.
This function searches through the document's sections to find
which section contains the specified line number.
Args:
document: Parsed markdown document
line_number: Line number to search for (1-indexed)
Returns:
MarkdownSection containing the line, or None if line number
is invalid or out of range
Example:
>>> section = find_section_at_line(doc, 42)
>>> if section:
... print(f"Line 42 is in section: {section.title}")
"""
if line_number < 1:
return None
for section in document.sections:
if section.start_line <= line_number <= section.end_line:
return section
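
A self-contained sketch of the parser on a throwaway note (temporary file and example content only):

# Sketch: parse a tiny markdown file and look up a section by line number.
import tempfile
from pathlib import Path

from markdown_parser import parse_markdown_file, find_section_at_line

note = "# Projects\nIdeas for 2025.\n\n## RAG backend\nIndex the vault with ChromaDB.\n"
with tempfile.TemporaryDirectory() as tmp:
    path = Path(tmp) / "example.md"
    path.write_text(note, encoding="utf-8")

    doc = parse_markdown_file(path)
    print([s.title for s in doc.sections])     # ['Projects', 'RAG backend']
    section = find_section_at_line(doc, 5)
    print(section.title if section else None)  # RAG backend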

96
obsidian_rag/rag_chain.py Normal file

@@ -0,0 +1,96 @@
# File: obsidian_rag/rag_chain.py
from pathlib import Path
from typing import List, Optional, Tuple
from indexer import EMBEDDING_MODEL
from llm_client import LLMClient
from searcher import search_vault, SearchResult
class RAGChain:
"""
Retrieval-Augmented Generation (RAG) chain for answering queries
using semantic search over an Obsidian vault and LLM.
Attributes:
chroma_db_path (Path): Path to ChromaDB.
collection_name (str): Chroma collection name.
embedding_model (str): Embedding model name.
top_k (int): Number of chunks to send to the LLM.
min_score (float): Minimum similarity score for chunks.
system_prompt (str): System prompt to instruct the LLM.
llm_client (LLMClient): Internal LLM client instance.
"""
DEFAULT_SYSTEM_PROMPT = (
"You are an assistant specialized in analyzing Obsidian notes.\n\n"
"INSTRUCTIONS:\n"
"- Answer based ONLY on the provided context\n"
"- Cite the sources (files) you use\n"
"- If the information is not in the context, say \"I did not find this information in your notes\"\n"
"- Be concise but thorough\n"
"- Structure your answer with sections if necessary"
)
def __init__(
self,
chroma_db_path: str,
api_key: str,
base_url: str,
collection_name: str = "obsidian_vault",
embedding_model: str = EMBEDDING_MODEL,
top_k: int = 5,
min_score: float = 0.0,
system_prompt: Optional[str] = None,
) -> None:
self.chroma_db_path = Path(chroma_db_path)
self.collection_name = collection_name
self.embedding_model = embedding_model
self.top_k = top_k
self.min_score = min_score
self.system_prompt = system_prompt or self.DEFAULT_SYSTEM_PROMPT
# Instantiate internal LLM client
self.llm_client = LLMClient(api_key=api_key, base_url=base_url)
def answer_query(self, query: str) -> Tuple[str, List[SearchResult]]:
"""
Answer a user query using RAG: search vault, build context, call LLM.
Args:
query (str): User query.
Returns:
Tuple[str, List[SearchResult]]:
- LLM answer (str)
- List of used SearchResult chunks
"""
# 1. Perform semantic search
chunks: List[SearchResult] = search_vault(
query=query,
chroma_db_path=str(self.chroma_db_path),
collection_name=self.collection_name,
embedding_model=self.embedding_model,
limit=self.top_k,
min_score=self.min_score,
)
# 2. Build context string with citations
context_parts: List[str] = []
for chunk in chunks:
chunk_text = chunk.text.strip()
citation = f"[{chunk.file_path}#L{chunk.line_start}-L{chunk.line_end}]"
context_parts.append(f"{chunk_text}\n{citation}")
context_str = "\n\n".join(context_parts) if context_parts else ""
# 3. Call LLM with context + question
llm_response = self.llm_client.generate(
system_prompt=self.system_prompt,
user_prompt=query,
context=context_str,
)
answer_text = llm_response.get("answer", "")
return answer_text, chunks
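
An end-to-end usage sketch, assuming the vault has already been indexed into the default ChromaDB path and the Clovis credentials are in the environment (the question is just an example):

# Sketch: ask a question over an already-indexed vault.
import os
from pathlib import Path

from rag_chain import RAGChain  # assumption: adjust to the actual package layout

rag = RAGChain(
    chroma_db_path=str(Path.home() / ".obsidian_rag" / "chroma_db"),
    api_key=os.environ["CLOVIS_API_KEY"],
    base_url=os.environ["CLOVIS_BASE_URL"],
    top_k=5,
    min_score=0.2,
)
answer, chunks = rag.answer_query("What did I write about ChromaDB?")
print(answer)
for c in chunks:
    print(f"- {c.file_path}#L{c.line_start}-L{c.line_end} (score {c.score:.2f})")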

131
obsidian_rag/searcher.py Normal file

@@ -0,0 +1,131 @@
"""
Searcher module for Obsidian RAG Backend.
This module handles semantic search operations on the indexed ChromaDB collection.
"""
from dataclasses import dataclass
from typing import List
import chromadb
from chromadb.config import Settings
from sentence_transformers import SentenceTransformer
from indexer import EMBEDDING_MODEL
@dataclass
class SearchResult:
"""
Represents a single search result with metadata and relevance score.
"""
file_path: str
section_title: str
line_start: int
line_end: int
score: float
text: str
def search_vault(
query: str,
chroma_db_path: str,
collection_name: str = "obsidian_vault",
embedding_model: str = EMBEDDING_MODEL,
limit: int = 5,
min_score: float = 0.0,
) -> List[SearchResult]:
"""
Search the indexed vault for semantically similar content.
Args:
query: Search query string
chroma_db_path: Path to ChromaDB data directory
collection_name: Name of the ChromaDB collection to search
embedding_model: Model used for embeddings (must match indexing model)
limit: Maximum number of results to return
min_score: Minimum similarity score threshold (0.0 to 1.0)
Returns:
List of SearchResult objects, sorted by relevance (highest score first)
Raises:
ValueError: If the collection does not exist or query is empty
"""
if not query or not query.strip():
raise ValueError("Query cannot be empty")
# Initialize ChromaDB client
chroma_client = chromadb.PersistentClient(
path=chroma_db_path,
settings=Settings(anonymized_telemetry=False)
)
# Get collection (will raise if it doesn't exist)
try:
collection = chroma_client.get_collection(name=collection_name)
except Exception as e:
raise ValueError(
f"Collection '{collection_name}' not found. "
f"Please index your vault first using the index command."
) from e
# Initialize embedding model (same as used during indexing)
model = SentenceTransformer(embedding_model)
# Generate query embedding
query_embedding = model.encode(query, show_progress_bar=False)
# Perform search
results = collection.query(
query_embeddings=[query_embedding.tolist()],
n_results=limit,
)
# Parse and format results
search_results = _parse_search_results(results, min_score)
return search_results
def _parse_search_results(
raw_results: dict,
min_score: float,
) -> List[SearchResult]:
"""
Parse ChromaDB query results into SearchResult objects.
ChromaDB returns distances (lower = more similar). We convert to
similarity scores (higher = more similar) using: score = 1 - distance
Args:
raw_results: Raw results dictionary from ChromaDB query
min_score: Minimum similarity score to include
Returns:
List of SearchResult objects filtered by min_score
"""
search_results = []
# ChromaDB returns results as lists of lists (one list per query)
# We only have one query, so we take the first element
documents = raw_results.get("documents", [[]])[0]
metadatas = raw_results.get("metadatas", [[]])[0]
distances = raw_results.get("distances", [[]])[0]
for doc, metadata, distance in zip(documents, metadatas, distances):
# Convert distance to similarity score (cosine distance -> cosine similarity)
score = 1.0 - distance
# Filter by minimum score
if score < min_score:
continue
search_results.append(SearchResult(
file_path=metadata["file_path"],
section_title=metadata["section_title"],
line_start=metadata["line_start"],
line_end=metadata["line_end"],
score=score,
text=doc,
))
return search_results
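
A direct search sketch against the same default ChromaDB path, without the LLM step (the index must exist; the query string is an example):

# Sketch: semantic search only, no LLM.
from pathlib import Path

from searcher import search_vault  # assumption: adjust to the actual package layout

results = search_vault(
    query="meeting notes about the roadmap",
    chroma_db_path=str(Path.home() / ".obsidian_rag" / "chroma_db"),
    limit=3,
    min_score=0.3,  # scores are 1.0 - cosine distance, so higher means more similar
)
for r in results:
    print(f"{r.score:.2f}  {r.file_path}  [{r.section_title}]  lines {r.line_start}-{r.line_end}")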