Initial commit
0 obsidian_rag/__init__.py Normal file
403 obsidian_rag/cli.py Normal file
@@ -0,0 +1,403 @@
"""
|
||||
CLI module for Obsidian RAG Backend.
|
||||
|
||||
Provides command-line interface for indexing and searching the Obsidian vault.
|
||||
"""
|
||||
import os
|
||||
from pathlib import Path
|
||||
from typing import Optional
|
||||
|
||||
import typer
|
||||
from rich.console import Console
|
||||
from rich.panel import Panel
|
||||
from rich.progress import Progress, SpinnerColumn, TextColumn, BarColumn, TaskProgressColumn
|
||||
|
||||
from indexer import index_vault
|
||||
from rag_chain import RAGChain
|
||||
from searcher import search_vault, SearchResult
|
||||
|
||||
app = typer.Typer(
|
||||
name="obsidian-rag",
|
||||
help="Local semantic search backend for Obsidian markdown files",
|
||||
add_completion=False,
|
||||
)
|
||||
console = Console()
|
||||
|
||||
# Default ChromaDB path
|
||||
DEFAULT_CHROMA_PATH = Path.home() / ".obsidian_rag" / "chroma_db"
|
||||
|
||||
|
||||
def _truncate_path(path: str, max_len: int = 60) -> str:
|
||||
"""Return a truncated version of the file path if too long."""
|
||||
if len(path) <= max_len:
|
||||
return path
|
||||
return "..." + path[-(max_len - 3):]
|
||||
|
||||
|
||||
@app.command()
|
||||
def index(
|
||||
vault_path: str = typer.Argument(
|
||||
...,
|
||||
help="Path to the Obsidian vault directory",
|
||||
),
|
||||
chroma_path: Optional[str] = typer.Option(
|
||||
None,
|
||||
"--chroma-path",
|
||||
"-c",
|
||||
help=f"Path to ChromaDB storage (default: {DEFAULT_CHROMA_PATH})",
|
||||
),
|
||||
collection_name: str = typer.Option(
|
||||
"obsidian_vault",
|
||||
"--collection",
|
||||
help="Name of the ChromaDB collection",
|
||||
),
|
||||
max_chunk_tokens: int = typer.Option(
|
||||
200,
|
||||
"--max-tokens",
|
||||
help="Maximum tokens per chunk",
|
||||
),
|
||||
overlap_tokens: int = typer.Option(
|
||||
30,
|
||||
"--overlap",
|
||||
help="Number of overlapping tokens between chunks",
|
||||
),
|
||||
):
|
||||
"""
|
||||
Index all markdown files from the Obsidian vault into ChromaDB.
|
||||
"""
|
||||
vault_path_obj = Path(vault_path)
|
||||
chroma_path_obj = Path(chroma_path) if chroma_path else DEFAULT_CHROMA_PATH
|
||||
|
||||
if not vault_path_obj.exists():
|
||||
console.print(f"[red]✗ Error:[/red] Vault path does not exist: {vault_path}")
|
||||
raise typer.Exit(code=1)
|
||||
|
||||
if not vault_path_obj.is_dir():
|
||||
console.print(f"[red]✗ Error:[/red] Vault path is not a directory: {vault_path}")
|
||||
raise typer.Exit(code=1)
|
||||
|
||||
chroma_path_obj.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
md_files = list(vault_path_obj.rglob("*.md"))
|
||||
total_files = len(md_files)
|
||||
|
||||
if total_files == 0:
|
||||
console.print(f"[yellow]⚠ Warning:[/yellow] No markdown files found in {vault_path}")
|
||||
raise typer.Exit(code=0)
|
||||
|
||||
console.print(f"\n[cyan]Found {total_files} markdown files to index[/cyan]\n")
|
||||
|
||||
# One single stable progress bar
|
||||
with Progress(
|
||||
SpinnerColumn(),
|
||||
TextColumn("[progress.description]{task.description}"),
|
||||
BarColumn(),
|
||||
TaskProgressColumn(),
|
||||
console=console,
|
||||
) as progress:
|
||||
|
||||
main_task = progress.add_task("[cyan]Indexing vault...", total=total_files)

        # Rich allows only one live display at a time, so the current file is
        # shown in the task description rather than in a separate status line.
        def progress_callback(current_file: str, files_processed: int, total: int):
            """Update the progress bar and the current-file message."""
            short_file = _truncate_path(current_file)
            progress.update(
                main_task,
                completed=files_processed,
                description=f"[cyan]Indexing: [dim]{short_file}",
            )

        try:
            stats = index_vault(
                vault_path=str(vault_path_obj),
                chroma_db_path=str(chroma_path_obj),
                collection_name=collection_name,
                max_chunk_tokens=max_chunk_tokens,
                overlap_tokens=overlap_tokens,
                progress_callback=progress_callback,
            )

            progress.update(main_task, completed=total_files, description="[green]✓ Completed")

        except Exception as e:
            console.print(f"\n[red]✗ Error during indexing:[/red] {str(e)}")
            raise typer.Exit(code=1)

    console.print()
    _display_index_results(stats)


@app.command()
def search(
    query: str = typer.Argument(
        ...,
        help="Search query",
    ),
    chroma_path: Optional[str] = typer.Option(
        None,
        "--chroma-path",
        "-c",
        help=f"Path to ChromaDB storage (default: {DEFAULT_CHROMA_PATH})",
    ),
    collection_name: str = typer.Option(
        "obsidian_vault",
        "--collection",
        help="Name of the ChromaDB collection",
    ),
    limit: int = typer.Option(
        5,
        "--limit",
        "-l",
        help="Maximum number of results to return",
    ),
    min_score: float = typer.Option(
        0.0,
        "--min-score",
        "-s",
        help="Minimum similarity score (0.0 to 1.0)",
    ),
    format: str = typer.Option(
        "compact",
        "--format",
        "-f",
        help="Output format: compact (default), panel, table",
    ),
):
    """
    Search the indexed vault for semantically similar content.

    Returns relevant sections from your Obsidian notes based on
    semantic similarity to the query.
    """
    # Resolve paths
    chroma_path_obj = Path(chroma_path) if chroma_path else DEFAULT_CHROMA_PATH

    # Validate that the chroma path exists
    if not chroma_path_obj.exists():
        console.print(
            f"[red]✗ Error:[/red] ChromaDB not found at {chroma_path_obj}\n"
            f"Please run 'obsidian-rag index <vault_path>' first to create the index."
        )
        raise typer.Exit(code=1)

    # Validate the format
    valid_formats = ["compact", "panel", "table"]
    if format not in valid_formats:
        console.print(f"[red]✗ Error:[/red] Invalid format '{format}'. Valid options: {', '.join(valid_formats)}")
        raise typer.Exit(code=1)

    # Perform the search
    try:
        with console.status("[cyan]Searching...", spinner="dots"):
            results = search_vault(
                query=query,
                chroma_db_path=str(chroma_path_obj),
                collection_name=collection_name,
                limit=limit,
                min_score=min_score,
            )
    except ValueError as e:
        console.print(f"[red]✗ Error:[/red] {str(e)}")
        raise typer.Exit(code=1)
    except Exception as e:
        console.print(f"[red]✗ Unexpected error:[/red] {str(e)}")
        raise typer.Exit(code=1)

    # Display the results
    if not results:
        console.print(f"\n[yellow]No results found for query:[/yellow] '{query}'")
        if min_score > 0:
            console.print(f"[dim]Try lowering --min-score (currently {min_score})[/dim]")
        raise typer.Exit(code=0)

    console.print(f"\n[cyan]Found {len(results)} result(s) for:[/cyan] '{query}'\n")

    # Display with the selected format
    if format == "compact":
        _display_results_compact(results)
    elif format == "panel":
        _display_results_panel(results)
    elif format == "table":
        _display_results_table(results)


@app.command()
def ask(
    query: str = typer.Argument(
        ...,
        help="Question to ask the LLM based on your Obsidian notes",
    ),
    chroma_path: Optional[str] = typer.Option(
        None,
        "--chroma-path",
        "-c",
        help=f"Path to ChromaDB storage (default: {DEFAULT_CHROMA_PATH})",
    ),
    collection_name: str = typer.Option(
        "obsidian_vault",
        "--collection",
        help="Name of the ChromaDB collection",
    ),
    top_k: int = typer.Option(
        5,
        "--top-k",
        "-k",
        help="Number of top chunks to use for context",
    ),
    min_score: float = typer.Option(
        0.0,
        "--min-score",
        "-s",
        help="Minimum similarity score for chunks",
    ),
    api_key: Optional[str] = typer.Option(
        None,
        "--api-key",
        help="Clovis API key (or set CLOVIS_API_KEY environment variable)",
    ),
    base_url: Optional[str] = typer.Option(
        None,
        "--base-url",
        help="Clovis base URL (or set CLOVIS_BASE_URL environment variable)",
    ),
):
    """
    Ask a question to the LLM using RAG over your Obsidian vault.
    """
    # Resolve the ChromaDB path
    chroma_path_obj = Path(chroma_path) if chroma_path else DEFAULT_CHROMA_PATH
    if not chroma_path_obj.exists():
        console.print(
            f"[red]✗ Error:[/red] ChromaDB not found at {chroma_path_obj}\n"
            f"Please run 'obsidian-rag index <vault_path>' first to create the index."
        )
        raise typer.Exit(code=1)

    # Resolve the API key and base URL
    api_key = api_key or os.getenv("CLOVIS_API_KEY")
    base_url = base_url or os.getenv("CLOVIS_BASE_URL")
    if not api_key or not base_url:
        console.print(
            "[red]✗ Error:[/red] API key or base URL not provided.\n"
            "Set them via --api-key / --base-url or the environment variables CLOVIS_API_KEY and CLOVIS_BASE_URL."
        )
        raise typer.Exit(code=1)

    # Instantiate the RAG chain
    rag = RAGChain(
        chroma_db_path=str(chroma_path_obj),
        collection_name=collection_name,
        top_k=top_k,
        min_score=min_score,
        api_key=api_key,
        base_url=base_url,
    )

    # Get the answer from RAG
    try:
        with console.status("[cyan]Querying LLM...", spinner="dots"):
            answer, used_chunks = rag.answer_query(query)
    except Exception as e:
        console.print(f"[red]✗ Error:[/red] {str(e)}")
        raise typer.Exit(code=1)

    # Display the answer
    console.print("\n[bold green]Answer:[/bold green]\n")
    console.print(answer + "\n")

    # Display the sources used
    if used_chunks:
        sources = ", ".join(f"{c.file_path}#L{c.line_start}-L{c.line_end}" for c in used_chunks)
        console.print(f"[bold cyan]Sources:[/bold cyan] {sources}\n")
    else:
        console.print("[bold cyan]Sources:[/bold cyan] None\n")


def _display_index_results(stats: dict):
    """
    Display indexing results with rich formatting.

    Args:
        stats: Statistics dictionary from index_vault
    """
    files_processed = stats["files_processed"]
    chunks_created = stats["chunks_created"]
    errors = stats["errors"]

    # Success summary
    console.print(Panel(
        f"[green]✓[/green] Indexing completed\n\n"
        f"Files processed: [cyan]{files_processed}[/cyan]\n"
        f"Chunks created: [cyan]{chunks_created}[/cyan]\n"
        f"Collection: [cyan]{stats['collection_name']}[/cyan]",
        title="[bold]Indexing Results[/bold]",
        border_style="green",
    ))

    # Display errors, if any
    if errors:
        console.print(f"\n[yellow]⚠ {len(errors)} file(s) skipped due to errors:[/yellow]\n")
        for error in errors:
            console.print(f"  [red]•[/red] {error['file']}: [dim]{error['error']}[/dim]")


def _display_results_compact(results: list[SearchResult]):
    """
    Display search results in compact format.

    Args:
        results: List of SearchResult objects
    """
    for idx, result in enumerate(results, 1):
        # Format the score as stars (0-5 scale)
        stars = "⭐" * int(result.score * 5)

        console.print(f"[bold cyan]{idx}.[/bold cyan] {result.file_path} [dim](score: {result.score:.2f} {stars})[/dim]")
        console.print(
            f"   Section: [yellow]{result.section_title}[/yellow] | Lines: [dim]{result.line_start}-{result.line_end}[/dim]")

        # Truncate the text if it is too long
        text = result.text
        if len(text) > 200:
            text = text[:200] + "..."

        console.print(f"   {text}\n")


def _display_results_panel(results: list[SearchResult]):
    """
    Display search results in panel format (rich boxes).

    Args:
        results: List of SearchResult objects
    """
    # TODO: Implement the panel format in the future
    console.print("[yellow]Panel format not yet implemented. Using compact format.[/yellow]\n")
    _display_results_compact(results)


def _display_results_table(results: list[SearchResult]):
    """
    Display search results in table format.

    Args:
        results: List of SearchResult objects
    """
    # TODO: Implement the table format in the future
    console.print("[yellow]Table format not yet implemented. Using compact format.[/yellow]\n")
    _display_results_compact(results)


def main():
    """
    Entry point for the CLI application.
    """
    app()


if __name__ == "__main__":
    main()
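
A quick sketch of exercising this CLI in-process with Typer's test runner; the query and options below are hypothetical.

# Hypothetical in-process invocation using typer.testing.CliRunner.
from typer.testing import CliRunner

from cli import app

runner = CliRunner()
result = runner.invoke(app, ["search", "project ideas", "--limit", "3"])
print(result.output)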
284 obsidian_rag/indexer.py Normal file
@@ -0,0 +1,284 @@
"""
|
||||
Indexer module for Obsidian RAG Backend.
|
||||
|
||||
This module handles the indexing of markdown files into a ChromaDB vector store
|
||||
using local embeddings from sentence-transformers.
|
||||
"""
|
||||
from dataclasses import dataclass, asdict
|
||||
from pathlib import Path
|
||||
from typing import Dict, List, Optional, Callable
|
||||
|
||||
import chromadb
|
||||
from chromadb.config import Settings
|
||||
from sentence_transformers import SentenceTransformer
|
||||
|
||||
from markdown_parser import ParsedDocument
|
||||
from markdown_parser import parse_markdown_file
|
||||
|
||||
# EMBEDDING_MODEL = "all-MiniLM-L6-v2"
|
||||
EMBEDDING_MODEL = "all-MiniLM-L12-v2"
|
||||
|
||||
|
||||
@dataclass
|
||||
class ChunkMetadata:
|
||||
file_path: str
|
||||
section_title: str
|
||||
line_start: int
|
||||
line_end: int
|
||||
|
||||
|
||||
@dataclass
|
||||
class Chunk:
|
||||
id: str
|
||||
text: str
|
||||
metadata: ChunkMetadata
|
||||
|
||||
|
||||
def index_vault(
|
||||
vault_path: str,
|
||||
chroma_db_path: str,
|
||||
collection_name: str = "obsidian_vault",
|
||||
embedding_model: str = EMBEDDING_MODEL,
|
||||
max_chunk_tokens: int = 200,
|
||||
overlap_tokens: int = 30,
|
||||
progress_callback: Optional[Callable[[str, int, int], None]] = None,
|
||||
) -> Dict:
|
||||
"""
|
||||
Index all markdown files from vault into ChromaDB.
|
||||
|
||||
Args:
|
||||
vault_path: Path to the Obsidian vault directory
|
||||
chroma_db_path: Path where ChromaDB will store its data
|
||||
collection_name: Name of the ChromaDB collection
|
||||
embedding_model: Name of the sentence-transformers model to use
|
||||
max_chunk_tokens: Maximum tokens per chunk
|
||||
overlap_tokens: Number of overlapping tokens between chunks
|
||||
progress_callback: Optional callback function called for each file processed.
|
||||
Signature: callback(current_file: str, files_processed: int, total_files: int)
|
||||
|
||||
Returns:
|
||||
Dictionary with indexing statistics:
|
||||
- files_processed: Number of files successfully processed
|
||||
- chunks_created: Total number of chunks created
|
||||
- errors: List of errors encountered (file path and error message)
|
||||
- collection_name: Name of the collection used
|
||||
"""
|
||||
|
||||
vault_path_obj = Path(vault_path)
|
||||
if not vault_path_obj.exists():
|
||||
raise ValueError(f"Vault path does not exist: {vault_path}")
|
||||
|
||||
# Initialize embedding model and tokenizer
|
||||
model = SentenceTransformer(embedding_model)
|
||||
tokenizer = model.tokenizer
|
||||
|
||||
# Initialize ChromaDB client and collection
|
||||
chroma_client = chromadb.PersistentClient(
|
||||
path=chroma_db_path,
|
||||
settings=Settings(anonymized_telemetry=False)
|
||||
)
|
||||
collection = _get_or_create_collection(chroma_client, collection_name)
|
||||
|
||||
# Find all markdown files
|
||||
md_files = list(vault_path_obj.rglob("*.md"))
|
||||
total_files = len(md_files)
|
||||
|
||||
# Statistics tracking
|
||||
stats = {
|
||||
"files_processed": 0,
|
||||
"chunks_created": 0,
|
||||
"errors": [],
|
||||
"collection_name": collection_name,
|
||||
}
|
||||
|
||||
# Process each file
|
||||
for md_file in md_files:
|
||||
# Get relative path for display
|
||||
relative_path = md_file.relative_to(vault_path_obj)
|
||||
|
||||
# Notify callback that we're starting this file
|
||||
if progress_callback:
|
||||
progress_callback(str(relative_path), stats["files_processed"], total_files)
|
||||
|
||||
try:
|
||||
# Parse markdown file
|
||||
parsed_doc = parse_markdown_file(md_file)
|
||||
|
||||
# Create chunks from document
|
||||
chunks = _create_chunks_from_document(
|
||||
parsed_doc,
|
||||
tokenizer,
|
||||
max_chunk_tokens,
|
||||
overlap_tokens,
|
||||
vault_path_obj,
|
||||
)
|
||||
|
||||
if chunks:
|
||||
# Extract data for ChromaDB
|
||||
documents = [chunk.text for chunk in chunks]
|
||||
metadatas = [asdict(chunk.metadata) for chunk in chunks]
|
||||
ids = [chunk.id for chunk in chunks]
|
||||
|
||||
# Generate embeddings and add to collection
|
||||
embeddings = model.encode(documents, show_progress_bar=False)
|
||||
collection.add(
|
||||
documents=documents,
|
||||
metadatas=metadatas,
|
||||
ids=ids,
|
||||
embeddings=embeddings.tolist(),
|
||||
)
|
||||
|
||||
stats["chunks_created"] += len(chunks)
|
||||
|
||||
stats["files_processed"] += 1
|
||||
|
||||
except Exception as e:
|
||||
# Log error but continue processing
|
||||
stats["errors"].append({
|
||||
"file": str(relative_path),
|
||||
"error": str(e),
|
||||
})
|
||||
|
||||
return stats
|
||||
|
||||
|
||||
def _get_or_create_collection(
|
||||
chroma_client: chromadb.PersistentClient,
|
||||
collection_name: str,
|
||||
) -> chromadb.Collection:
|
||||
"""
|
||||
Get or create a ChromaDB collection, resetting it if it already exists.
|
||||
|
||||
Args:
|
||||
chroma_client: ChromaDB client instance
|
||||
collection_name: Name of the collection
|
||||
|
||||
Returns:
|
||||
ChromaDB collection instance
|
||||
"""
|
||||
try:
|
||||
# Try to delete existing collection
|
||||
chroma_client.delete_collection(name=collection_name)
|
||||
except Exception:
|
||||
# Collection doesn't exist, that's fine
|
||||
pass
|
||||
|
||||
# Create fresh collection
|
||||
collection = chroma_client.create_collection(
|
||||
name=collection_name,
|
||||
metadata={"hnsw:space": "cosine"} # Use cosine similarity
|
||||
)
|
||||
|
||||
return collection
|
||||
|
||||
|
||||
def _create_chunks_from_document(
|
||||
parsed_doc: ParsedDocument,
|
||||
tokenizer,
|
||||
max_chunk_tokens: int,
|
||||
overlap_tokens: int,
|
||||
vault_path: Path,
|
||||
) -> List[Chunk]:
|
||||
"""
|
||||
Transform a parsed document into chunks with metadata.
|
||||
|
||||
Implements hybrid chunking strategy:
|
||||
- Short sections (≤max_chunk_tokens): one chunk per section
|
||||
- Long sections (>max_chunk_tokens): split with sliding window
|
||||
|
||||
Args:
|
||||
parsed_doc: Parsed document from markdown_parser
|
||||
tokenizer: Tokenizer from sentence-transformers model
|
||||
max_chunk_tokens: Maximum tokens per chunk
|
||||
overlap_tokens: Number of overlapping tokens between chunks
|
||||
vault_path: Path to vault root (for relative path calculation)
|
||||
|
||||
Returns:
|
||||
List of chunk dictionaries with 'text', 'metadata', and 'id' keys
|
||||
"""
|
||||
chunks = []
|
||||
file_path = parsed_doc.file_path
|
||||
relative_path = file_path.relative_to(vault_path)
|
||||
|
||||
for section in parsed_doc.sections:
|
||||
section_text = f"{parsed_doc.title} {section.title} {section.content}"
|
||||
section_title = section.title
|
||||
line_start = section.start_line
|
||||
line_end = section.end_line
|
||||
|
||||
# Tokenize section to check length
|
||||
tokens = tokenizer.encode(section_text, add_special_tokens=False)
|
||||
|
||||
if len(tokens) <= max_chunk_tokens:
|
||||
# Short section: create single chunk
|
||||
chunk_id = f"{relative_path}::{section_title}::{line_start}-{line_end}"
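            # e.g. "notes/projects.md::Ideas::12-40" (hypothetical vault file)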
            metadata = ChunkMetadata(str(relative_path), section_title, line_start, line_end)
            chunks.append(Chunk(chunk_id, section_text, metadata))
        else:
            # Long section: split with a sliding window
            sub_chunks = _chunk_section(
                section_text,
                tokenizer,
                max_chunk_tokens,
                overlap_tokens,
            )

            # Create a chunk for each sub-chunk
            for idx, sub_chunk_text in enumerate(sub_chunks):
                chunk_id = f"{relative_path}::{section_title}::{line_start}-{line_end}::chunk{idx}"
                metadata = ChunkMetadata(str(relative_path), section_title, line_start, line_end)
                chunks.append(Chunk(chunk_id, sub_chunk_text, metadata))
    return chunks


def _chunk_section(
    section_text: str,
    tokenizer,
    max_chunk_tokens: int,
    overlap_tokens: int,
) -> List[str]:
    """
    Split a section into overlapping chunks using a sliding window.

    Args:
        section_text: Text content to chunk
        tokenizer: Tokenizer from the sentence-transformers model
        max_chunk_tokens: Maximum tokens per chunk
        overlap_tokens: Number of overlapping tokens between chunks

    Returns:
        List of text chunks
    """
    # Apply a safety margin to prevent decode/encode inconsistencies
    # from exceeding the max token limit
    max_chunk_tokens_to_use = int(max_chunk_tokens * 0.98)
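    # With the defaults (max_chunk_tokens=200, overlap_tokens=30), the effective
    # window is int(200 * 0.98) = 196 tokens and each step advances 196 - 30 = 166
    # tokens, so a 400-token section yields windows [0:196], [166:362], [332:400].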

    # Tokenize the full text
    tokens = tokenizer.encode(section_text, add_special_tokens=False)

    chunks = []
    start_idx = 0
    step = max_chunk_tokens_to_use - overlap_tokens

    while start_idx < len(tokens):
        # Extract the chunk tokens
        end_idx = start_idx + max_chunk_tokens_to_use
        chunk_tokens = tokens[start_idx:end_idx]

        # Decode back to text
        chunk_text = tokenizer.decode(chunk_tokens, skip_special_tokens=True)
        chunks.append(chunk_text)

        # Avoid an infinite loop if overlap >= max_chunk_tokens
        if step <= 0:
            break

        # Move the window forward (with overlap)
        start_idx += step

    return chunks
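
A minimal sketch of driving index_vault directly; the vault location below is hypothetical.

# Hypothetical direct use of index_vault with a simple progress callback.
from pathlib import Path

from indexer import index_vault


def on_progress(current_file: str, done: int, total: int) -> None:
    print(f"[{done}/{total}] {current_file}")


stats = index_vault(
    vault_path=str(Path.home() / "Notes"),  # hypothetical vault location
    chroma_db_path=str(Path.home() / ".obsidian_rag" / "chroma_db"),
    progress_callback=on_progress,
)
print(f"{stats['files_processed']} files -> {stats['chunks_created']} chunks")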
74 obsidian_rag/llm_client.py Normal file
@@ -0,0 +1,74 @@
from typing import Dict

import openai


class LLMClient:
    """
    Minimalist client for interacting with the Clovis LLM via the OpenAI SDK.

    Attributes:
        api_key (str): API key for Clovis.
        base_url (str): Base URL for the Clovis LLM gateway.
        model (str): Model name to use. Defaults to 'ClovisLLM'.
    """

    def __init__(self, api_key: str, base_url: str, model: str = "ClovisLLM") -> None:
        if not api_key:
            raise ValueError("API key is required for LLMClient.")
        if not base_url:
            raise ValueError("Base URL is required for LLMClient.")

        self.api_key = api_key
        self.base_url = base_url
        self.model = model
        self.client = openai.OpenAI(api_key=self.api_key, base_url=self.base_url)

    def generate(self, system_prompt: str, user_prompt: str, context: str) -> Dict[str, object]:
        """
        Generate a response from the LLM given a system prompt, user prompt, and context.

        Args:
            system_prompt (str): Instructions for the assistant.
            user_prompt (str): The user's query.
            context (str): Concatenated chunks from the RAG search.

        Returns:
            Dict[str, object]: Contains:
            - "answer" (str): Text generated by the LLM.
            - "usage" (int): Total tokens used in the completion.
        """
        # Construct the user message with explicit CONTEXT / QUESTION separation
        user_message_content = f"CONTEXT:\n{context}\n\nQUESTION:\n{user_prompt}"

        try:
            response = self.client.chat.completions.create(
                model=self.model,
                messages=[
                    {"role": "system", "content": system_prompt},
                    {"role": "user", "content": user_message_content},
                ],
                temperature=0.7,
                max_tokens=2000,
                top_p=1.0,
                n=1,
                # stream=False,
                # presence_penalty=0.0,
                # frequency_penalty=0.0,
                # stop=None,
                # logit_bias={},
                user="obsidian_rag",
            )
        except Exception:
            # For now, propagate exceptions (C1 minimal)
            raise

        # Extract the text and usage
        try:
            answer_text = response.choices[0].message.content
            total_tokens = response.usage.total_tokens
        except AttributeError:
            # Fallback if the response structure is unexpected
            answer_text = ""
            total_tokens = 0

        return {"answer": answer_text, "usage": total_tokens}
213 obsidian_rag/markdown_parser.py Normal file
@@ -0,0 +1,213 @@
"""Markdown parser for Obsidian vault files.
|
||||
|
||||
This module provides functionality to parse markdown files and extract
|
||||
their structure (sections, line numbers) for semantic search indexing.
|
||||
"""
|
||||
|
||||
import re
|
||||
from dataclasses import dataclass
|
||||
from pathlib import Path
|
||||
from typing import List, Optional
|
||||
|
||||
|
||||
@dataclass
|
||||
class MarkdownSection:
|
||||
"""Represents a section in a markdown document.
|
||||
|
||||
Attributes:
|
||||
level: Header level (0 for no header, 1 for #, 2 for ##, etc.)
|
||||
title: Section title (empty string if level=0)
|
||||
content: Text content without the header line
|
||||
start_line: Line number where section starts (1-indexed)
|
||||
end_line: Line number where section ends (1-indexed, inclusive)
|
||||
"""
|
||||
level: int
|
||||
title: str
|
||||
content: str
|
||||
parents: list[str]
|
||||
start_line: int
|
||||
end_line: int
|
||||
|
||||
|
||||
@dataclass
|
||||
class ParsedDocument:
|
||||
"""Represents a parsed markdown document.
|
||||
|
||||
Attributes:
|
||||
file_path: Path to the markdown file
|
||||
sections: List of sections extracted from the document
|
||||
raw_content: Full file content as string
|
||||
"""
|
||||
file_path: Path
|
||||
title: str
|
||||
sections: List[MarkdownSection]
|
||||
raw_content: str
|
||||
|
||||
|
||||
def _compute_parents(current_parents, previous_level, previous_title, current_level):
|
||||
"""Computes the parents of `current_parents`."""
|
||||
return current_parents
|
||||
|
||||
|
||||
def parse_markdown_file(file_path: Path, vault_path=None) -> ParsedDocument:
|
||||
"""Parse a markdown file and extract its structure.
|
||||
|
||||
This function reads a markdown file, identifies all header sections,
|
||||
and extracts their content with precise line number tracking.
|
||||
Files without headers are treated as a single section with level 0.
|
||||
|
||||
Args:
|
||||
file_path: Path to the markdown file to parse
|
||||
vault_path: Path to the vault file.
|
||||
|
||||
Returns:
|
||||
ParsedDocument containing the file structure and content
|
||||
|
||||
Raises:
|
||||
FileNotFoundError: If the file does not exist
|
||||
|
||||
Example:
|
||||
>>> doc = parse_markdown_file(Path("notes/example.md"))
|
||||
>>> print(f"Found {len(doc.sections)} sections")
|
||||
>>> print(doc.sections[0].title)
|
||||
"""
|
||||
if not file_path.exists():
|
||||
raise FileNotFoundError(f"File not found: {file_path}")
|
||||
|
||||
if vault_path:
|
||||
title = str(file_path.relative_to(vault_path)).replace(".md", "")
|
||||
title = title.replace("\\", " ").replace("/", " ")
|
||||
else:
|
||||
title = file_path.stem
|
||||
|
||||
raw_content = file_path.read_text(encoding="utf-8")
|
||||
lines = raw_content.splitlines()
|
||||
|
||||
sections: List[MarkdownSection] = []
|
||||
current_section_start = 1
|
||||
current_level = 0
|
||||
current_title = ""
|
||||
current_parents = []
|
||||
current_content_lines: List[str] = []
|
||||
|
||||
header_pattern = re.compile(r"^(#{1,6})\s+(.+)$")
|
||||
|
||||
for line_num, line in enumerate(lines, start=1):
|
||||
match = header_pattern.match(line)
|
||||
|
||||
if match:
|
||||
# Save the previous section only if it actually has content.
|
||||
if current_content_lines:
|
||||
content = "\n".join(current_content_lines)
|
||||
sections.append(
|
||||
MarkdownSection(
|
||||
level=current_level,
|
||||
title=current_title,
|
||||
content=content,
|
||||
parents=current_parents,
|
||||
start_line=current_section_start,
|
||||
end_line=line_num - 1,
|
||||
)
|
||||
)
|
||||
|
||||
# Start a new section with the detected header.
|
||||
previous_level = current_level
|
||||
previous_title = current_title
|
||||
current_level = len(match.group(1))
|
||||
current_title = match.group(2).strip()
|
||||
current_section_start = line_num
|
||||
current_parents = _compute_parents(current_parents, previous_level, previous_title, current_level)
|
||||
current_content_lines = []
|
||||
else:
|
||||
current_content_lines.append(line)
|
||||
|
||||
# Handle the final section (or whole file if no headers were found).
|
||||
if lines:
|
||||
content = "\n".join(current_content_lines)
|
||||
end_line = len(lines)
|
||||
|
||||
# Case 1 – no header was ever found.
|
||||
if not sections and current_level == 0:
|
||||
sections.append(
|
||||
MarkdownSection(
|
||||
level=0,
|
||||
title="",
|
||||
content=content,
|
||||
parents=current_parents,
|
||||
start_line=1,
|
||||
end_line=end_line,
|
||||
)
|
||||
)
|
||||
# Case 2 – a single header was found (sections empty but we have a title).
|
||||
elif not sections:
|
||||
sections.append(
|
||||
MarkdownSection(
|
||||
level=current_level,
|
||||
title=current_title,
|
||||
content=content,
|
||||
parents=current_parents,
|
||||
start_line=current_section_start,
|
||||
end_line=end_line,
|
||||
)
|
||||
)
|
||||
# Case 3 – multiple headers were found (sections already contains earlier ones).
|
||||
else:
|
||||
sections.append(
|
||||
MarkdownSection(
|
||||
level=current_level,
|
||||
title=current_title,
|
||||
content=content,
|
||||
parents=current_parents,
|
||||
start_line=current_section_start,
|
||||
end_line=end_line,
|
||||
)
|
||||
)
|
||||
else:
|
||||
# Empty file: create a single empty level‑0 section.
|
||||
sections.append(
|
||||
MarkdownSection(
|
||||
level=0,
|
||||
title="",
|
||||
content="",
|
||||
parents=[],
|
||||
start_line=1,
|
||||
end_line=1,
|
||||
)
|
||||
)
|
||||
|
||||
return ParsedDocument(
|
||||
file_path=file_path,
|
||||
title=title,
|
||||
sections=sections,
|
||||
raw_content=raw_content,
|
||||
)
|
||||
|
||||
|
||||
def find_section_at_line(
|
||||
document: ParsedDocument,
|
||||
line_number: int,
|
||||
) -> Optional[MarkdownSection]:
|
||||
"""Find which section contains a given line number.
|
||||
|
||||
This function searches through the document's sections to find
|
||||
which section contains the specified line number.
|
||||
|
||||
Args:
|
||||
document: Parsed markdown document
|
||||
line_number: Line number to search for (1-indexed)
|
||||
|
||||
Returns:
|
||||
MarkdownSection containing the line, or None if line number
|
||||
is invalid or out of range
|
||||
|
||||
Example:
|
||||
>>> section = find_section_at_line(doc, 42)
|
||||
>>> if section:
|
||||
... print(f"Line 42 is in section: {section.title}")
|
||||
"""
|
||||
if line_number < 1:
|
||||
return None
|
||||
|
||||
for section in document.sections:
|
||||
if section.start_line <= line_number <= section.end_line:
|
||||
return section
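
A short sketch of the parser used on its own; notes/example.md is a hypothetical file.

# Hypothetical walk over a parsed document's section outline.
from pathlib import Path

from markdown_parser import parse_markdown_file, find_section_at_line

doc = parse_markdown_file(Path("notes/example.md"))  # hypothetical file
for section in doc.sections:
    indent = "  " * max(section.level - 1, 0)
    print(f"{indent}{section.title or '(no header)'} [{section.start_line}-{section.end_line}]")

hit = find_section_at_line(doc, 42)
if hit:
    print(f"Line 42 falls in: {hit.title}")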
96 obsidian_rag/rag_chain.py Normal file
@@ -0,0 +1,96 @@
# File: obsidian_rag/rag_chain.py

from pathlib import Path
from typing import List, Optional, Tuple

from indexer import EMBEDDING_MODEL
from llm_client import LLMClient
from searcher import search_vault, SearchResult


class RAGChain:
    """
    Retrieval-Augmented Generation (RAG) chain for answering queries
    using semantic search over an Obsidian vault and an LLM.

    Attributes:
        chroma_db_path (Path): Path to ChromaDB.
        collection_name (str): Chroma collection name.
        embedding_model (str): Embedding model name.
        top_k (int): Number of chunks to send to the LLM.
        min_score (float): Minimum similarity score for chunks.
        system_prompt (str): System prompt to instruct the LLM.
        llm_client (LLMClient): Internal LLM client instance.
    """

    DEFAULT_SYSTEM_PROMPT = (
        "You are an assistant specialized in analyzing Obsidian notes.\n\n"
        "INSTRUCTIONS:\n"
        "- Answer based ONLY on the provided context\n"
        "- Cite the sources (files) you use\n"
        "- If the information is not in the context, say \"I did not find this information in your notes\"\n"
        "- Be concise but thorough\n"
        "- Structure your answer with sections if necessary"
    )

    def __init__(
        self,
        chroma_db_path: str,
        api_key: str,
        base_url: str,
        collection_name: str = "obsidian_vault",
        embedding_model: str = EMBEDDING_MODEL,
        top_k: int = 5,
        min_score: float = 0.0,
        system_prompt: Optional[str] = None,
    ) -> None:
        self.chroma_db_path = Path(chroma_db_path)
        self.collection_name = collection_name
        self.embedding_model = embedding_model
        self.top_k = top_k
        self.min_score = min_score
        self.system_prompt = system_prompt or self.DEFAULT_SYSTEM_PROMPT

        # Instantiate the internal LLM client
        self.llm_client = LLMClient(api_key=api_key, base_url=base_url)

    def answer_query(self, query: str) -> Tuple[str, List[SearchResult]]:
        """
        Answer a user query using RAG: search the vault, build context, call the LLM.

        Args:
            query (str): User query.

        Returns:
            Tuple[str, List[SearchResult]]:
            - LLM answer (str)
            - List of the SearchResult chunks that were used
        """
        # 1. Perform the semantic search
        chunks: List[SearchResult] = search_vault(
            query=query,
            chroma_db_path=str(self.chroma_db_path),
            collection_name=self.collection_name,
            embedding_model=self.embedding_model,
            limit=self.top_k,
            min_score=self.min_score,
        )

        # 2. Build the context string with citations
        context_parts: List[str] = []
        for chunk in chunks:
            chunk_text = chunk.text.strip()
            citation = f"[{chunk.file_path}#L{chunk.line_start}-L{chunk.line_end}]"
            context_parts.append(f"{chunk_text}\n{citation}")
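            # The citation looks like "[notes/todo.md#L3-L17]" (hypothetical path)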

        context_str = "\n\n".join(context_parts) if context_parts else ""

        # 3. Call the LLM with context + question
        llm_response = self.llm_client.generate(
            system_prompt=self.system_prompt,
            user_prompt=query,
            context=context_str,
        )

        answer_text = llm_response.get("answer", "")
        return answer_text, chunks
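
A minimal sketch wiring RAGChain end to end; the credentials and question are placeholders.

# Hypothetical end-to-end use of RAGChain; credentials come from the environment.
import os
from pathlib import Path

from rag_chain import RAGChain

rag = RAGChain(
    chroma_db_path=str(Path.home() / ".obsidian_rag" / "chroma_db"),
    api_key=os.environ["CLOVIS_API_KEY"],
    base_url=os.environ["CLOVIS_BASE_URL"],
    top_k=3,
)
answer, sources = rag.answer_query("What did I write about chunking?")  # hypothetical question
print(answer)
print("Sources:", [s.file_path for s in sources])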
131 obsidian_rag/searcher.py Normal file
@@ -0,0 +1,131 @@
"""
|
||||
Searcher module for Obsidian RAG Backend.
|
||||
|
||||
This module handles semantic search operations on the indexed ChromaDB collection.
|
||||
"""
|
||||
from dataclasses import dataclass
|
||||
from typing import List
|
||||
import chromadb
|
||||
from chromadb.config import Settings
|
||||
from sentence_transformers import SentenceTransformer
|
||||
|
||||
from indexer import EMBEDDING_MODEL
|
||||
|
||||
|
||||
@dataclass
|
||||
class SearchResult:
|
||||
"""
|
||||
Represents a single search result with metadata and relevance score.
|
||||
"""
|
||||
file_path: str
|
||||
section_title: str
|
||||
line_start: int
|
||||
line_end: int
|
||||
score: float
|
||||
text: str
|
||||
|
||||
|
||||
def search_vault(
|
||||
query: str,
|
||||
chroma_db_path: str,
|
||||
collection_name: str = "obsidian_vault",
|
||||
embedding_model: str = EMBEDDING_MODEL,
|
||||
limit: int = 5,
|
||||
min_score: float = 0.0,
|
||||
) -> List[SearchResult]:
|
||||
"""
|
||||
Search the indexed vault for semantically similar content.
|
||||
|
||||
Args:
|
||||
query: Search query string
|
||||
chroma_db_path: Path to ChromaDB data directory
|
||||
collection_name: Name of the ChromaDB collection to search
|
||||
embedding_model: Model used for embeddings (must match indexing model)
|
||||
limit: Maximum number of results to return
|
||||
min_score: Minimum similarity score threshold (0.0 to 1.0)
|
||||
|
||||
Returns:
|
||||
List of SearchResult objects, sorted by relevance (highest score first)
|
||||
|
||||
Raises:
|
||||
ValueError: If the collection does not exist or query is empty
|
||||
"""
|
||||
if not query or not query.strip():
|
||||
raise ValueError("Query cannot be empty")
|
||||
|
||||
# Initialize ChromaDB client
|
||||
chroma_client = chromadb.PersistentClient(
|
||||
path=chroma_db_path,
|
||||
settings=Settings(anonymized_telemetry=False)
|
||||
)
|
||||
|
||||
# Get collection (will raise if it doesn't exist)
|
||||
try:
|
||||
collection = chroma_client.get_collection(name=collection_name)
|
||||
except Exception as e:
|
||||
raise ValueError(
|
||||
f"Collection '{collection_name}' not found. "
|
||||
f"Please index your vault first using the index command."
|
||||
) from e
|
||||
|
||||
# Initialize embedding model (same as used during indexing)
|
||||
model = SentenceTransformer(embedding_model)
|
||||
|
||||
# Generate query embedding
|
||||
query_embedding = model.encode(query, show_progress_bar=False)
|
||||
|
||||
# Perform search
|
||||
results = collection.query(
|
||||
query_embeddings=[query_embedding.tolist()],
|
||||
n_results=limit,
|
||||
)
|
||||
|
||||
# Parse and format results
|
||||
search_results = _parse_search_results(results, min_score)
|
||||
|
||||
return search_results
|
||||
|
||||
|
||||
def _parse_search_results(
|
||||
raw_results: dict,
|
||||
min_score: float,
|
||||
) -> List[SearchResult]:
|
||||
"""
|
||||
Parse ChromaDB query results into SearchResult objects.
|
||||
|
||||
ChromaDB returns distances (lower = more similar). We convert to
|
||||
similarity scores (higher = more similar) using: score = 1 - distance
|
||||
|
||||
Args:
|
||||
raw_results: Raw results dictionary from ChromaDB query
|
||||
min_score: Minimum similarity score to include
|
||||
|
||||
Returns:
|
||||
List of SearchResult objects filtered by min_score
|
||||
"""
|
||||
search_results = []
|
||||
|
||||
# ChromaDB returns results as lists of lists (one list per query)
|
||||
# We only have one query, so we take the first element
|
||||
documents = raw_results.get("documents", [[]])[0]
|
||||
metadatas = raw_results.get("metadatas", [[]])[0]
|
||||
distances = raw_results.get("distances", [[]])[0]
|
||||
|
||||
for doc, metadata, distance in zip(documents, metadatas, distances):
|
||||
# Convert distance to similarity score (cosine distance -> cosine similarity)
|
||||
score = 1.0 - distance
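        # e.g. cosine distance 0.25 becomes similarity 0.75; with min_score=0.5
        # that hit is kept, while distance 0.8 (similarity 0.2) is filtered out below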

        # Filter by minimum score
        if score < min_score:
            continue

        search_results.append(SearchResult(
            file_path=metadata["file_path"],
            section_title=metadata["section_title"],
            line_start=metadata["line_start"],
            line_end=metadata["line_end"],
            score=score,
            text=doc,
        ))

    return search_results
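
A short sketch of calling search_vault directly, assuming the default storage path used by the CLI.

# Hypothetical direct search against an index created with the CLI defaults.
from pathlib import Path

from searcher import search_vault

results = search_vault(
    query="meeting notes about the roadmap",  # hypothetical query
    chroma_db_path=str(Path.home() / ".obsidian_rag" / "chroma_db"),
    limit=3,
    min_score=0.3,
)
for r in results:
    print(f"{r.score:.2f}  {r.file_path}#L{r.line_start}-L{r.line_end}  {r.section_title}")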