""" Indexer module for Obsidian RAG Backend. This module handles the indexing of markdown files into a ChromaDB vector store using local embeddings from sentence-transformers. """ from dataclasses import dataclass, asdict from pathlib import Path from typing import Dict, List, Optional, Callable import chromadb from chromadb.config import Settings from sentence_transformers import SentenceTransformer from markdown_parser import ParsedDocument from markdown_parser import parse_markdown_file # EMBEDDING_MODEL = "all-MiniLM-L6-v2" EMBEDDING_MODEL = "all-MiniLM-L12-v2" @dataclass class ChunkMetadata: file_path: str section_title: str line_start: int line_end: int @dataclass class Chunk: id: str text: str metadata: ChunkMetadata def index_vault( vault_path: str, chroma_db_path: str, collection_name: str = "obsidian_vault", embedding_model: str = EMBEDDING_MODEL, max_chunk_tokens: int = 200, overlap_tokens: int = 30, progress_callback: Optional[Callable[[str, int, int], None]] = None, ) -> Dict: """ Index all markdown files from vault into ChromaDB. Args: vault_path: Path to the Obsidian vault directory chroma_db_path: Path where ChromaDB will store its data collection_name: Name of the ChromaDB collection embedding_model: Name of the sentence-transformers model to use max_chunk_tokens: Maximum tokens per chunk overlap_tokens: Number of overlapping tokens between chunks progress_callback: Optional callback function called for each file processed. Signature: callback(current_file: str, files_processed: int, total_files: int) Returns: Dictionary with indexing statistics: - files_processed: Number of files successfully processed - chunks_created: Total number of chunks created - errors: List of errors encountered (file path and error message) - collection_name: Name of the collection used """ vault_path_obj = Path(vault_path) if not vault_path_obj.exists(): raise ValueError(f"Vault path does not exist: {vault_path}") # Initialize embedding model and tokenizer model = SentenceTransformer(embedding_model) tokenizer = model.tokenizer # Initialize ChromaDB client and collection chroma_client = chromadb.PersistentClient( path=chroma_db_path, settings=Settings(anonymized_telemetry=False) ) collection = _get_or_create_collection(chroma_client, collection_name) # Find all markdown files md_files = list(vault_path_obj.rglob("*.md")) total_files = len(md_files) # Statistics tracking stats = { "files_processed": 0, "chunks_created": 0, "errors": [], "collection_name": collection_name, } # Process each file for md_file in md_files: # Get relative path for display relative_path = md_file.relative_to(vault_path_obj) # Notify callback that we're starting this file if progress_callback: progress_callback(str(relative_path), stats["files_processed"], total_files) try: # Parse markdown file parsed_doc = parse_markdown_file(md_file) # Create chunks from document chunks = _create_chunks_from_document( parsed_doc, tokenizer, max_chunk_tokens, overlap_tokens, vault_path_obj, ) if chunks: # Extract data for ChromaDB documents = [chunk.text for chunk in chunks] metadatas = [asdict(chunk.metadata) for chunk in chunks] ids = [chunk.id for chunk in chunks] # Generate embeddings and add to collection embeddings = model.encode(documents, show_progress_bar=False) collection.add( documents=documents, metadatas=metadatas, ids=ids, embeddings=embeddings.tolist(), ) stats["chunks_created"] += len(chunks) stats["files_processed"] += 1 except Exception as e: # Log error but continue processing stats["errors"].append({ "file": str(relative_path), "error": str(e), }) return stats def _get_or_create_collection( chroma_client: chromadb.PersistentClient, collection_name: str, ) -> chromadb.Collection: """ Get or create a ChromaDB collection, resetting it if it already exists. Args: chroma_client: ChromaDB client instance collection_name: Name of the collection Returns: ChromaDB collection instance """ try: # Try to delete existing collection chroma_client.delete_collection(name=collection_name) except Exception: # Collection doesn't exist, that's fine pass # Create fresh collection collection = chroma_client.create_collection( name=collection_name, metadata={"hnsw:space": "cosine"} # Use cosine similarity ) return collection def _create_chunks_from_document( parsed_doc: ParsedDocument, tokenizer, max_chunk_tokens: int, overlap_tokens: int, vault_path: Path, ) -> List[Chunk]: """ Transform a parsed document into chunks with metadata. Implements hybrid chunking strategy: - Short sections (≤max_chunk_tokens): one chunk per section - Long sections (>max_chunk_tokens): split with sliding window Args: parsed_doc: Parsed document from markdown_parser tokenizer: Tokenizer from sentence-transformers model max_chunk_tokens: Maximum tokens per chunk overlap_tokens: Number of overlapping tokens between chunks vault_path: Path to vault root (for relative path calculation) Returns: List of chunk dictionaries with 'text', 'metadata', and 'id' keys """ chunks = [] file_path = parsed_doc.file_path relative_path = file_path.relative_to(vault_path) for section in parsed_doc.sections: section_text = f"{parsed_doc.title} {section.title} {section.content}" section_title = section.title line_start = section.start_line line_end = section.end_line # Tokenize section to check length tokens = tokenizer.encode(section_text, add_special_tokens=False) if len(tokens) <= max_chunk_tokens: # Short section: create single chunk chunk_id = f"{relative_path}::{section_title}::{line_start}-{line_end}" chunks.append(Chunk(chunk_id, section_text, ChunkMetadata(str(relative_path), section_title, line_start, line_start ))) else: # Long section: split with sliding window sub_chunks = _chunk_section( section_text, tokenizer, max_chunk_tokens, overlap_tokens, ) # Create chunk for each sub-chunk for idx, sub_chunk_text in enumerate(sub_chunks): chunk_id = f"{relative_path}::{section_title}::{line_start}-{line_end}::chunk{idx}" chunks.append(Chunk(chunk_id, sub_chunk_text, ChunkMetadata(str(relative_path), section_title, line_start, line_start ))) return chunks def _chunk_section( section_text: str, tokenizer, max_chunk_tokens: int, overlap_tokens: int, ) -> List[str]: """ Split a section into overlapping chunks using sliding window. Args: section_text: Text content to chunk tokenizer: Tokenizer from sentence-transformers model max_chunk_tokens: Maximum tokens per chunk overlap_tokens: Number of overlapping tokens between chunks Returns: List of text chunks """ # Apply safety margin to prevent decode/encode inconsistencies # from exceeding the max token limit max_chunk_tokens_to_use = int(max_chunk_tokens * 0.98) # Tokenize the full text tokens = tokenizer.encode(section_text, add_special_tokens=False) chunks = [] start_idx = 0 while start_idx < len(tokens): # Extract chunk tokens end_idx = start_idx + max_chunk_tokens_to_use chunk_tokens = tokens[start_idx:end_idx] # Decode back to text chunk_text = tokenizer.decode(chunk_tokens, skip_special_tokens=True) chunks.append(chunk_text) # Move window forward (with overlap) start_idx += max_chunk_tokens_to_use - overlap_tokens # Avoid infinite loop if overlap >= max_chunk_tokens if start_idx <= start_idx - (max_chunk_tokens_to_use - overlap_tokens): break return chunks