Initial commit

obsidian_rag/indexer.py (Normal file, 284 lines added)
@@ -0,0 +1,284 @@
"""
Indexer module for Obsidian RAG Backend.

This module handles the indexing of markdown files into a ChromaDB vector store
using local embeddings from sentence-transformers.
"""

from dataclasses import dataclass, asdict
from pathlib import Path
from typing import Dict, List, Optional, Callable

import chromadb
from chromadb.config import Settings
from sentence_transformers import SentenceTransformer

from markdown_parser import ParsedDocument, parse_markdown_file

# EMBEDDING_MODEL = "all-MiniLM-L6-v2"
EMBEDDING_MODEL = "all-MiniLM-L12-v2"

@dataclass
class ChunkMetadata:
    file_path: str
    section_title: str
    line_start: int
    line_end: int


@dataclass
class Chunk:
    id: str
    text: str
    metadata: ChunkMetadata

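# Example (hypothetical values, for illustration only): the metadata stored for a
# chunk taken from the "Tasks" section of a note "daily/2024-01-01.md" spanning
# lines 3-17 would look roughly like:
#   ChunkMetadata(file_path="daily/2024-01-01.md", section_title="Tasks",
#                 line_start=3, line_end=17)
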
def index_vault(
    vault_path: str,
    chroma_db_path: str,
    collection_name: str = "obsidian_vault",
    embedding_model: str = EMBEDDING_MODEL,
    max_chunk_tokens: int = 200,
    overlap_tokens: int = 30,
    progress_callback: Optional[Callable[[str, int, int], None]] = None,
) -> Dict:
    """
    Index all markdown files from a vault into ChromaDB.

    Args:
        vault_path: Path to the Obsidian vault directory
        chroma_db_path: Path where ChromaDB will store its data
        collection_name: Name of the ChromaDB collection
        embedding_model: Name of the sentence-transformers model to use
        max_chunk_tokens: Maximum tokens per chunk
        overlap_tokens: Number of overlapping tokens between chunks
        progress_callback: Optional callback invoked for each file processed.
            Signature: callback(current_file: str, files_processed: int, total_files: int)

    Returns:
        Dictionary with indexing statistics:
        - files_processed: Number of files successfully processed
        - chunks_created: Total number of chunks created
        - errors: List of errors encountered (file path and error message)
        - collection_name: Name of the collection used
    """
    vault_path_obj = Path(vault_path)
    if not vault_path_obj.exists():
        raise ValueError(f"Vault path does not exist: {vault_path}")

    # Initialize embedding model and tokenizer
    model = SentenceTransformer(embedding_model)
    tokenizer = model.tokenizer

    # Initialize ChromaDB client and collection
    chroma_client = chromadb.PersistentClient(
        path=chroma_db_path,
        settings=Settings(anonymized_telemetry=False),
    )
    collection = _get_or_create_collection(chroma_client, collection_name)

    # Find all markdown files
    md_files = list(vault_path_obj.rglob("*.md"))
    total_files = len(md_files)

    # Statistics tracking
    stats = {
        "files_processed": 0,
        "chunks_created": 0,
        "errors": [],
        "collection_name": collection_name,
    }

    # Process each file
    for md_file in md_files:
        # Get relative path for display
        relative_path = md_file.relative_to(vault_path_obj)

        # Notify callback that we're starting this file
        if progress_callback:
            progress_callback(str(relative_path), stats["files_processed"], total_files)

        try:
            # Parse markdown file
            parsed_doc = parse_markdown_file(md_file)

            # Create chunks from document
            chunks = _create_chunks_from_document(
                parsed_doc,
                tokenizer,
                max_chunk_tokens,
                overlap_tokens,
                vault_path_obj,
            )

            if chunks:
                # Extract data for ChromaDB
                documents = [chunk.text for chunk in chunks]
                metadatas = [asdict(chunk.metadata) for chunk in chunks]
                ids = [chunk.id for chunk in chunks]

                # Generate embeddings and add to collection
                embeddings = model.encode(documents, show_progress_bar=False)
                collection.add(
                    documents=documents,
                    metadatas=metadatas,
                    ids=ids,
                    embeddings=embeddings.tolist(),
                )

                stats["chunks_created"] += len(chunks)

            stats["files_processed"] += 1

        except Exception as e:
            # Log error but continue processing
            stats["errors"].append({
                "file": str(relative_path),
                "error": str(e),
            })

    return stats

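# Example (hypothetical counts, for illustration only): a successful run over a
# small vault might return
#   {"files_processed": 12, "chunks_created": 87, "errors": [],
#    "collection_name": "obsidian_vault"}
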
def _get_or_create_collection(
    chroma_client: chromadb.PersistentClient,
    collection_name: str,
) -> chromadb.Collection:
    """
    Get or create a ChromaDB collection, resetting it if it already exists.

    Args:
        chroma_client: ChromaDB client instance
        collection_name: Name of the collection

    Returns:
        ChromaDB collection instance
    """
    try:
        # Try to delete existing collection
        chroma_client.delete_collection(name=collection_name)
    except Exception:
        # Collection doesn't exist, that's fine
        pass

    # Create fresh collection
    collection = chroma_client.create_collection(
        name=collection_name,
        metadata={"hnsw:space": "cosine"},  # Use cosine similarity
    )

    return collection

def _create_chunks_from_document(
    parsed_doc: ParsedDocument,
    tokenizer,
    max_chunk_tokens: int,
    overlap_tokens: int,
    vault_path: Path,
) -> List[Chunk]:
    """
    Transform a parsed document into chunks with metadata.

    Implements a hybrid chunking strategy:
    - Short sections (≤max_chunk_tokens): one chunk per section
    - Long sections (>max_chunk_tokens): split with a sliding window

    Args:
        parsed_doc: Parsed document from markdown_parser
        tokenizer: Tokenizer from the sentence-transformers model
        max_chunk_tokens: Maximum tokens per chunk
        overlap_tokens: Number of overlapping tokens between chunks
        vault_path: Path to the vault root (for relative path calculation)

    Returns:
        List of Chunk objects, each carrying an id, text, and ChunkMetadata
    """
    chunks = []
    file_path = parsed_doc.file_path
    relative_path = file_path.relative_to(vault_path)

    for section in parsed_doc.sections:
        section_text = f"{parsed_doc.title} {section.title} {section.content}"
        section_title = section.title
        line_start = section.start_line
        line_end = section.end_line

        # Tokenize section to check length
        tokens = tokenizer.encode(section_text, add_special_tokens=False)

        if len(tokens) <= max_chunk_tokens:
            # Short section: create a single chunk
            chunk_id = f"{relative_path}::{section_title}::{line_start}-{line_end}"
            chunks.append(
                Chunk(
                    chunk_id,
                    section_text,
                    ChunkMetadata(str(relative_path), section_title, line_start, line_end),
                )
            )
        else:
            # Long section: split with a sliding window
            sub_chunks = _chunk_section(
                section_text,
                tokenizer,
                max_chunk_tokens,
                overlap_tokens,
            )

            # Create a chunk for each sub-chunk
            for idx, sub_chunk_text in enumerate(sub_chunks):
                chunk_id = f"{relative_path}::{section_title}::{line_start}-{line_end}::chunk{idx}"
                chunks.append(
                    Chunk(
                        chunk_id,
                        sub_chunk_text,
                        ChunkMetadata(str(relative_path), section_title, line_start, line_end),
                    )
                )

    return chunks

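# Example chunk ids (hypothetical note, for illustration only): for a note
# "projects/rag.md" with a section "Setup" spanning lines 10-42, a short section
# yields "projects/rag.md::Setup::10-42", while a long section is split into
# "projects/rag.md::Setup::10-42::chunk0", "...::chunk1", and so on.
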
def _chunk_section(
    section_text: str,
    tokenizer,
    max_chunk_tokens: int,
    overlap_tokens: int,
) -> List[str]:
    """
    Split a section into overlapping chunks using a sliding window.

    Args:
        section_text: Text content to chunk
        tokenizer: Tokenizer from the sentence-transformers model
        max_chunk_tokens: Maximum tokens per chunk
        overlap_tokens: Number of overlapping tokens between chunks

    Returns:
        List of text chunks
    """
    # Apply a safety margin so decode/encode inconsistencies don't push a chunk
    # over the maximum token limit
    max_chunk_tokens_to_use = int(max_chunk_tokens * 0.98)

    # Tokenize the full text
    tokens = tokenizer.encode(section_text, add_special_tokens=False)

    chunks = []
    start_idx = 0

    while start_idx < len(tokens):
        # Extract the tokens for this window
        end_idx = start_idx + max_chunk_tokens_to_use
        chunk_tokens = tokens[start_idx:end_idx]

        # Decode back to text
        chunk_text = tokenizer.decode(chunk_tokens, skip_special_tokens=True)
        chunks.append(chunk_text)

        # Move the window forward (with overlap)
        start_idx += max_chunk_tokens_to_use - overlap_tokens

        # Avoid an infinite loop if overlap_tokens >= max_chunk_tokens_to_use
        if max_chunk_tokens_to_use - overlap_tokens <= 0:
            break

    return chunks

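# Worked example (with the default parameters, for illustration only): for
# max_chunk_tokens=200 and overlap_tokens=30, the effective window is
# int(200 * 0.98) = 196 tokens and the stride is 196 - 30 = 166 tokens, so the
# windows cover token ranges [0:196], [166:362], [332:528], ... until the
# section is exhausted.
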
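
# Usage sketch: how index_vault might be invoked directly. The vault and
# database paths below are placeholders; point them at a real vault before running.
if __name__ == "__main__":
    def _print_progress(current_file: str, files_processed: int, total_files: int) -> None:
        # Matches the callback signature documented in index_vault
        print(f"[{files_processed}/{total_files}] {current_file}")

    result = index_vault(
        vault_path="/path/to/your/vault",      # placeholder path
        chroma_db_path="/path/to/chroma_db",   # placeholder path
        progress_callback=_print_progress,
    )
    print(result)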