Initial commit
This commit is contained in:
0
tests/__init__.py
Normal file
0
tests/__init__.py
Normal file
537
tests/test_cli.py
Normal file
537
tests/test_cli.py
Normal file
@@ -0,0 +1,537 @@
|
||||
"""
|
||||
Unit tests for the CLI module.
|
||||
"""
|
||||
import pytest
|
||||
from pathlib import Path
|
||||
from typer.testing import CliRunner
|
||||
from obsidian_rag.cli import app, _display_index_results, _display_results_compact
|
||||
from obsidian_rag.indexer import index_vault
|
||||
from obsidian_rag.searcher import SearchResult
|
||||
|
||||
runner = CliRunner()
|
||||
|
||||
|
||||
@pytest.fixture
def temp_vault(tmp_path):
    """Build a throwaway vault directory populated with three sample notes."""
    vault_dir = tmp_path / "test_vault"
    vault_dir.mkdir()

    # Filename -> markdown body for each sample note.
    samples = {
        "python.md": """# Python Programming

Python is a high-level programming language.

## Features

Python has dynamic typing and automatic memory management.
""",
        "javascript.md": """# JavaScript

JavaScript is a scripting language for web development.

## Usage

JavaScript runs in web browsers and Node.js environments.
""",
        "cooking.md": """# Cooking Tips

Learn how to cook delicious meals.

## Basics

Start with simple recipes and basic techniques.
""",
    }
    for name, body in samples.items():
        (vault_dir / name).write_text(body)

    return vault_dir
|
||||
|
||||
|
||||
# Tests for 'index' command - Passing tests
|
||||
|
||||
|
||||
def test_i_can_index_vault_successfully(temp_vault, tmp_path):
    """Indexing a populated vault succeeds and reports its statistics."""
    db_dir = tmp_path / "chroma_db"

    result = runner.invoke(
        app,
        ["index", str(temp_vault), "--chroma-path", str(db_dir)],
    )

    assert result.exit_code == 0
    # The summary must mention the file count and the produced statistics.
    for expected in (
        "Found 3 markdown files to index",
        "Indexing completed",
        "Files processed:",
        "Chunks created:",
    ):
        assert expected in result.stdout
|
||||
|
||||
|
||||
def test_i_can_index_with_custom_chroma_path(temp_vault, tmp_path):
    """A user-supplied ChromaDB directory is created and populated."""
    db_dir = tmp_path / "my_custom_db"

    result = runner.invoke(
        app,
        ["index", str(temp_vault), "--chroma-path", str(db_dir)],
    )

    assert result.exit_code == 0
    # The database directory and its SQLite file must exist afterwards.
    assert db_dir.exists()
    assert (db_dir / "chroma.sqlite3").exists()
|
||||
|
||||
|
||||
def test_i_can_index_with_custom_collection_name(temp_vault, tmp_path):
    """Indexing into a named collection echoes that name in the output."""
    db_dir = tmp_path / "chroma_db"
    collection_name = "my_custom_collection"

    result = runner.invoke(
        app,
        [
            "index",
            str(temp_vault),
            "--chroma-path", str(db_dir),
            "--collection", collection_name,
        ],
    )

    assert result.exit_code == 0
    assert f"Collection: {collection_name}" in result.stdout
|
||||
|
||||
|
||||
def test_i_can_see_errors_in_index_results(tmp_path):
    """Indexing keeps going when one file cannot be parsed."""
    vault_dir = tmp_path / "vault_with_errors"
    vault_dir.mkdir()

    # One parseable note...
    (vault_dir / "valid.md").write_text("# Valid File\n\nThis is valid content.")
    # ...and one that is not valid UTF-8, which should trip the parser.
    (vault_dir / "invalid.md").write_bytes(b"\xff\xfe\x00\x00")

    db_dir = tmp_path / "chroma_db"
    result = runner.invoke(
        app,
        ["index", str(vault_dir), "--chroma-path", str(db_dir)],
    )

    # The command still exits cleanly; exact error rendering is not pinned
    # here because its format may vary.
    assert result.exit_code == 0
    assert "Indexing completed" in result.stdout
|
||||
|
||||
|
||||
# Tests for 'index' command - Error tests
|
||||
|
||||
|
||||
def test_i_cannot_index_nonexistent_vault(tmp_path):
    """Pointing `index` at a missing directory fails with a clear message."""
    missing_vault = tmp_path / "does_not_exist"
    db_dir = tmp_path / "chroma_db"

    result = runner.invoke(
        app,
        ["index", str(missing_vault), "--chroma-path", str(db_dir)],
    )

    assert result.exit_code == 1
    assert "does not exist" in result.stdout
|
||||
|
||||
|
||||
def test_i_cannot_index_file_instead_of_directory(tmp_path):
    """Passing a regular file as the vault path is rejected."""
    not_a_dir = tmp_path / "somefile.txt"
    not_a_dir.write_text("I am a file")
    db_dir = tmp_path / "chroma_db"

    result = runner.invoke(
        app,
        ["index", str(not_a_dir), "--chroma-path", str(db_dir)],
    )

    assert result.exit_code == 1
    assert "not a directory" in result.stdout
|
||||
|
||||
|
||||
def test_i_can_handle_empty_vault_gracefully(tmp_path):
    """A vault with no markdown files indexes cleanly with a notice."""
    bare_vault = tmp_path / "empty_vault"
    bare_vault.mkdir()
    # A non-markdown file must be ignored by the indexer.
    (bare_vault / "readme.txt").write_text("Not a markdown file")

    db_dir = tmp_path / "chroma_db"
    result = runner.invoke(
        app,
        ["index", str(bare_vault), "--chroma-path", str(db_dir)],
    )

    assert result.exit_code == 0
    assert "No markdown files found" in result.stdout
|
||||
|
||||
|
||||
# Tests for 'search' command - Passing tests
|
||||
|
||||
|
||||
def test_i_can_search_indexed_vault(temp_vault, tmp_path):
    """A query against an indexed vault returns the matching note."""
    db_dir = tmp_path / "chroma_db"

    # Indexing must succeed before searching makes sense.
    indexed = runner.invoke(
        app,
        ["index", str(temp_vault), "--chroma-path", str(db_dir)],
    )
    assert indexed.exit_code == 0

    searched = runner.invoke(
        app,
        ["search", "Python programming", "--chroma-path", str(db_dir)],
    )

    assert searched.exit_code == 0
    assert "Found" in searched.stdout
    assert "result(s) for:" in searched.stdout
    # The Python note should be among the hits for this query.
    assert "python.md" in searched.stdout
|
||||
|
||||
|
||||
def test_i_can_search_with_limit_option(temp_vault, tmp_path):
    """The --limit option caps the number of results shown."""
    db_dir = tmp_path / "chroma_db"

    runner.invoke(
        app,
        ["index", str(temp_vault), "--chroma-path", str(db_dir)],
    )

    result = runner.invoke(
        app,
        ["search", "programming", "--chroma-path", str(db_dir), "--limit", "2"],
    )

    assert result.exit_code == 0
    # NOTE(review): this counts literal "[bold cyan]" markup in captured
    # stdout; if rich renders (strips) the markup, the count is always 0 and
    # the bound below is vacuous — confirm against real CLI output.
    displayed = result.stdout.count("[bold cyan]")
    assert displayed <= 2
|
||||
|
||||
|
||||
def test_i_can_search_with_min_score_option(temp_vault, tmp_path):
    """Searching with a --min-score threshold still returns strong matches."""
    db_dir = tmp_path / "chroma_db"

    runner.invoke(
        app,
        ["index", str(temp_vault), "--chroma-path", str(db_dir)],
    )

    result = runner.invoke(
        app,
        ["search", "Python", "--chroma-path", str(db_dir), "--min-score", "0.5"],
    )

    assert result.exit_code == 0
    # The Python note is expected to clear a 0.5 similarity threshold.
    assert "Found" in result.stdout
|
||||
|
||||
|
||||
def test_i_can_search_with_custom_collection(temp_vault, tmp_path):
    """Index and search agree when both target the same named collection."""
    db_dir = tmp_path / "chroma_db"
    collection_name = "test_collection"

    runner.invoke(
        app,
        [
            "index",
            str(temp_vault),
            "--chroma-path", str(db_dir),
            "--collection", collection_name,
        ],
    )

    result = runner.invoke(
        app,
        [
            "search",
            "Python",
            "--chroma-path", str(db_dir),
            "--collection", collection_name,
        ],
    )

    assert result.exit_code == 0
    assert "Found" in result.stdout
|
||||
|
||||
|
||||
def test_i_can_handle_no_results_gracefully(temp_vault, tmp_path):
    """An off-topic query under a strict threshold reports no results."""
    db_dir = tmp_path / "chroma_db"

    runner.invoke(
        app,
        ["index", str(temp_vault), "--chroma-path", str(db_dir)],
    )

    # A query unrelated to the sample notes, with a near-impossible cutoff.
    result = runner.invoke(
        app,
        [
            "search",
            "quantum physics relativity",
            "--chroma-path", str(db_dir),
            "--min-score", "0.95",
        ],
    )

    assert result.exit_code == 0
    assert "No results found" in result.stdout
|
||||
|
||||
|
||||
def test_i_can_use_compact_format(temp_vault, tmp_path):
    """Explicitly requesting the compact format renders its fields."""
    db_dir = tmp_path / "chroma_db"

    runner.invoke(
        app,
        ["index", str(temp_vault), "--chroma-path", str(db_dir)],
    )

    result = runner.invoke(
        app,
        [
            "search",
            "Python",
            "--chroma-path", str(db_dir),
            "--format", "compact",
        ],
    )

    assert result.exit_code == 0
    # Compact output carries section, line range, and score labels.
    for label in ("Section:", "Lines:", "score:"):
        assert label in result.stdout
|
||||
|
||||
|
||||
# Tests for 'search' command - Error tests
|
||||
|
||||
|
||||
def test_i_cannot_search_without_index(tmp_path):
    """Searching before any index exists fails and hints at `index`."""
    db_dir = tmp_path / "nonexistent_chroma"

    result = runner.invoke(
        app,
        ["search", "test query", "--chroma-path", str(db_dir)],
    )

    assert result.exit_code == 1
    assert "not found" in result.stdout
    # The error message should point the user at the index command.
    assert "index" in result.stdout.lower()
|
||||
|
||||
|
||||
def test_i_cannot_search_nonexistent_collection(temp_vault, tmp_path):
    """Searching a collection that was never created fails cleanly."""
    db_dir = tmp_path / "chroma_db"

    # Index under the default collection only.
    runner.invoke(
        app,
        ["index", str(temp_vault), "--chroma-path", str(db_dir)],
    )

    result = runner.invoke(
        app,
        [
            "search",
            "Python",
            "--chroma-path", str(db_dir),
            "--collection", "nonexistent_collection",
        ],
    )

    assert result.exit_code == 1
    assert "not found" in result.stdout
|
||||
|
||||
|
||||
def test_i_cannot_use_invalid_format(temp_vault, tmp_path):
    """An unknown --format value is rejected and the valid one is listed."""
    db_dir = tmp_path / "chroma_db"

    runner.invoke(
        app,
        ["index", str(temp_vault), "--chroma-path", str(db_dir)],
    )

    result = runner.invoke(
        app,
        [
            "search",
            "Python",
            "--chroma-path", str(db_dir),
            "--format", "invalid_format",
        ],
    )

    assert result.exit_code == 1
    assert "Invalid format" in result.stdout
    # The error should advertise the supported format name.
    assert "compact" in result.stdout
|
||||
|
||||
|
||||
# Tests for helper functions
|
||||
|
||||
|
||||
def test_i_can_display_index_results(capsys):
    """A clean stats dict renders the completion summary with its numbers."""
    stats = {
        "files_processed": 10,
        "chunks_created": 50,
        "collection_name": "test_collection",
        "errors": [],
    }

    _display_index_results(stats)

    output = capsys.readouterr().out
    assert "Indexing completed" in output
    for fragment in ("10", "50", "test_collection"):
        assert fragment in output
|
||||
|
||||
|
||||
def test_i_can_display_index_results_with_errors(capsys):
    """Skipped files and their error messages appear in the summary."""
    stats = {
        "files_processed": 8,
        "chunks_created": 40,
        "collection_name": "test_collection",
        "errors": [
            {"file": "broken.md", "error": "Invalid encoding"},
            {"file": "corrupt.md", "error": "Parse error"},
        ],
    }

    _display_index_results(stats)

    output = capsys.readouterr().out
    assert "Indexing completed" in output
    # Both the skip count and per-file error details must be shown.
    assert "2 file(s) skipped" in output
    assert "broken.md" in output
    assert "Invalid encoding" in output
|
||||
|
||||
|
||||
def test_i_can_display_results_compact(capsys):
    """Compact rendering shows file, section title, and score per hit."""
    results = [
        SearchResult(
            file_path="notes/python.md",
            section_title="Introduction",
            line_start=1,
            line_end=5,
            score=0.87,
            text="Python is a high-level programming language.",
        ),
        SearchResult(
            file_path="notes/javascript.md",
            section_title="Overview",
            line_start=10,
            line_end=15,
            score=0.65,
            text="JavaScript is used for web development.",
        ),
    ]

    _display_results_compact(results)

    output = capsys.readouterr().out
    for fragment in (
        "python.md",
        "javascript.md",
        "0.87",
        "0.65",
        "Introduction",
        "Overview",
    ):
        assert fragment in output
|
||||
|
||||
|
||||
def test_i_can_display_results_compact_with_long_text(capsys):
    """Long snippet text is truncated in the compact display.

    Fix: the final check built a throwaway list just to compare its length
    with 0; replaced with a direct ``all(... not in ...)`` assertion.
    """
    long_text = "A" * 300  # well past the 200-character display budget

    results = [
        SearchResult(
            file_path="notes/long.md",
            section_title="Long Section",
            line_start=1,
            line_end=10,
            score=0.75,
            text=long_text,
        ),
    ]

    _display_results_compact(results)

    captured = capsys.readouterr()
    # The truncation marker must appear...
    assert "..." in captured.out
    # ...and no single output line may carry a 200-character run of the
    # original text, i.e. the full snippet is never shown verbatim.
    assert all("A" * 200 not in line for line in captured.out.split('\n'))
|
||||
381
tests/test_indexer.py
Normal file
381
tests/test_indexer.py
Normal file
@@ -0,0 +1,381 @@
|
||||
"""
|
||||
Unit tests for the indexer module.
|
||||
"""
|
||||
|
||||
import chromadb
|
||||
import pytest
|
||||
from chromadb.config import Settings
|
||||
from sentence_transformers import SentenceTransformer
|
||||
|
||||
from indexer import (
|
||||
index_vault,
|
||||
_chunk_section,
|
||||
_create_chunks_from_document,
|
||||
_get_or_create_collection, EMBEDDING_MODEL,
|
||||
)
|
||||
from obsidian_rag.markdown_parser import ParsedDocument, MarkdownSection
|
||||
|
||||
|
||||
# Fixtures
|
||||
|
||||
@pytest.fixture
def tokenizer():
    """Tokenizer of the embedding model the indexer is configured with."""
    return SentenceTransformer(EMBEDDING_MODEL).tokenizer
|
||||
|
||||
|
||||
@pytest.fixture
def embedding_model():
    """Sentence-transformers model matching the indexer's configuration."""
    return SentenceTransformer(EMBEDDING_MODEL)
|
||||
|
||||
|
||||
@pytest.fixture
def chroma_client(tmp_path):
    """ChromaDB client persisted under a throwaway directory."""
    return chromadb.PersistentClient(
        path=str(tmp_path / "chroma_test"),
        settings=Settings(anonymized_telemetry=False),
    )
|
||||
|
||||
|
||||
@pytest.fixture
def test_vault(tmp_path):
    """Empty vault directory for individual tests to populate."""
    vault = tmp_path / "test_vault"
    vault.mkdir()
    return vault
|
||||
|
||||
|
||||
# Tests for _chunk_section()
|
||||
|
||||
def test_i_can_chunk_short_section_into_single_chunk(tokenizer):
    """A section under the token budget comes back as one untouched chunk."""
    brief = " ".join(["word"] * 100)  # ~100 tokens, well under the 200 cap

    pieces = _chunk_section(
        section_text=brief,
        tokenizer=tokenizer,
        max_chunk_tokens=200,
        overlap_tokens=30,
    )

    # Exactly one chunk, byte-identical to the input.
    assert pieces == [brief]
|
||||
|
||||
|
||||
def test_i_can_chunk_long_section_with_overlap(tokenizer):
    """An over-budget section is split, with overlap between neighbours."""
    long_text = " ".join(f"word{i}" for i in range(500))  # ~500 tokens

    pieces = _chunk_section(
        section_text=long_text,
        tokenizer=tokenizer,
        max_chunk_tokens=200,
        overlap_tokens=30,
    )

    assert len(pieces) >= 2

    # Every chunk respects the token budget.
    for piece in pieces:
        assert len(tokenizer.encode(piece, add_special_tokens=False)) <= 200

    # Consecutive chunks share at least one word across the boundary.
    for left, right in zip(pieces, pieces[1:]):
        tail = left.split()[-10:]   # last 10 words of the earlier chunk
        head = right.split()[:10]   # first 10 words of the later chunk
        assert any(word in head for word in tail)
|
||||
|
||||
|
||||
def test_i_can_chunk_empty_section(tokenizer):
    """Chunking empty text yields no chunks at all."""
    pieces = _chunk_section(
        section_text="",
        tokenizer=tokenizer,
        max_chunk_tokens=200,
        overlap_tokens=30,
    )

    assert len(pieces) == 0
|
||||
|
||||
|
||||
# Tests for _create_chunks_from_document()
|
||||
|
||||
def test_i_can_create_chunks_from_document_with_short_sections(tmp_path, tokenizer):
    """Each short section becomes exactly one chunk with correct metadata.

    Fix: the hierarchy list of "Section 3" previously contained the section
    itself (["Section 1", "Section 3"]) — an apparent typo for its actual
    ancestors ["Section 1", "Section 2"]. The value is not asserted on, so
    the correction does not change what the test verifies.
    """
    vault_path = tmp_path / "vault"
    vault_path.mkdir()

    parsed_doc = ParsedDocument(
        file_path=vault_path / "test.md",
        title="test.md",
        sections=[
            MarkdownSection(1, "Section 1", "This is a short section with few words.", [], 1, 2),
            MarkdownSection(2, "Section 2", "Another short section here.", ["Section 1"], 3, 4),
            MarkdownSection(3, "Section 3", "Third short section.", ["Section 1", "Section 2"], 5, 6),
        ],
        raw_content="",  # not used in this test
    )

    chunks = _create_chunks_from_document(
        parsed_doc=parsed_doc,
        tokenizer=tokenizer,
        max_chunk_tokens=200,
        overlap_tokens=30,
        vault_path=vault_path,
    )

    # One chunk per short section.
    assert len(chunks) == 3

    for i, chunk in enumerate(chunks):
        metadata = chunk.metadata
        assert metadata.file_path == "test.md"
        assert metadata.section_title == f"Section {i + 1}"
        assert isinstance(metadata.line_start, int)
        assert isinstance(metadata.line_end, int)

        # Chunk IDs embed both the file name and the section title.
        assert "test.md" in chunk.id
        assert f"Section {i + 1}" in chunk.id
|
||||
|
||||
|
||||
def test_i_can_create_chunks_from_document_with_long_section(tmp_path, tokenizer):
    """A single over-budget section is split into numbered chunks."""
    vault_path = tmp_path / "vault"
    vault_path.mkdir()

    long_content = " ".join(f"word{i}" for i in range(500))  # ~500 tokens

    parsed_doc = ParsedDocument(
        file_path=vault_path / "test.md",
        title="test.md",
        sections=[
            MarkdownSection(1, "Long Section", long_content, [], 1, 1)
        ],
        raw_content=long_content,
    )

    chunks = _create_chunks_from_document(
        parsed_doc=parsed_doc,
        tokenizer=tokenizer,
        max_chunk_tokens=200,
        overlap_tokens=30,
        vault_path=vault_path,
    )

    assert len(chunks) >= 2

    # Every piece keeps the originating section's title and line span.
    for chunk in chunks:
        assert chunk.metadata.section_title == "Long Section"
        assert chunk.metadata.line_start == 1
        assert chunk.metadata.line_end == 1

    # Split pieces carry sequential chunk numbers in their IDs.
    assert "::chunk0" in chunks[0].id
    assert "::chunk1" in chunks[1].id
|
||||
|
||||
|
||||
def test_i_can_create_chunks_with_correct_relative_paths(tmp_path, tokenizer):
    """File paths in chunk metadata are relative to the vault root."""
    vault_path = tmp_path / "vault"
    vault_path.mkdir()
    subdir = vault_path / "subfolder"
    subdir.mkdir()

    parsed_doc = ParsedDocument(
        file_path=subdir / "nested.md",
        # NOTE(review): this title embeds the absolute subdir path — looks
        # unintentional, but it is not asserted on below; confirm intent.
        title=f"{subdir} nested.md",
        sections=[
            MarkdownSection(1, "Section", "Some content here.", [], 1, 2),
        ],
        raw_content="",
    )

    chunks = _create_chunks_from_document(
        parsed_doc=parsed_doc,
        tokenizer=tokenizer,
        max_chunk_tokens=200,
        overlap_tokens=30,
        vault_path=vault_path,
    )

    assert len(chunks) == 1
    assert chunks[0].metadata.file_path == "subfolder/nested.md"
|
||||
|
||||
|
||||
# Tests for _get_or_create_collection()
|
||||
|
||||
def test_i_can_create_new_collection(chroma_client):
    """Requesting a missing collection creates a fresh, empty one."""
    name = "test_collection"

    collection = _get_or_create_collection(chroma_client, name)

    assert collection.name == name
    assert collection.count() == 0  # brand-new collection holds nothing
|
||||
|
||||
|
||||
def test_i_can_reset_existing_collection(chroma_client):
    """An existing collection is wiped and recreated empty."""
    name = "test_collection"

    # Seed the collection so the reset is observable.
    seeded = chroma_client.create_collection(name)
    seeded.add(
        documents=["test document"],
        ids=["test_id"],
    )
    assert seeded.count() == 1

    fresh = _get_or_create_collection(chroma_client, name)

    assert fresh.name == name
    assert fresh.count() == 0  # previous contents must be gone
|
||||
|
||||
|
||||
# Tests for index_vault()
|
||||
|
||||
def test_i_can_index_single_markdown_file(test_vault, tmp_path):
    """Indexing one markdown file stores its chunks in the collection.

    Fix: dropped the unused ``embedding_model`` fixture parameter — it forced
    a SentenceTransformer load the test never used, slowing the suite.
    """
    test_file = test_vault / "test.md"
    test_file.write_text(
        "# Title\n\nThis is a test document with some content.\n\n## Section\n\nMore content here."
    )

    chroma_path = tmp_path / "chroma_db"

    stats = index_vault(
        vault_path=str(test_vault),
        chroma_db_path=str(chroma_path),
        collection_name="test_collection",
    )

    assert stats["files_processed"] == 1
    assert stats["chunks_created"] > 0
    assert stats["errors"] == []
    assert stats["collection_name"] == "test_collection"

    # The persisted collection must hold exactly the reported chunk count.
    client = chromadb.PersistentClient(
        path=str(chroma_path),
        settings=Settings(anonymized_telemetry=False),
    )
    collection = client.get_collection("test_collection")
    assert collection.count() == stats["chunks_created"]
|
||||
|
||||
|
||||
def test_i_can_index_multiple_markdown_files(test_vault, tmp_path):
    """Every markdown file in the vault gets processed."""
    for i in range(3):
        (test_vault / f"test{i}.md").write_text(
            f"# Document {i}\n\nContent for document {i}."
        )

    stats = index_vault(
        vault_path=str(test_vault),
        chroma_db_path=str(tmp_path / "chroma_db"),
    )

    assert stats["files_processed"] == 3
    assert stats["chunks_created"] >= 3  # at least one chunk per file
    assert stats["errors"] == []
|
||||
|
||||
|
||||
def test_i_can_continue_indexing_after_file_error(test_vault, tmp_path, monkeypatch):
    """One failing file is recorded as an error; the rest are still indexed."""
    (test_vault / "valid1.md").write_text("# Valid 1\n\nContent here.")
    (test_vault / "valid2.md").write_text("# Valid 2\n\nMore content.")
    (test_vault / "problematic.md").write_text("# Problem\n\nThis will fail.")

    # Patch the parser so that only problematic.md raises.
    from obsidian_rag import markdown_parser
    original_parse = markdown_parser.parse_markdown_file

    def failing_parse(file_path):
        if "problematic.md" in str(file_path):
            raise ValueError("Simulated parsing error")
        return original_parse(file_path)

    monkeypatch.setattr("indexer.parse_markdown_file", failing_parse)

    stats = index_vault(
        vault_path=str(test_vault),
        chroma_db_path=str(tmp_path / "chroma_db"),
    )

    # Two good files processed; the bad one lands in the error list.
    assert stats["files_processed"] == 2
    assert len(stats["errors"]) == 1
    assert "problematic.md" in stats["errors"][0]["file"]
    assert "Simulated parsing error" in stats["errors"][0]["error"]
|
||||
|
||||
|
||||
def test_i_cannot_index_nonexistent_vault(tmp_path):
    """A missing vault path raises ValueError with a clear message."""
    with pytest.raises(ValueError, match="Vault path does not exist"):
        index_vault(
            vault_path=str(tmp_path / "nonexistent_vault"),
            chroma_db_path=str(tmp_path / "chroma_db"),
        )
|
||||
|
||||
|
||||
def test_i_can_verify_embeddings_are_generated(test_vault, tmp_path):
    """Stored chunks carry non-zero 384-dimensional embeddings."""
    (test_vault / "test.md").write_text(
        "# Test\n\nThis is test content for embedding generation."
    )

    chroma_path = tmp_path / "chroma_db"
    stats = index_vault(
        vault_path=str(test_vault),
        chroma_db_path=str(chroma_path),
    )

    # Re-open the persisted store and pull every item with its embedding.
    client = chromadb.PersistentClient(
        path=str(chroma_path),
        settings=Settings(anonymized_telemetry=False),
    )
    collection = client.get_collection("obsidian_vault")
    stored = collection.get(include=["embeddings"])

    assert len(stored["ids"]) == stats["chunks_created"]
    assert stored["embeddings"] is not None

    for vector in stored["embeddings"]:
        assert len(vector) == 384  # all-MiniLM-L6-v2 embedding dimension
        assert any(component != 0 for component in vector)  # not a zero vector
|
||||
238
tests/test_markdown_parser.py
Normal file
238
tests/test_markdown_parser.py
Normal file
@@ -0,0 +1,238 @@
|
||||
"""Unit tests for markdown_parser module."""
|
||||
|
||||
import pytest
|
||||
from pathlib import Path
|
||||
from markdown_parser import (
|
||||
parse_markdown_file,
|
||||
find_section_at_line,
|
||||
MarkdownSection,
|
||||
ParsedDocument
|
||||
)
|
||||
|
||||
|
||||
@pytest.fixture
def tmp_markdown_file(tmp_path):
    """Return a factory that writes a markdown file into ``tmp_path``.

    Args:
        tmp_path: pytest temporary directory fixture

    Returns:
        Function that creates a markdown file with given content
    """

    def _create_file(content: str, filename: str = "test.md") -> Path:
        target = tmp_path / filename
        target.write_text(content, encoding="utf-8")
        return target

    return _create_file
|
||||
|
||||
|
||||
# Tests for parse_markdown_file()
|
||||
|
||||
def test_i_can_parse_file_with_single_section(tmp_markdown_file):
    """A lone header yields one section with its content and line span."""
    content = """# Main Title
This is the content of the section.
It has multiple lines."""

    doc = parse_markdown_file(tmp_markdown_file(content))

    assert len(doc.sections) == 1
    section = doc.sections[0]
    assert section.level == 1
    assert section.title == "Main Title"
    assert "This is the content" in section.content
    assert section.start_line == 1
    assert section.end_line == 3
|
||||
|
||||
|
||||
def test_i_can_parse_file_with_multiple_sections(tmp_markdown_file):
    """Sibling top-level headers become separate level-1 sections, in order."""
    content = """# Section One
Content of section one.

# Section Two
Content of section two.

# Section Three
Content of section three."""

    doc = parse_markdown_file(tmp_markdown_file(content))

    assert len(doc.sections) == 3
    titles = [section.title for section in doc.sections]
    assert titles == ["Section One", "Section Two", "Section Three"]
    assert all(section.level == 1 for section in doc.sections)
|
||||
|
||||
|
||||
def test_i_can_parse_file_with_nested_sections(tmp_markdown_file):
    """Headers at different depths keep their level and document order."""
    content = """# Main Title
Introduction text.

## Subsection A
Content A.

## Subsection B
Content B.

### Sub-subsection
Nested content."""

    doc = parse_markdown_file(tmp_markdown_file(content))

    assert len(doc.sections) == 4
    # (level, title) pairs in document order.
    expected = [
        (1, "Main Title"),
        (2, "Subsection A"),
        (2, "Subsection B"),
        (3, "Sub-subsection"),
    ]
    assert [(s.level, s.title) for s in doc.sections] == expected
|
||||
|
||||
|
||||
def test_i_can_parse_file_without_headers(tmp_markdown_file):
    """Header-less text becomes a single level-0, untitled section."""
    content = """This is a plain text file.
It has no headers at all.
Just regular content."""

    doc = parse_markdown_file(tmp_markdown_file(content))

    assert len(doc.sections) == 1
    section = doc.sections[0]
    assert section.level == 0
    assert section.title == ""
    assert section.content == content  # whole file carried verbatim
    assert section.start_line == 1
    assert section.end_line == 3
|
||||
|
||||
|
||||
def test_i_can_parse_empty_file(tmp_markdown_file):
    """An empty file still yields exactly one empty, untitled section."""
    path = tmp_markdown_file("")

    document = parse_markdown_file(path)

    assert len(document.sections) == 1
    empty_section = document.sections[0]
    assert empty_section.level == 0
    assert empty_section.title == ""
    assert empty_section.content == ""
    # Even with no content, the section reports a 1-line span.
    assert (empty_section.start_line, empty_section.end_line) == (1, 1)
|
||||
|
||||
|
||||
def test_i_can_track_correct_line_numbers(tmp_markdown_file):
    """Each section records the 1-based line range it occupies in the source file."""
    source = """# First Section
Line 2
Line 3

# Second Section
Line 6
Line 7
Line 8"""

    document = parse_markdown_file(tmp_markdown_file(source))

    first, second = document.sections[0], document.sections[1]
    # First section owns lines 1-4 (including the trailing blank line).
    assert (first.start_line, first.end_line) == (1, 4)
    # Second section picks up at its header and runs to end of file.
    assert (second.start_line, second.end_line) == (5, 8)
|
||||
|
||||
|
||||
def test_i_cannot_parse_nonexistent_file():
    """parse_markdown_file raises FileNotFoundError when the path does not exist."""
    missing = Path("/nonexistent/path/to/file.md")

    with pytest.raises(FileNotFoundError):
        parse_markdown_file(missing)
|
||||
|
||||
|
||||
# Tests for find_section_at_line()
|
||||
|
||||
def test_i_can_find_section_at_specific_line(tmp_markdown_file):
    """A line inside a section's body resolves to that section."""
    source = """# Section One
Line 2
Line 3

# Section Two
Line 6
Line 7"""

    document = parse_markdown_file(tmp_markdown_file(source))

    # Line 3 sits inside Section One; line 6 inside Section Two.
    for line_number, expected_title in ((3, "Section One"), (6, "Section Two")):
        match = find_section_at_line(document, line_number)
        assert match is not None
        assert match.title == expected_title
|
||||
|
||||
|
||||
def test_i_can_find_section_at_first_line(tmp_markdown_file):
    """The header line itself belongs to the section it opens."""
    source = """# Main Title

Content here."""

    document = parse_markdown_file(tmp_markdown_file(source))
    match = find_section_at_line(document, 1)

    assert match is not None
    assert match.title == "Main Title"
|
||||
|
||||
|
||||
def test_i_can_find_section_at_last_line(tmp_markdown_file):
    """A section's final line still resolves to that section."""
    source = """# Section One
Line 2
Line 3

# Section Two
Line 6"""

    document = parse_markdown_file(tmp_markdown_file(source))

    # Line 3 is the last content line of Section One; line 6 of Section Two.
    for last_line, expected_title in ((3, "Section One"), (6, "Section Two")):
        match = find_section_at_line(document, last_line)
        assert match is not None
        assert match.title == expected_title
|
||||
|
||||
|
||||
def test_i_cannot_find_section_for_invalid_line_number(tmp_markdown_file):
    """Out-of-range line numbers (negative, zero, past end-of-file) return None."""
    source = """# Title
Content"""

    document = parse_markdown_file(tmp_markdown_file(source))

    # None of these lines exist in a two-line document.
    for bad_line in (-1, 0, 1000):
        assert find_section_at_line(document, bad_line) is None
|
||||
337
tests/test_searcher.py
Normal file
337
tests/test_searcher.py
Normal file
@@ -0,0 +1,337 @@
|
||||
"""
|
||||
Unit tests for the searcher module.
|
||||
"""
|
||||
import pytest
|
||||
from pathlib import Path
|
||||
from indexer import index_vault
|
||||
from searcher import search_vault, _parse_search_results, SearchResult
|
||||
|
||||
|
||||
@pytest.fixture
def temp_vault(tmp_path):
    """
    Build a throwaway vault directory containing three sample markdown notes.
    """
    vault_dir = tmp_path / "test_vault"
    vault_dir.mkdir()

    # Two technical notes plus one off-topic note, so searches can discriminate.
    (vault_dir / "python_basics.md").write_text("""# Python Programming

Python is a high-level programming language known for its simplicity and readability.

## Variables and Data Types

In Python, you can create variables without declaring their type explicitly.
Numbers, strings, and booleans are the basic data types.

## Functions

Functions in Python are defined using the def keyword.
They help organize code into reusable blocks.
""")

    (vault_dir / "machine_learning.md").write_text("""# Machine Learning

Machine learning is a subset of artificial intelligence.

## Supervised Learning

Supervised learning uses labeled data to train models.
Common algorithms include linear regression and decision trees.

## Deep Learning

Deep learning uses neural networks with multiple layers.
It's particularly effective for image and speech recognition.
""")

    (vault_dir / "cooking.md").write_text("""# Italian Cuisine

Italian cooking emphasizes fresh ingredients and simple preparation.

## Pasta Dishes

Pasta is a staple of Italian cuisine.
There are hundreds of pasta shapes and sauce combinations.

## Pizza Making

Traditional Italian pizza uses a thin crust and fresh mozzarella.
""")

    return vault_dir
|
||||
|
||||
|
||||
@pytest.fixture
def indexed_vault(temp_vault, tmp_path):
    """
    Index the sample vault into a fresh ChromaDB directory and return its handles.
    """
    db_dir = tmp_path / "chroma_db"
    db_dir.mkdir()

    # Run a full indexing pass over the fixture vault.
    index_stats = index_vault(
        vault_path=str(temp_vault),
        chroma_db_path=str(db_dir),
        collection_name="test_collection",
    )

    # Everything a search test needs to address the freshly built index.
    return {
        "vault_path": temp_vault,
        "chroma_path": db_dir,
        "collection_name": "test_collection",
        "stats": index_stats,
    }
|
||||
|
||||
|
||||
# Passing tests
|
||||
|
||||
|
||||
def test_i_can_search_vault_with_valid_query(indexed_vault):
    """
    A plain query returns well-formed SearchResult objects sorted by score.
    """
    hits = search_vault(
        query="Python programming language",
        chroma_db_path=str(indexed_vault["chroma_path"]),
        collection_name=indexed_vault["collection_name"],
    )

    assert len(hits) > 0

    for hit in hits:
        # Every hit is a fully-populated SearchResult.
        assert isinstance(hit, SearchResult)
        assert isinstance(hit.file_path, str)
        assert isinstance(hit.section_title, str)
        assert isinstance(hit.line_start, int)
        assert isinstance(hit.line_end, int)
        assert isinstance(hit.score, float)
        assert isinstance(hit.text, str)
        # Similarity scores are normalized to [0, 1].
        assert 0.0 <= hit.score <= 1.0

    # Best match first: the score sequence must be non-increasing.
    scores = [hit.score for hit in hits]
    assert scores == sorted(scores, reverse=True)
|
||||
|
||||
|
||||
def test_i_can_search_vault_with_limit_parameter(indexed_vault):
    """
    search_vault returns no more hits than the requested limit.
    """
    max_hits = 3
    hits = search_vault(
        query="learning",
        chroma_db_path=str(indexed_vault["chroma_path"]),
        collection_name=indexed_vault["collection_name"],
        limit=max_hits,
    )

    # The limit is an upper bound, not a guarantee of exactly that many.
    assert len(hits) <= max_hits
|
||||
|
||||
|
||||
def test_i_can_search_vault_with_min_score_filter(indexed_vault):
    """
    Every returned hit scores at or above the requested min_score.
    """
    threshold = 0.5
    hits = search_vault(
        query="Python",
        chroma_db_path=str(indexed_vault["chroma_path"]),
        collection_name=indexed_vault["collection_name"],
        min_score=threshold,
    )

    # The filter is applied server-side; nothing below threshold leaks through.
    assert all(hit.score >= threshold for hit in hits)
|
||||
|
||||
|
||||
def test_i_can_get_correct_metadata_in_results(indexed_vault):
    """
    The top hit for a Python-centric query carries sensible metadata.
    """
    hits = search_vault(
        query="Python programming",
        chroma_db_path=str(indexed_vault["chroma_path"]),
        collection_name=indexed_vault["collection_name"],
        limit=1,
    )

    assert len(hits) > 0
    best = hits[0]

    # The Python note should win for this query.
    assert "python_basics.md" in best.file_path

    # Section title and snippet text must both be non-empty.
    assert len(best.section_title) > 0
    assert len(best.text) > 0

    # Line numbers form a valid, 1-based range.
    assert best.line_start > 0
    assert best.line_end >= best.line_start
|
||||
|
||||
|
||||
def test_i_can_search_with_different_collection_name(temp_vault, tmp_path):
    """
    Indexing and searching agree on a non-default collection name.
    """
    db_dir = tmp_path / "chroma_custom"
    db_dir.mkdir()
    collection = "my_custom_collection"

    # Index under the custom name...
    index_vault(
        vault_path=str(temp_vault),
        chroma_db_path=str(db_dir),
        collection_name=collection,
    )

    # ...then query it back under that same name.
    hits = search_vault(
        query="Python",
        chroma_db_path=str(db_dir),
        collection_name=collection,
    )

    assert len(hits) > 0
|
||||
|
||||
|
||||
def test_i_can_get_empty_results_when_no_match(indexed_vault):
    """
    An off-topic query under a strict threshold yields an empty list, not an error.
    """
    hits = search_vault(
        query="quantum physics relativity theory",
        chroma_db_path=str(indexed_vault["chroma_path"]),
        collection_name=indexed_vault["collection_name"],
        min_score=0.9,  # deliberately strict threshold
    )

    # No match is a normal outcome: an empty list, never an exception.
    assert isinstance(hits, list)
    assert hits == []
|
||||
|
||||
|
||||
# Error tests
|
||||
|
||||
|
||||
def test_i_cannot_search_with_empty_query(indexed_vault):
    """
    An empty query string is rejected with ValueError.
    """
    search_kwargs = {
        "query": "",
        "chroma_db_path": str(indexed_vault["chroma_path"]),
        "collection_name": indexed_vault["collection_name"],
    }

    with pytest.raises(ValueError, match="Query cannot be empty"):
        search_vault(**search_kwargs)
|
||||
|
||||
|
||||
def test_i_cannot_search_nonexistent_collection(tmp_path):
    """
    Querying a collection that was never created raises ValueError.
    """
    # A valid ChromaDB directory, but with no collections inside it.
    empty_db = tmp_path / "empty_chroma"
    empty_db.mkdir()

    with pytest.raises(ValueError, match="not found"):
        search_vault(
            query="test query",
            chroma_db_path=str(empty_db),
            collection_name="nonexistent_collection",
        )
|
||||
|
||||
|
||||
def test_i_cannot_search_with_whitespace_only_query(indexed_vault):
    """
    A query containing only whitespace is rejected like an empty one.
    """
    search_kwargs = {
        "query": "   ",
        "chroma_db_path": str(indexed_vault["chroma_path"]),
        "collection_name": indexed_vault["collection_name"],
    }

    with pytest.raises(ValueError, match="Query cannot be empty"):
        search_vault(**search_kwargs)
|
||||
|
||||
|
||||
# Helper function tests
|
||||
|
||||
|
||||
def test_i_can_parse_search_results_correctly():
    """
    _parse_search_results converts raw ChromaDB output into SearchResult
    objects, mapping each distance d to a similarity score of 1 - d.
    """
    # Hand-built ChromaDB query payload (lists-of-lists: one inner list per query).
    raw = {
        "documents": [[
            "Python is a programming language",
            "Machine learning basics",
        ]],
        "metadatas": [[
            {
                "file_path": "notes/python.md",
                "section_title": "Introduction",
                "line_start": 1,
                "line_end": 5,
            },
            {
                "file_path": "notes/ml.md",
                "section_title": "Overview",
                "line_start": 10,
                "line_end": 15,
            },
        ]],
        "distances": [[0.2, 0.4]],  # lower distance = closer match
    }

    parsed = _parse_search_results(raw, min_score=0.0)

    assert len(parsed) == 2
    first, second = parsed

    # All metadata fields propagate unchanged into the first result.
    assert first.file_path == "notes/python.md"
    assert first.section_title == "Introduction"
    assert first.line_start == 1
    assert first.line_end == 5
    assert first.text == "Python is a programming language"

    # Scores are derived as 1 - distance.
    assert first.score == pytest.approx(0.8)  # 1 - 0.2
    assert second.score == pytest.approx(0.6)  # 1 - 0.4
|
||||
|
||||
|
||||
def test_i_can_filter_results_by_min_score():
    """
    _parse_search_results drops every hit scoring below min_score.
    """
    raw = {
        "documents": [["text1", "text2", "text3"]],
        "metadatas": [[
            {"file_path": "a.md", "section_title": "A", "line_start": 1, "line_end": 2},
            {"file_path": "b.md", "section_title": "B", "line_start": 1, "line_end": 2},
            {"file_path": "c.md", "section_title": "C", "line_start": 1, "line_end": 2},
        ]],
        # Distances 0.1 / 0.5 / 0.8 map to similarities 0.9 / 0.5 / 0.2.
        "distances": [[0.1, 0.5, 0.8]],
    }

    surviving = _parse_search_results(raw, min_score=0.6)

    # Only the 0.9-scoring hit clears the 0.6 threshold.
    assert len(surviving) == 1
    assert surviving[0].file_path == "a.md"
    assert surviving[0].score == pytest.approx(0.9)
|
||||
Reference in New Issue
Block a user