Files
MyObsidianAI/tests/test_searcher.py
Kodjo Sossouvi d4925f7969 Initial commit
2025-12-12 11:31:44 +01:00

338 lines
8.8 KiB
Python

"""
Unit tests for the searcher module.
"""
import pytest
from pathlib import Path
from indexer import index_vault
from searcher import search_vault, _parse_search_results, SearchResult
@pytest.fixture
def temp_vault(tmp_path):
    """
    Build a throwaway vault directory populated with three markdown notes.

    The notes cover unrelated topics (Python, machine learning, Italian
    cooking); each one has a top-level heading followed by second-level
    sections. Returns the path of the vault directory.
    """
    vault_path = tmp_path / "test_vault"
    vault_path.mkdir()

    # File name -> markdown body, written in declaration order.
    sample_notes = {
        "python_basics.md": """# Python Programming
Python is a high-level programming language known for its simplicity and readability.
## Variables and Data Types
In Python, you can create variables without declaring their type explicitly.
Numbers, strings, and booleans are the basic data types.
## Functions
Functions in Python are defined using the def keyword.
They help organize code into reusable blocks.
""",
        "machine_learning.md": """# Machine Learning
Machine learning is a subset of artificial intelligence.
## Supervised Learning
Supervised learning uses labeled data to train models.
Common algorithms include linear regression and decision trees.
## Deep Learning
Deep learning uses neural networks with multiple layers.
It's particularly effective for image and speech recognition.
""",
        "cooking.md": """# Italian Cuisine
Italian cooking emphasizes fresh ingredients and simple preparation.
## Pasta Dishes
Pasta is a staple of Italian cuisine.
There are hundreds of pasta shapes and sauce combinations.
## Pizza Making
Traditional Italian pizza uses a thin crust and fresh mozzarella.
""",
    }
    for note_name, note_body in sample_notes.items():
        (vault_path / note_name).write_text(note_body)

    return vault_path
@pytest.fixture
def indexed_vault(temp_vault, tmp_path):
    """
    Index the sample vault into a fresh database directory.

    Returns a dict exposing the vault path ("vault_path"), the database
    directory ("chroma_path"), the collection name ("collection_name") and
    whatever index_vault returned ("stats").
    """
    collection = "test_collection"
    db_dir = tmp_path / "chroma_db"
    db_dir.mkdir()

    # Run the indexer once; tests depending on this fixture only search.
    indexing_stats = index_vault(
        vault_path=str(temp_vault),
        chroma_db_path=str(db_dir),
        collection_name=collection,
    )

    context = {}
    context["vault_path"] = temp_vault
    context["chroma_path"] = db_dir
    context["collection_name"] = collection
    context["stats"] = indexing_stats
    return context
# Passing tests
def test_i_can_search_vault_with_valid_query(indexed_vault):
    """
    Verify that a plain search yields well-formed, ranked results.
    """
    db_location = str(indexed_vault["chroma_path"])
    hits = search_vault(
        query="Python programming language",
        chroma_db_path=db_location,
        collection_name=indexed_vault["collection_name"],
    )

    # At least one hit is expected for this query
    assert len(hits) >= 1

    # Attribute name -> type every hit must expose
    expected_field_types = (
        ("file_path", str),
        ("section_title", str),
        ("line_start", int),
        ("line_end", int),
        ("score", float),
        ("text", str),
    )
    for hit in hits:
        assert isinstance(hit, SearchResult)
        for field_name, field_type in expected_field_types:
            assert isinstance(getattr(hit, field_name), field_type)
        # Scores are expected to stay within [0, 1]
        assert 0.0 <= hit.score <= 1.0

    # Best match first: the score sequence must already be in descending order
    observed_scores = [hit.score for hit in hits]
    assert observed_scores == sorted(observed_scores, reverse=True)
def test_i_can_search_vault_with_limit_parameter(indexed_vault):
    """
    Verify that the number of results never exceeds the requested limit.
    """
    max_hits = 3
    db_location = str(indexed_vault["chroma_path"])
    collection = indexed_vault["collection_name"]

    hits = search_vault(
        query="learning",
        chroma_db_path=db_location,
        collection_name=collection,
        limit=max_hits,
    )

    # Fewer hits than the cap is acceptable; more is not
    assert len(hits) <= max_hits
def test_i_can_search_vault_with_min_score_filter(indexed_vault):
    """
    Verify that every returned result meets the min_score threshold.
    """
    threshold = 0.5
    hits = search_vault(
        query="Python",
        chroma_db_path=str(indexed_vault["chroma_path"]),
        collection_name=indexed_vault["collection_name"],
        min_score=threshold,
    )

    # No hit may fall below the threshold (an empty result also satisfies this)
    assert all(hit.score >= threshold for hit in hits)
def test_i_can_get_correct_metadata_in_results(indexed_vault):
    """
    Verify the metadata carried by the best match for a Python query.
    """
    hits = search_vault(
        query="Python programming",
        chroma_db_path=str(indexed_vault["chroma_path"]),
        collection_name=indexed_vault["collection_name"],
        limit=1,
    )
    assert len(hits) > 0

    best = hits[0]

    # The Python note is expected to rank first
    assert "python_basics.md" in best.file_path

    # Section title and text must both be non-empty
    assert len(best.section_title) > 0
    assert len(best.text) > 0

    # Line range must be positive and well ordered
    assert best.line_start > 0
    assert best.line_end >= best.line_start
def test_i_can_search_with_different_collection_name(temp_vault, tmp_path):
    """
    Verify that indexing and searching work with a non-default collection name.
    """
    custom_collection = "my_custom_collection"
    db_dir = tmp_path / "chroma_custom"
    db_dir.mkdir()
    db_location = str(db_dir)

    # Populate the custom collection
    index_vault(
        vault_path=str(temp_vault),
        chroma_db_path=db_location,
        collection_name=custom_collection,
    )

    # Query that same collection
    hits = search_vault(
        query="Python",
        chroma_db_path=db_location,
        collection_name=custom_collection,
    )
    assert len(hits) > 0
def test_i_can_get_empty_results_when_no_match(indexed_vault):
    """
    Verify that an unrelated query with a strict threshold yields an empty list.
    """
    strict_threshold = 0.9  # Very high threshold
    hits = search_vault(
        query="quantum physics relativity theory",
        chroma_db_path=str(indexed_vault["chroma_path"]),
        collection_name=indexed_vault["collection_name"],
        min_score=strict_threshold,
    )

    # An empty list is expected, not an exception
    assert isinstance(hits, list)
    assert hits == []
# Error tests
def test_i_cannot_search_with_empty_query(indexed_vault):
    """
    Verify that searching with an empty string is rejected with ValueError.
    """
    search_args = {
        "query": "",
        "chroma_db_path": str(indexed_vault["chroma_path"]),
        "collection_name": indexed_vault["collection_name"],
    }
    with pytest.raises(ValueError, match="Query cannot be empty"):
        search_vault(**search_args)
def test_i_cannot_search_nonexistent_collection(tmp_path):
    """
    Verify that querying a collection that was never created raises ValueError.
    """
    # Fresh, empty database directory: nothing has been indexed into it
    db_dir = tmp_path / "empty_chroma"
    db_dir.mkdir()
    db_location = str(db_dir)

    with pytest.raises(ValueError, match="not found"):
        search_vault(
            query="test query",
            chroma_db_path=db_location,
            collection_name="nonexistent_collection",
        )
def test_i_cannot_search_with_whitespace_only_query(indexed_vault):
    """
    Verify that a whitespace-only query is rejected just like an empty one.
    """
    blank_query = " "
    db_location = str(indexed_vault["chroma_path"])
    collection = indexed_vault["collection_name"]

    with pytest.raises(ValueError, match="Query cannot be empty"):
        search_vault(
            query=blank_query,
            chroma_db_path=db_location,
            collection_name=collection,
        )
# Helper function tests
def test_i_can_parse_search_results_correctly():
    """
    Verify that a raw query payload is converted into result objects.
    """
    # Hand-built stand-in for a ChromaDB query response (one query, two hits)
    documents = [
        "Python is a programming language",
        "Machine learning basics",
    ]
    metadatas = [
        {
            "file_path": "notes/python.md",
            "section_title": "Introduction",
            "line_start": 1,
            "line_end": 5,
        },
        {
            "file_path": "notes/ml.md",
            "section_title": "Overview",
            "line_start": 10,
            "line_end": 15,
        },
    ]
    distances = [0.2, 0.4]  # ChromaDB distances (lower = more similar)
    payload = {
        "documents": [documents],
        "metadatas": [metadatas],
        "distances": [distances],
    }

    parsed = _parse_search_results(payload, min_score=0.0)
    assert len(parsed) == 2
    first, second = parsed[0], parsed[1]

    # First hit: metadata and text must be carried over unchanged
    assert (
        first.file_path,
        first.section_title,
        first.line_start,
        first.line_end,
        first.text,
    ) == (
        "notes/python.md",
        "Introduction",
        1,
        5,
        "Python is a programming language",
    )
    assert first.score == pytest.approx(0.8)  # 1 - 0.2

    # Second hit: only the score conversion is checked
    assert second.score == pytest.approx(0.6)  # 1 - 0.4
def test_i_can_filter_results_by_min_score():
    """
    Verify that parsing drops hits whose score is below min_score.
    """
    stems = ("a", "b", "c")
    payload = {
        "documents": [["text1", "text2", "text3"]],
        # One metadata dict per stem: a.md/"A", b.md/"B", c.md/"C"
        "metadatas": [[
            {
                "file_path": f"{stem}.md",
                "section_title": stem.upper(),
                "line_start": 1,
                "line_end": 2,
            }
            for stem in stems
        ]],
        "distances": [[0.1, 0.5, 0.8]],  # Scores will be: 0.9, 0.5, 0.2
    }

    kept = _parse_search_results(payload, min_score=0.6)

    # Only the first hit survives (score 0.9 >= 0.6)
    assert len(kept) == 1
    survivor = kept[0]
    assert survivor.file_path == "a.md"
    assert survivor.score == pytest.approx(0.9)