Initial commit
This commit is contained in:
337
tests/test_searcher.py
Normal file
337
tests/test_searcher.py
Normal file
@@ -0,0 +1,337 @@
|
||||
"""
|
||||
Unit tests for the searcher module.
|
||||
"""
|
||||
import pytest
|
||||
from pathlib import Path
|
||||
from indexer import index_vault
|
||||
from searcher import search_vault, _parse_search_results, SearchResult
|
||||
|
||||
|
||||
@pytest.fixture
def temp_vault(tmp_path):
    """
    Build a throwaway vault directory containing three sample markdown notes.

    The vault is created as ``test_vault`` under pytest's ``tmp_path`` and
    holds one note about Python, one about machine learning and one about
    Italian cooking. The path of the vault directory is returned.
    """
    # File name -> markdown body; insertion order is the order of creation.
    sample_notes = {
        "python_basics.md": """# Python Programming

Python is a high-level programming language known for its simplicity and readability.

## Variables and Data Types

In Python, you can create variables without declaring their type explicitly.
Numbers, strings, and booleans are the basic data types.

## Functions

Functions in Python are defined using the def keyword.
They help organize code into reusable blocks.
""",
        "machine_learning.md": """# Machine Learning

Machine learning is a subset of artificial intelligence.

## Supervised Learning

Supervised learning uses labeled data to train models.
Common algorithms include linear regression and decision trees.

## Deep Learning

Deep learning uses neural networks with multiple layers.
It's particularly effective for image and speech recognition.
""",
        "cooking.md": """# Italian Cuisine

Italian cooking emphasizes fresh ingredients and simple preparation.

## Pasta Dishes

Pasta is a staple of Italian cuisine.
There are hundreds of pasta shapes and sauce combinations.

## Pizza Making

Traditional Italian pizza uses a thin crust and fresh mozzarella.
""",
    }

    root = tmp_path / "test_vault"
    root.mkdir()

    for file_name, body in sample_notes.items():
        (root / file_name).write_text(body)

    return root
|
||||
|
||||
|
||||
@pytest.fixture
def indexed_vault(temp_vault, tmp_path):
    """
    Index the sample vault into a fresh chroma database directory.

    Returns a dict with the keys ``vault_path``, ``chroma_path``,
    ``collection_name`` and ``stats`` (whatever ``index_vault`` returned).
    """
    collection = "test_collection"

    db_dir = tmp_path / "chroma_db"
    db_dir.mkdir()

    # Run the indexer against the sample vault
    index_stats = index_vault(
        vault_path=str(temp_vault),
        chroma_db_path=str(db_dir),
        collection_name=collection,
    )

    return {
        "vault_path": temp_vault,
        "chroma_path": db_dir,
        "collection_name": collection,
        "stats": index_stats,
    }
|
||||
|
||||
|
||||
# Passing tests
|
||||
|
||||
|
||||
def test_i_can_search_vault_with_valid_query(indexed_vault):
    """
    A basic search yields well-formed SearchResult objects ordered by score.
    """
    # Attribute name -> type each result is expected to expose
    expected_fields = (
        ("file_path", str),
        ("section_title", str),
        ("line_start", int),
        ("line_end", int),
        ("score", float),
        ("text", str),
    )

    hits = search_vault(
        query="Python programming language",
        chroma_db_path=str(indexed_vault["chroma_path"]),
        collection_name=indexed_vault["collection_name"],
    )

    # The query must match something
    assert len(hits) > 0

    for hit in hits:
        # Every entry is a SearchResult ...
        assert isinstance(hit, SearchResult)

        # ... carrying every expected field with the expected type
        for attr_name, attr_type in expected_fields:
            assert isinstance(getattr(hit, attr_name), attr_type)

        # Scores are expected to lie in [0, 1]
        assert 1.0 >= hit.score >= 0.0

    # Best match first: scores must already be in descending order
    ordered_scores = [hit.score for hit in hits]
    assert ordered_scores == sorted(ordered_scores, reverse=True)
|
||||
|
||||
|
||||
def test_i_can_search_vault_with_limit_parameter(indexed_vault):
    """
    The number of returned results never exceeds the requested limit.
    """
    max_hits = 3

    hits = search_vault(
        query="learning",
        chroma_db_path=str(indexed_vault["chroma_path"]),
        collection_name=indexed_vault["collection_name"],
        limit=max_hits,
    )

    # Fewer results are acceptable, more are not
    assert len(hits) <= max_hits
|
||||
|
||||
|
||||
def test_i_can_search_vault_with_min_score_filter(indexed_vault):
    """
    Every result returned with a min_score has a score at or above it.
    """
    threshold = 0.5

    hits = search_vault(
        query="Python",
        chroma_db_path=str(indexed_vault["chroma_path"]),
        collection_name=indexed_vault["collection_name"],
        min_score=threshold,
    )

    # No result may fall below the threshold (an empty result list passes)
    assert all(hit.score >= threshold for hit in hits)
|
||||
|
||||
|
||||
def test_i_can_get_correct_metadata_in_results(indexed_vault):
    """
    The best match for a Python query carries sensible metadata.
    """
    hits = search_vault(
        query="Python programming",
        chroma_db_path=str(indexed_vault["chroma_path"]),
        collection_name=indexed_vault["collection_name"],
        limit=1,
    )

    assert len(hits) > 0
    best = hits[0]

    # The Python note should be the most relevant file
    assert "python_basics.md" in best.file_path

    # A non-empty section title is expected
    assert len(best.section_title) > 0

    # Line range: positive start, end not before start
    assert best.line_start > 0
    assert best.line_end >= best.line_start

    # The matched text itself must not be empty
    assert len(best.text) > 0
|
||||
|
||||
|
||||
def test_i_can_search_with_different_collection_name(temp_vault, tmp_path):
    """
    Indexing and searching both work with a custom collection name.
    """
    collection = "my_custom_collection"

    db_dir = tmp_path / "chroma_custom"
    db_dir.mkdir()
    db_location = str(db_dir)

    # Index the vault under the custom collection name
    index_vault(
        vault_path=str(temp_vault),
        chroma_db_path=db_location,
        collection_name=collection,
    )

    # Query that same collection
    hits = search_vault(
        query="Python",
        chroma_db_path=db_location,
        collection_name=collection,
    )

    assert len(hits) > 0
|
||||
|
||||
|
||||
def test_i_can_get_empty_results_when_no_match(indexed_vault):
    """
    An unrelated query with a very high min_score returns an empty list.
    """
    hits = search_vault(
        query="quantum physics relativity theory",
        chroma_db_path=str(indexed_vault["chroma_path"]),
        collection_name=indexed_vault["collection_name"],
        min_score=0.9,  # Very high threshold
    )

    # An empty list is expected rather than an exception
    assert isinstance(hits, list)
    assert len(hits) == 0
|
||||
|
||||
|
||||
# Error tests
|
||||
|
||||
|
||||
def test_i_cannot_search_with_empty_query(indexed_vault):
    """
    An empty query string is rejected with ValueError.
    """
    search_kwargs = {
        "query": "",
        "chroma_db_path": str(indexed_vault["chroma_path"]),
        "collection_name": indexed_vault["collection_name"],
    }

    with pytest.raises(ValueError, match="Query cannot be empty"):
        search_vault(**search_kwargs)
|
||||
|
||||
|
||||
def test_i_cannot_search_nonexistent_collection(tmp_path):
    """
    Searching a collection that was never created raises ValueError.
    """
    # A database directory that exists but has had nothing indexed into it
    db_dir = tmp_path / "empty_chroma"
    db_dir.mkdir()

    search_kwargs = {
        "query": "test query",
        "chroma_db_path": str(db_dir),
        "collection_name": "nonexistent_collection",
    }

    with pytest.raises(ValueError, match="not found"):
        search_vault(**search_kwargs)
|
||||
|
||||
|
||||
def test_i_cannot_search_with_whitespace_only_query(indexed_vault):
    """
    A query consisting solely of whitespace is rejected with ValueError.
    """
    search_kwargs = {
        "query": " ",
        "chroma_db_path": str(indexed_vault["chroma_path"]),
        "collection_name": indexed_vault["collection_name"],
    }

    with pytest.raises(ValueError, match="Query cannot be empty"):
        search_vault(**search_kwargs)
|
||||
|
||||
|
||||
# Helper function tests
|
||||
|
||||
|
||||
def test_i_can_parse_search_results_correctly():
    """
    Raw ChromaDB-style query output is turned into the expected results.

    The test expects each score to equal ``1 - distance``.
    """
    python_meta = {
        "file_path": "notes/python.md",
        "section_title": "Introduction",
        "line_start": 1,
        "line_end": 5,
    }
    ml_meta = {
        "file_path": "notes/ml.md",
        "section_title": "Overview",
        "line_start": 10,
        "line_end": 15,
    }

    # Hand-built stand-in for a ChromaDB query response
    fake_response = {
        "documents": [[
            "Python is a programming language",
            "Machine learning basics",
        ]],
        "metadatas": [[python_meta, ml_meta]],
        "distances": [[0.2, 0.4]],  # ChromaDB distances (lower = more similar)
    }

    parsed = _parse_search_results(fake_response, min_score=0.0)

    assert len(parsed) == 2
    first, second = parsed[0], parsed[1]

    # First entry: all fields are verified
    assert first.file_path == "notes/python.md"
    assert first.section_title == "Introduction"
    assert first.line_start == 1
    assert first.line_end == 5
    assert first.text == "Python is a programming language"
    assert first.score == pytest.approx(0.8)  # 1 - 0.2

    # Second entry: only the score is verified
    assert second.score == pytest.approx(0.6)  # 1 - 0.4
|
||||
|
||||
|
||||
def test_i_can_filter_results_by_min_score():
    """
    Parsing drops every entry whose score is below min_score.
    """
    texts = ["text1", "text2", "text3"]
    metadata_rows = [
        {"file_path": "a.md", "section_title": "A", "line_start": 1, "line_end": 2},
        {"file_path": "b.md", "section_title": "B", "line_start": 1, "line_end": 2},
        {"file_path": "c.md", "section_title": "C", "line_start": 1, "line_end": 2},
    ]
    distance_row = [0.1, 0.5, 0.8]  # Scores will be: 0.9, 0.5, 0.2

    fake_response = {
        "documents": [texts],
        "metadatas": [metadata_rows],
        "distances": [distance_row],
    }

    kept = _parse_search_results(fake_response, min_score=0.6)

    # Only the first entry clears the threshold (score 0.9 >= 0.6)
    assert len(kept) == 1
    survivor = kept[0]
    assert survivor.file_path == "a.md"
    assert survivor.score == pytest.approx(0.9)
|
||||
Reference in New Issue
Block a user