338 lines
8.8 KiB
Python
338 lines
8.8 KiB
Python
"""
Unit tests for the searcher module.
"""
|
|
import pytest
|
|
from pathlib import Path
|
|
from indexer import index_vault
|
|
from searcher import search_vault, _parse_search_results, SearchResult
|
|
|
|
|
|
@pytest.fixture
def temp_vault(tmp_path):
    """
    Build a throwaway vault directory holding three sample markdown notes.

    The notes are written under ``tmp_path / "test_vault"`` and cover Python,
    machine learning and Italian cooking.

    Returns:
        The path of the created vault directory.
    """
    # File name -> markdown body, written in this order.
    sample_notes = {
        "python_basics.md": """# Python Programming

Python is a high-level programming language known for its simplicity and readability.

## Variables and Data Types

In Python, you can create variables without declaring their type explicitly.
Numbers, strings, and booleans are the basic data types.

## Functions

Functions in Python are defined using the def keyword.
They help organize code into reusable blocks.
""",
        "machine_learning.md": """# Machine Learning

Machine learning is a subset of artificial intelligence.

## Supervised Learning

Supervised learning uses labeled data to train models.
Common algorithms include linear regression and decision trees.

## Deep Learning

Deep learning uses neural networks with multiple layers.
It's particularly effective for image and speech recognition.
""",
        "cooking.md": """# Italian Cuisine

Italian cooking emphasizes fresh ingredients and simple preparation.

## Pasta Dishes

Pasta is a staple of Italian cuisine.
There are hundreds of pasta shapes and sauce combinations.

## Pizza Making

Traditional Italian pizza uses a thin crust and fresh mozzarella.
""",
    }

    vault_dir = tmp_path / "test_vault"
    vault_dir.mkdir()

    for note_name, note_body in sample_notes.items():
        (vault_dir / note_name).write_text(note_body)

    return vault_dir
|
|
|
|
|
|
@pytest.fixture
def indexed_vault(temp_vault, tmp_path):
    """
    Index the sample vault into a fresh database directory.

    A ``chroma_db`` directory is created under ``tmp_path`` and passed to
    ``index_vault`` together with the vault path and a fixed collection name.

    Returns:
        A dict with the keys ``vault_path``, ``chroma_path``,
        ``collection_name`` and ``stats`` (the value returned by
        ``index_vault``).
    """
    collection = "test_collection"

    db_dir = tmp_path / "chroma_db"
    db_dir.mkdir()

    # Run the indexer once; the tests only read from the resulting collection.
    indexing_stats = index_vault(
        vault_path=str(temp_vault),
        chroma_db_path=str(db_dir),
        collection_name=collection,
    )

    context = {}
    context["vault_path"] = temp_vault
    context["chroma_path"] = db_dir
    context["collection_name"] = collection
    context["stats"] = indexing_stats
    return context
|
|
|
|
|
|
# Passing tests
|
|
|
|
|
|
def test_i_can_search_vault_with_valid_query(indexed_vault):
    """
    A plain query yields well-formed, score-ordered SearchResult objects.
    """
    hits = search_vault(
        query="Python programming language",
        chroma_db_path=str(indexed_vault["chroma_path"]),
        collection_name=indexed_vault["collection_name"],
    )

    # At least one hit is expected for this query.
    assert len(hits) > 0

    # Attribute name -> type each SearchResult field must have.
    expected_field_types = (
        ("file_path", str),
        ("section_title", str),
        ("line_start", int),
        ("line_end", int),
        ("score", float),
        ("text", str),
    )

    for hit in hits:
        assert isinstance(hit, SearchResult)

        for field_name, field_type in expected_field_types:
            assert isinstance(getattr(hit, field_name), field_type)

        # Scores must lie in the closed interval [0, 1].
        assert 0.0 <= hit.score <= 1.0

    # Hits must come back ordered from highest to lowest score.
    returned_scores = [hit.score for hit in hits]
    assert returned_scores == sorted(returned_scores, reverse=True)
|
|
|
|
|
|
def test_i_can_search_vault_with_limit_parameter(indexed_vault):
    """
    The number of returned results never exceeds the requested limit.
    """
    max_results = 3

    search_kwargs = {
        "query": "learning",
        "chroma_db_path": str(indexed_vault["chroma_path"]),
        "collection_name": indexed_vault["collection_name"],
        "limit": max_results,
    }
    hits = search_vault(**search_kwargs)

    # Fewer hits than the limit are acceptable; more are not.
    assert len(hits) <= max_results
|
|
|
|
|
|
def test_i_can_search_vault_with_min_score_filter(indexed_vault):
    """
    Every result returned with a min_score filter meets that threshold.
    """
    threshold = 0.5

    hits = search_vault(
        query="Python",
        chroma_db_path=str(indexed_vault["chroma_path"]),
        collection_name=indexed_vault["collection_name"],
        min_score=threshold,
    )

    # No returned hit may score below the requested minimum.
    assert all(hit.score >= threshold for hit in hits)
|
|
|
|
|
|
def test_i_can_get_correct_metadata_in_results(indexed_vault):
    """
    The top hit for a Python query carries sensible metadata.
    """
    hits = search_vault(
        query="Python programming",
        chroma_db_path=str(indexed_vault["chroma_path"]),
        collection_name=indexed_vault["collection_name"],
        limit=1,
    )

    assert len(hits) > 0
    best_hit = hits[0]

    # The Python note is expected to rank first for this query.
    assert "python_basics.md" in best_hit.file_path

    # A non-empty section title must be attached.
    assert len(best_hit.section_title) > 0

    # Line range: positive start, and the end must not precede the start.
    first_line, last_line = best_hit.line_start, best_hit.line_end
    assert first_line > 0
    assert last_line >= first_line

    # The matched text itself must be non-empty.
    assert len(best_hit.text) > 0
|
|
|
|
|
|
def test_i_can_search_with_different_collection_name(temp_vault, tmp_path):
    """
    Indexing and searching both work with a custom collection name.
    """
    collection = "my_custom_collection"

    db_dir = tmp_path / "chroma_custom"
    db_dir.mkdir()
    db_location = str(db_dir)

    # Populate the custom collection first ...
    index_vault(
        vault_path=str(temp_vault),
        chroma_db_path=db_location,
        collection_name=collection,
    )

    # ... then query that same collection by name.
    hits = search_vault(
        query="Python",
        chroma_db_path=db_location,
        collection_name=collection,
    )

    assert len(hits) > 0
|
|
|
|
|
|
def test_i_can_get_empty_results_when_no_match(indexed_vault):
    """
    An off-topic query with a very high min_score returns an empty list.
    """
    strict_threshold = 0.9  # Very high threshold

    hits = search_vault(
        query="quantum physics relativity theory",
        chroma_db_path=str(indexed_vault["chroma_path"]),
        collection_name=indexed_vault["collection_name"],
        min_score=strict_threshold,
    )

    # The call must return an empty list rather than raise.
    assert isinstance(hits, list)
    assert len(hits) == 0
|
|
|
|
|
|
# Error tests
|
|
|
|
|
|
def test_i_cannot_search_with_empty_query(indexed_vault):
    """
    Searching with an empty string is rejected with ValueError.
    """
    db_location = str(indexed_vault["chroma_path"])
    collection = indexed_vault["collection_name"]

    with pytest.raises(ValueError, match="Query cannot be empty"):
        search_vault(
            query="",
            chroma_db_path=db_location,
            collection_name=collection,
        )
|
|
|
|
|
|
def test_i_cannot_search_nonexistent_collection(tmp_path):
    """
    Querying a collection that was never created raises ValueError.
    """
    # A database directory into which nothing has been indexed.
    empty_db_dir = tmp_path / "empty_chroma"
    empty_db_dir.mkdir()

    search_kwargs = {
        "query": "test query",
        "chroma_db_path": str(empty_db_dir),
        "collection_name": "nonexistent_collection",
    }

    with pytest.raises(ValueError, match="not found"):
        search_vault(**search_kwargs)
|
|
|
|
|
|
def test_i_cannot_search_with_whitespace_only_query(indexed_vault):
    """
    A query made only of whitespace is rejected with ValueError.
    """
    db_location = str(indexed_vault["chroma_path"])
    collection = indexed_vault["collection_name"]

    with pytest.raises(ValueError, match="Query cannot be empty"):
        search_vault(
            query=" ",
            chroma_db_path=db_location,
            collection_name=collection,
        )
|
|
|
|
|
|
# Helper function tests
|
|
|
|
|
|
def test_i_can_parse_search_results_correctly():
    """
    Raw ChromaDB-style query output is turned into the expected results.

    The assertions expect each score to equal ``1 - distance``.
    """
    # Hand-built stand-in for a ChromaDB query response (one query, two hits).
    documents = [
        "Python is a programming language",
        "Machine learning basics",
    ]
    metadatas = [
        {
            "file_path": "notes/python.md",
            "section_title": "Introduction",
            "line_start": 1,
            "line_end": 5,
        },
        {
            "file_path": "notes/ml.md",
            "section_title": "Overview",
            "line_start": 10,
            "line_end": 15,
        },
    ]
    distances = [0.2, 0.4]  # ChromaDB distances (lower = more similar)

    raw_results = {
        "documents": [documents],
        "metadatas": [metadatas],
        "distances": [distances],
    }

    parsed = _parse_search_results(raw_results, min_score=0.0)

    assert len(parsed) == 2
    first_hit, second_hit = parsed

    # First hit: every field is checked.
    assert first_hit.file_path == "notes/python.md"
    assert first_hit.section_title == "Introduction"
    assert first_hit.line_start == 1
    assert first_hit.line_end == 5
    assert first_hit.text == "Python is a programming language"
    assert first_hit.score == pytest.approx(0.8)  # 1 - 0.2

    # Second hit: only the score is checked.
    assert second_hit.score == pytest.approx(0.6)  # 1 - 0.4
|
|
|
|
|
|
def test_i_can_filter_results_by_min_score():
    """
    Parsing drops every result whose score is below min_score.
    """
    # Three hits from a.md / b.md / c.md with identical line ranges.
    metadatas = [
        {
            "file_path": f"{stem}.md",
            "section_title": stem.upper(),
            "line_start": 1,
            "line_end": 2,
        }
        for stem in ("a", "b", "c")
    ]

    raw_results = {
        "documents": [["text1", "text2", "text3"]],
        "metadatas": [metadatas],
        "distances": [[0.1, 0.5, 0.8]],  # Scores will be: 0.9, 0.5, 0.2
    }

    kept = _parse_search_results(raw_results, min_score=0.6)

    # Only the first hit clears the threshold (score 0.9 >= 0.6).
    assert len(kept) == 1
    survivor = kept[0]
    assert survivor.file_path == "a.md"
    assert survivor.score == pytest.approx(0.9)
|