"""Unit tests for the searcher module.

The integration tests build a small markdown vault in a temporary directory,
index it with ``index_vault`` and query it through ``search_vault``.  The
helper tests feed hand-written ChromaDB-shaped dictionaries directly to
``_parse_search_results``.
"""

import pytest
from pathlib import Path

from indexer import index_vault
from searcher import search_vault, _parse_search_results, SearchResult


@pytest.fixture
def temp_vault(tmp_path: Path) -> Path:
    """Create a temporary vault with three sample markdown files.

    The notes cover unrelated topics (Python, machine learning, cooking) so
    that relevance-based assertions have something to discriminate between.

    Args:
        tmp_path: pytest's per-test temporary directory.

    Returns:
        Path to the directory holding the sample notes.
    """
    vault_path = tmp_path / "test_vault"
    vault_path.mkdir()

    # Create sample files.  Headings must start their own line to be valid
    # markdown headings; the encoding is explicit so the files do not depend
    # on the platform's locale default.
    file1 = vault_path / "python_basics.md"
    file1.write_text(
        """# Python Programming

Python is a high-level programming language known for its simplicity and readability.

## Variables and Data Types

In Python, you can create variables without declaring their type explicitly.
Numbers, strings, and booleans are the basic data types.

## Functions

Functions in Python are defined using the def keyword.
They help organize code into reusable blocks.
""",
        encoding="utf-8",
    )

    file2 = vault_path / "machine_learning.md"
    file2.write_text(
        """# Machine Learning

Machine learning is a subset of artificial intelligence.

## Supervised Learning

Supervised learning uses labeled data to train models.
Common algorithms include linear regression and decision trees.

## Deep Learning

Deep learning uses neural networks with multiple layers.
It's particularly effective for image and speech recognition.
""",
        encoding="utf-8",
    )

    file3 = vault_path / "cooking.md"
    file3.write_text(
        """# Italian Cuisine

Italian cooking emphasizes fresh ingredients and simple preparation.

## Pasta Dishes

Pasta is a staple of Italian cuisine.
There are hundreds of pasta shapes and sauce combinations.

## Pizza Making

Traditional Italian pizza uses a thin crust and fresh mozzarella.
""",
        encoding="utf-8",
    )

    return vault_path


@pytest.fixture
def indexed_vault(temp_vault: Path, tmp_path: Path) -> dict:
    """Create and index a temporary vault.

    Args:
        temp_vault: Directory of sample notes from the ``temp_vault`` fixture.
        tmp_path: pytest's per-test temporary directory (hosts the database).

    Returns:
        A dict with the keys ``vault_path``, ``chroma_path``,
        ``collection_name`` and ``stats`` (whatever ``index_vault`` returned).
    """
    chroma_path = tmp_path / "chroma_db"
    chroma_path.mkdir()

    # Index the vault
    stats = index_vault(
        vault_path=str(temp_vault),
        chroma_db_path=str(chroma_path),
        collection_name="test_collection",
    )

    return {
        "vault_path": temp_vault,
        "chroma_path": chroma_path,
        "collection_name": "test_collection",
        "stats": stats,
    }


# Passing tests


def test_i_can_search_vault_with_valid_query(indexed_vault: dict) -> None:
    """A basic search returns well-formed results sorted by descending score."""
    results = search_vault(
        query="Python programming language",
        chroma_db_path=str(indexed_vault["chroma_path"]),
        collection_name=indexed_vault["collection_name"],
    )

    # Should return results
    assert len(results) > 0

    # All results should be SearchResult instances
    for result in results:
        assert isinstance(result, SearchResult)

        # Check that all fields are present
        assert isinstance(result.file_path, str)
        assert isinstance(result.section_title, str)
        assert isinstance(result.line_start, int)
        assert isinstance(result.line_end, int)
        assert isinstance(result.score, float)
        assert isinstance(result.text, str)

        # Scores should be between 0 and 1
        assert 0.0 <= result.score <= 1.0

    # Results should be sorted by score (descending)
    scores = [r.score for r in results]
    assert scores == sorted(scores, reverse=True)


def test_i_can_search_vault_with_limit_parameter(indexed_vault: dict) -> None:
    """The ``limit`` parameter caps the number of returned results."""
    limit = 3

    results = search_vault(
        query="learning",
        chroma_db_path=str(indexed_vault["chroma_path"]),
        collection_name=indexed_vault["collection_name"],
        limit=limit,
    )

    # Should return at most 'limit' results
    assert len(results) <= limit


def test_i_can_search_vault_with_min_score_filter(indexed_vault: dict) -> None:
    """Only results scoring at least ``min_score`` are returned."""
    min_score = 0.5

    results = search_vault(
        query="Python",
        chroma_db_path=str(indexed_vault["chroma_path"]),
        collection_name=indexed_vault["collection_name"],
        min_score=min_score,
    )

    # All results should have score >= min_score
    for result in results:
        assert result.score >= min_score


def test_i_can_get_correct_metadata_in_results(indexed_vault: dict) -> None:
    """The top hit for a Python query carries sensible metadata."""
    results = search_vault(
        query="Python programming",
        chroma_db_path=str(indexed_vault["chroma_path"]),
        collection_name=indexed_vault["collection_name"],
        limit=1,
    )

    assert len(results) > 0
    top_result = results[0]

    # Should find python_basics.md as most relevant
    assert "python_basics.md" in top_result.file_path

    # Should have a section title
    assert len(top_result.section_title) > 0

    # Line numbers should be positive
    assert top_result.line_start > 0
    assert top_result.line_end >= top_result.line_start

    # Text should not be empty
    assert len(top_result.text) > 0


def test_i_can_search_with_different_collection_name(
    temp_vault: Path, tmp_path: Path
) -> None:
    """Searching works against a collection indexed under a custom name."""
    chroma_path = tmp_path / "chroma_custom"
    chroma_path.mkdir()
    custom_collection = "my_custom_collection"

    # Index with custom collection name
    index_vault(
        vault_path=str(temp_vault),
        chroma_db_path=str(chroma_path),
        collection_name=custom_collection,
    )

    # Search with the same custom collection name
    results = search_vault(
        query="Python",
        chroma_db_path=str(chroma_path),
        collection_name=custom_collection,
    )

    assert len(results) > 0


def test_i_can_get_empty_results_when_no_match(indexed_vault: dict) -> None:
    """A query unrelated to the vault yields an empty list, not an error."""
    results = search_vault(
        query="quantum physics relativity theory",
        chroma_db_path=str(indexed_vault["chroma_path"]),
        collection_name=indexed_vault["collection_name"],
        min_score=0.9,  # Very high threshold
    )

    # Should return empty list, not raise exception
    assert isinstance(results, list)
    assert len(results) == 0


# Error tests


def test_i_cannot_search_with_empty_query(indexed_vault: dict) -> None:
    """An empty query raises ``ValueError``."""
    with pytest.raises(ValueError, match="Query cannot be empty"):
        search_vault(
            query="",
            chroma_db_path=str(indexed_vault["chroma_path"]),
            collection_name=indexed_vault["collection_name"],
        )


def test_i_cannot_search_nonexistent_collection(tmp_path: Path) -> None:
    """Searching a collection that was never created raises ``ValueError``."""
    chroma_path = tmp_path / "empty_chroma"
    chroma_path.mkdir()

    with pytest.raises(ValueError, match="not found"):
        search_vault(
            query="test query",
            chroma_db_path=str(chroma_path),
            collection_name="nonexistent_collection",
        )


def test_i_cannot_search_with_whitespace_only_query(indexed_vault: dict) -> None:
    """A query containing only whitespace raises ``ValueError``."""
    with pytest.raises(ValueError, match="Query cannot be empty"):
        search_vault(
            query=" ",
            chroma_db_path=str(indexed_vault["chroma_path"]),
            collection_name=indexed_vault["collection_name"],
        )


# Helper function tests


def test_i_can_parse_search_results_correctly() -> None:
    """Raw ChromaDB-shaped results are converted field by field.

    The expected scores follow the ``1 - distance`` relation asserted below.
    """
    # Mock ChromaDB query results
    raw_results = {
        "documents": [[
            "Python is a programming language",
            "Machine learning basics",
        ]],
        "metadatas": [[
            {
                "file_path": "notes/python.md",
                "section_title": "Introduction",
                "line_start": 1,
                "line_end": 5,
            },
            {
                "file_path": "notes/ml.md",
                "section_title": "Overview",
                "line_start": 10,
                "line_end": 15,
            },
        ]],
        "distances": [[0.2, 0.4]],  # ChromaDB distances (lower = more similar)
    }

    results = _parse_search_results(raw_results, min_score=0.0)

    assert len(results) == 2

    # Check first result
    assert results[0].file_path == "notes/python.md"
    assert results[0].section_title == "Introduction"
    assert results[0].line_start == 1
    assert results[0].line_end == 5
    assert results[0].text == "Python is a programming language"
    assert results[0].score == pytest.approx(0.8)  # 1 - 0.2

    # Check second result
    assert results[1].score == pytest.approx(0.6)  # 1 - 0.4


def test_i_can_filter_results_by_min_score() -> None:
    """Results scoring below ``min_score`` are dropped during parsing."""
    raw_results = {
        "documents": [["text1", "text2", "text3"]],
        "metadatas": [[
            {"file_path": "a.md", "section_title": "A", "line_start": 1, "line_end": 2},
            {"file_path": "b.md", "section_title": "B", "line_start": 1, "line_end": 2},
            {"file_path": "c.md", "section_title": "C", "line_start": 1, "line_end": 2},
        ]],
        "distances": [[0.1, 0.5, 0.8]],  # Scores will be: 0.9, 0.5, 0.2
    }

    results = _parse_search_results(raw_results, min_score=0.6)

    # Only first result should pass (score 0.9 >= 0.6)
    assert len(results) == 1
    assert results[0].file_path == "a.md"
    assert results[0].score == pytest.approx(0.9)