# MyObsidianAI/tests/test_searcher.py

"""
Unit tests for the searcher module.
"""
import pytest
from pathlib import Path
from indexer import index_vault
from searcher import search_vault, _parse_search_results, SearchResult


@pytest.fixture
def temp_vault(tmp_path):
  """
  Create a temporary vault with sample markdown files.

  Three notes are written under ``tmp_path / "test_vault"``:
  ``python_basics.md``, ``machine_learning.md`` and ``cooking.md``.
  Each note has one H1 heading followed by two H2 sections, which gives
  the search tests three clearly distinct topics to query against.

  Args:
    tmp_path: pytest's built-in per-test temporary directory.

  Returns:
    Path of the vault directory that contains the three notes.
  """
  vault_path = tmp_path / "test_vault"
  vault_path.mkdir()

  # File name -> markdown content of each sample note.
  sample_notes = {
      "python_basics.md": """# Python Programming

Python is a high-level programming language known for its simplicity and readability.

## Variables and Data Types

In Python, you can create variables without declaring their type explicitly.
Numbers, strings, and booleans are the basic data types.

## Functions

Functions in Python are defined using the def keyword.
They help organize code into reusable blocks.
""",
      "machine_learning.md": """# Machine Learning

Machine learning is a subset of artificial intelligence.

## Supervised Learning

Supervised learning uses labeled data to train models.
Common algorithms include linear regression and decision trees.

## Deep Learning

Deep learning uses neural networks with multiple layers.
It's particularly effective for image and speech recognition.
""",
      "cooking.md": """# Italian Cuisine

Italian cooking emphasizes fresh ingredients and simple preparation.

## Pasta Dishes

Pasta is a staple of Italian cuisine.
There are hundreds of pasta shapes and sauce combinations.

## Pizza Making

Traditional Italian pizza uses a thin crust and fresh mozzarella.
""",
  }

  for file_name, content in sample_notes.items():
    # Pin the encoding so the bytes on disk never depend on the
    # platform's locale default.
    (vault_path / file_name).write_text(content, encoding="utf-8")

  return vault_path


@pytest.fixture
def indexed_vault(temp_vault, tmp_path):
  """
  Index the sample vault into a fresh database directory.

  Returns a dict with the keys ``vault_path``, ``chroma_path``,
  ``collection_name`` and ``stats`` (the value returned by
  ``index_vault``).
  """
  collection = "test_collection"

  db_dir = tmp_path / "chroma_db"
  db_dir.mkdir()

  # Run the indexer once; tests reuse the resulting database directory.
  indexing_stats = index_vault(
    vault_path=str(temp_vault),
    chroma_db_path=str(db_dir),
    collection_name=collection,
  )

  return dict(
    vault_path=temp_vault,
    chroma_path=db_dir,
    collection_name=collection,
    stats=indexing_stats,
  )


# Passing tests


def test_i_can_search_vault_with_valid_query(indexed_vault):
  """
  A plain query yields well-formed SearchResult objects ordered by score.
  """
  # Attribute name -> type every result is expected to expose.
  expected_field_types = (
    ("file_path", str),
    ("section_title", str),
    ("line_start", int),
    ("line_end", int),
    ("score", float),
    ("text", str),
  )
  db_path = str(indexed_vault["chroma_path"])
  collection = indexed_vault["collection_name"]

  results = search_vault(
    query="Python programming language",
    chroma_db_path=db_path,
    collection_name=collection,
  )

  # At least one hit is expected for this query.
  assert len(results) > 0

  for hit in results:
    assert isinstance(hit, SearchResult)

    for field_name, field_type in expected_field_types:
      assert isinstance(getattr(hit, field_name), field_type)

    # Scores must lie in the closed interval [0, 1].
    assert 0.0 <= hit.score <= 1.0

  # Hits must already be in descending score order.
  observed_scores = [hit.score for hit in results]
  assert observed_scores == sorted(observed_scores, reverse=True)


def test_i_can_search_vault_with_limit_parameter(indexed_vault):
  """
  The number of returned results never exceeds the requested limit.
  """
  max_hits = 3
  db_path = str(indexed_vault["chroma_path"])
  collection = indexed_vault["collection_name"]

  hits = search_vault(
    query="learning",
    chroma_db_path=db_path,
    collection_name=collection,
    limit=max_hits,
  )

  # Fewer hits than the limit is acceptable; more is not.
  assert len(hits) <= max_hits


def test_i_can_search_vault_with_min_score_filter(indexed_vault):
  """
  Every returned result scores at or above the requested min_score.
  """
  threshold = 0.5
  db_path = str(indexed_vault["chroma_path"])
  collection = indexed_vault["collection_name"]

  hits = search_vault(
    query="Python",
    chroma_db_path=db_path,
    collection_name=collection,
    min_score=threshold,
  )

  # Collect any hit that slipped under the threshold; none are allowed.
  below_threshold = [hit for hit in hits if not hit.score >= threshold]
  assert below_threshold == []


def test_i_can_get_correct_metadata_in_results(indexed_vault):
  """
  The best hit for a Python query carries sensible metadata.
  """
  db_path = str(indexed_vault["chroma_path"])
  collection = indexed_vault["collection_name"]

  hits = search_vault(
    query="Python programming",
    chroma_db_path=db_path,
    collection_name=collection,
    limit=1,
  )

  assert len(hits) > 0
  best = hits[0]

  # The Python note is expected to rank first for this query.
  assert "python_basics.md" in best.file_path

  # Section title and text must both be non-empty.
  assert len(best.section_title) > 0

  # Line numbers must be positive and form a valid range.
  assert best.line_start > 0
  assert best.line_end >= best.line_start

  assert len(best.text) > 0


def test_i_can_search_with_different_collection_name(temp_vault, tmp_path):
  """
  Indexing and searching both work with a non-default collection name.
  """
  custom_collection = "my_custom_collection"
  db_dir = tmp_path / "chroma_custom"
  db_dir.mkdir()
  db_path = str(db_dir)

  # Build the index under the custom collection name.
  index_vault(
    vault_path=str(temp_vault),
    chroma_db_path=db_path,
    collection_name=custom_collection,
  )

  # Query that same collection and expect at least one hit.
  hits = search_vault(
    query="Python",
    chroma_db_path=db_path,
    collection_name=custom_collection,
  )

  assert len(hits) > 0


def test_i_can_get_empty_results_when_no_match(indexed_vault):
  """
  An off-topic query with a very high threshold returns an empty list.
  """
  db_path = str(indexed_vault["chroma_path"])
  collection = indexed_vault["collection_name"]

  hits = search_vault(
    query="quantum physics relativity theory",
    chroma_db_path=db_path,
    collection_name=collection,
    min_score=0.9,  # Very high threshold
  )

  # No exception, and the result is a list with nothing in it.
  assert isinstance(hits, list)
  assert len(hits) == 0


# Error tests


def test_i_cannot_search_with_empty_query(indexed_vault):
  """
  Searching with an empty string is rejected with ValueError.
  """
  search_kwargs = {
    "query": "",
    "chroma_db_path": str(indexed_vault["chroma_path"]),
    "collection_name": indexed_vault["collection_name"],
  }

  with pytest.raises(ValueError, match="Query cannot be empty"):
    search_vault(**search_kwargs)


def test_i_cannot_search_nonexistent_collection(tmp_path):
  """
  Searching a collection that was never created raises ValueError.
  """
  # A database directory that exists but has never been indexed into.
  db_dir = tmp_path / "empty_chroma"
  db_dir.mkdir()

  search_kwargs = {
    "query": "test query",
    "chroma_db_path": str(db_dir),
    "collection_name": "nonexistent_collection",
  }

  with pytest.raises(ValueError, match="not found"):
    search_vault(**search_kwargs)


def test_i_cannot_search_with_whitespace_only_query(indexed_vault):
  """
  A query made only of spaces is rejected like an empty query.
  """
  blank_query = "   "
  db_path = str(indexed_vault["chroma_path"])
  collection = indexed_vault["collection_name"]

  with pytest.raises(ValueError, match="Query cannot be empty"):
    search_vault(
      query=blank_query,
      chroma_db_path=db_path,
      collection_name=collection,
    )


# Helper function tests


def test_i_can_parse_search_results_correctly():
  """
  Raw query output is converted into SearchResult objects field by field.

  The test expects each score to equal ``1 - distance``.
  """
  # Hand-built stand-in for a raw query response: every value is wrapped
  # in an outer one-element list.
  documents = [
    "Python is a programming language",
    "Machine learning basics",
  ]
  metadatas = [
    {
      "file_path": "notes/python.md",
      "section_title": "Introduction",
      "line_start": 1,
      "line_end": 5,
    },
    {
      "file_path": "notes/ml.md",
      "section_title": "Overview",
      "line_start": 10,
      "line_end": 15,
    },
  ]
  distances = [0.2, 0.4]  # lower distance = more similar
  raw_results = {
    "documents": [documents],
    "metadatas": [metadatas],
    "distances": [distances],
  }

  parsed = _parse_search_results(raw_results, min_score=0.0)

  assert len(parsed) == 2
  first, second = parsed[0], parsed[1]

  # First result: every field is checked.
  assert first.file_path == "notes/python.md"
  assert first.section_title == "Introduction"
  assert first.line_start == 1
  assert first.line_end == 5
  assert first.text == "Python is a programming language"
  assert first.score == pytest.approx(0.8)  # 1 - 0.2

  # Second result: only the score is checked.
  assert second.score == pytest.approx(0.6)  # 1 - 0.4


def test_i_can_filter_results_by_min_score():
  """
  Parsing drops every entry whose score falls below min_score.
  """
  # (file_path, section_title) for three otherwise identical entries.
  entries = (("a.md", "A"), ("b.md", "B"), ("c.md", "C"))
  metadatas = [
    {"file_path": path, "section_title": title, "line_start": 1, "line_end": 2}
    for path, title in entries
  ]
  raw_results = {
    "documents": [["text1", "text2", "text3"]],
    "metadatas": [metadatas],
    "distances": [[0.1, 0.5, 0.8]],  # Scores will be: 0.9, 0.5, 0.2
  }

  kept = _parse_search_results(raw_results, min_score=0.6)

  # Only the first entry clears the threshold (score 0.9 >= 0.6).
  assert len(kept) == 1
  survivor = kept[0]
  assert survivor.file_path == "a.md"
  assert survivor.score == pytest.approx(0.9)