import os
from datetime import datetime
from typing import Optional

import pytest

from app.models.document import FileDocument, FileType
from app.utils.document_matching import fuzzy_matching, subsequence_matching

# Name used by get_doc() when the caller does not supply one.
_DEFAULT_FILENAME = "sample.txt"


def get_doc(filename: Optional[str] = None):
    """Build a sample FileDocument for testing.

    Args:
        filename: Name of the document. When omitted (or ``None``), a
            default name is used so the helper never fails on
            ``os.path.splitext(None)``.

    Returns:
        A FileDocument whose ``filepath`` is ``/path/to/<filename>`` and
        whose ``file_type`` is derived from the filename extension,
        falling back to ``"txt"`` when the name has no extension. The
        hash, size and MIME type are fixed placeholder values.
    """
    if filename is None:
        filename = _DEFAULT_FILENAME

    # Extension without the leading dot; empty string when there is none.
    extension = os.path.splitext(filename)[1].lstrip(".")

    return FileDocument(
        filename=filename,
        filepath=f"/path/to/{filename}",
        file_hash="a1b2c3d4e5f6789012345678901234567890abcdef1234567890abcdef123456",
        file_type=FileType(extension or "txt"),
        detected_at=datetime.now(),
        file_size=1024,
        mime_type="application/pdf",
    )


class TestFuzzyMatching:
    """Tests for ``fuzzy_matching``."""

    def test_i_can_find_exact_match_with_fuzzy(self):
        """An identical filename is returned as the single match."""
        # Exact match should always pass
        docs = [get_doc(filename="hello.txt")]

        result = fuzzy_matching("hello.txt", docs)

        assert len(result) == 1
        assert result[0].filename == "hello.txt"

    def test_i_can_find_close_match_with_fuzzy(self):
        """A query with one missing letter still matches at threshold 0.7."""
        # "helo.txt" should match "hello.txt" with high similarity
        docs = [get_doc(filename="hello.txt")]

        result = fuzzy_matching("helo.txt", docs, similarity_threshold=0.7)

        assert len(result) == 1
        assert result[0].filename == "hello.txt"

    def test_i_cannot_find_dissimilar_match_with_fuzzy(self):
        """A dissimilar query returns no match at threshold 0.7."""
        # "world.txt" should not match "hello.txt"
        docs = [get_doc(filename="hello.txt")]

        result = fuzzy_matching("world.txt", docs, similarity_threshold=0.7)

        assert len(result) == 0

    def test_i_can_sort_by_similarity_in_fuzzy(self):
        """The closest filename comes first in the result."""
        # "helo.txt" is closer to "hello.txt" than "hllll.txt"
        docs = [
            get_doc(filename="hello.txt"),
            get_doc(filename="hllll.txt"),
        ]

        result = fuzzy_matching("helo.txt", docs, similarity_threshold=0.5)

        assert result[0].filename == "hello.txt"


class TestSubsequenceMatching:
    """Tests for ``subsequence_matching``."""

    def test_i_can_match_subsequence_simple(self):
        """Query letters appearing in order in the filename produce a match."""
        # "ifb" should match "ilFaitBeau.txt"
        docs = [get_doc(filename="ilFaitBeau.txt")]

        result = subsequence_matching("ifb", docs)

        assert len(result) == 1
        assert result[0].filename == "ilFaitBeau.txt"

    def test_i_cannot_match_wrong_order_subsequence(self):
        """Query letters appearing out of order produce no match."""
        # "bfi" should not match "ilFaitBeau.txt" because the order is wrong:
        # no "f" follows the only "b". ("fib" would not be a valid
        # counter-example here: f, i, b do occur in that order.)
        docs = [get_doc(filename="ilFaitBeau.txt")]

        result = subsequence_matching("bfi", docs)

        assert len(result) == 0

    def test_i_can_match_multiple_documents_subsequence(self):
        """Several matching documents are all returned, in the expected order."""
        # "ifb" should match both filenames, but "ilFaitBeau.txt" has a higher score
        docs = [
            get_doc(filename="ilFaitBeau.txt"),
            get_doc(filename="information_base.txt"),
        ]

        result = subsequence_matching("ifb", docs)

        assert len(result) == 2
        assert result[0].filename == "ilFaitBeau.txt"
        assert result[1].filename == "information_base.txt"

    def test_i_cannot_match_unrelated_subsequence(self):
        """A query whose letters are absent from the filename matches nothing."""
        # "xyz" should not match any file
        docs = [get_doc(filename="ilFaitBeau.txt")]

        result = subsequence_matching("xyz", docs)

        assert len(result) == 0

    def test_i_can_handle_case_insensitivity_in_subsequence(self):
        """A lowercase query matches uppercase letters in the filename."""
        # Matching should be case-insensitive
        docs = [get_doc(filename="HelloWorld.txt")]

        result = subsequence_matching("hw", docs)

        assert len(result) == 1
        assert result[0].filename == "HelloWorld.txt"