Implemented default pipeline

2025-09-26 22:08:39 +02:00
parent f1b551d243
commit 4de732b0ae
56 changed files with 4534 additions and 2837 deletions
--- a/tests/utils/test_document_matching.py
+++ b/tests/utils/test_document_matching.py
@@ -0,0 +1,89 @@
+import os
+from datetime import datetime
+
+import pytest
+from app.models.document import FileDocument, FileType
+from app.utils.document_matching import fuzzy_matching, subsequence_matching
+
+
+def get_doc(filename: str = "sample.txt"):
+  """Build a sample FileDocument for testing.
+
+  Args:
+    filename: Name stored on the document and appended to "/path/to/" to
+      form its filepath. The extension (without the leading dot) is passed
+      to FileType; when os.path.splitext finds no extension, "txt" is used.
+
+  Returns:
+    A FileDocument whose hash, size and MIME type are fixed placeholder
+    values that do not depend on ``filename``.
+  """
+  # A None default cannot work here: os.path.splitext(None) raises TypeError,
+  # so the default is a real filename instead.
+  extension = os.path.splitext(filename)[1].lstrip(".")
+  return FileDocument(
+    filename=f"{filename}",
+    filepath=f"/path/to/{filename}",
+    file_hash="a1b2c3d4e5f6789012345678901234567890abcdef1234567890abcdef123456",
+    file_type=FileType(extension or "txt"),
+    detected_at=datetime.now(),
+    file_size=1024,
+    mime_type="application/pdf"
+  )
+
+
+class TestFuzzyMatching:
+  """Checks fuzzy_matching on exact, near, dissimilar and ranked queries."""
+
+  def test_i_can_find_exact_match_with_fuzzy(self):
+    # A query identical to the stored filename must be the only hit
+    candidates = [get_doc(filename="hello.txt")]
+    matches = fuzzy_matching("hello.txt", candidates)
+    assert [doc.filename for doc in matches] == ["hello.txt"]
+
+  def test_i_can_find_close_match_with_fuzzy(self):
+    # One missing letter ("helo.txt") must still clear a 0.7 threshold
+    candidates = [get_doc(filename="hello.txt")]
+    matches = fuzzy_matching("helo.txt", candidates, similarity_threshold=0.7)
+    assert [doc.filename for doc in matches] == ["hello.txt"]
+
+  def test_i_cannot_find_dissimilar_match_with_fuzzy(self):
+    # "world.txt" must be rejected against "hello.txt" at a 0.7 threshold
+    candidates = [get_doc(filename="hello.txt")]
+    matches = fuzzy_matching("world.txt", candidates, similarity_threshold=0.7)
+    assert len(matches) == 0
+
+  def test_i_can_sort_by_similarity_in_fuzzy(self):
+    # For the query "helo.txt", "hello.txt" is expected ahead of "hllll.txt"
+    candidates = [get_doc(filename=name) for name in ("hello.txt", "hllll.txt")]
+    matches = fuzzy_matching("helo.txt", candidates, similarity_threshold=0.5)
+    best = matches[0]
+    assert best.filename == "hello.txt"
+
+
+class TestSubsequenceMatching:
+  """Checks subsequence_matching on ordering, ranking, misses and letter case."""
+
+  def test_i_can_match_subsequence_simple(self):
+    # The letters of "ifb" appear in that order inside "ilFaitBeau.txt"
+    candidates = [get_doc(filename="ilFaitBeau.txt")]
+    matches = subsequence_matching("ifb", candidates)
+    assert [doc.filename for doc in matches] == ["ilFaitBeau.txt"]
+
+  def test_i_cannot_match_wrong_order_subsequence(self):
+    # "bfi" uses letters of "ilFaitBeau.txt" but never in that order: no match
+    candidates = [get_doc(filename="ilFaitBeau.txt")]
+    matches = subsequence_matching("bfi", candidates)
+    assert len(matches) == 0
+
+  def test_i_can_match_multiple_documents_subsequence(self):
+    # Both filenames contain "ifb" in order; "ilFaitBeau.txt" must rank first
+    candidates = [
+      get_doc(filename=name)
+      for name in ("ilFaitBeau.txt", "information_base.txt")
+    ]
+    matches = subsequence_matching("ifb", candidates)
+    ranked_names = [doc.filename for doc in matches]
+    assert ranked_names == ["ilFaitBeau.txt", "information_base.txt"]
+
+  def test_i_cannot_match_unrelated_subsequence(self):
+    # "xyz" must not match "ilFaitBeau.txt"
+    candidates = [get_doc(filename="ilFaitBeau.txt")]
+    matches = subsequence_matching("xyz", candidates)
+    assert len(matches) == 0
+
+  def test_i_can_handle_case_insensitivity_in_subsequence(self):
+    # Lowercase "hw" must match the uppercase H and W of "HelloWorld.txt"
+    candidates = [get_doc(filename="HelloWorld.txt")]
+    matches = subsequence_matching("hw", candidates)
+    assert [doc.filename for doc in matches] == ["HelloWorld.txt"]