Fixed unit tests
@@ -12,6 +12,20 @@ from difflib import SequenceMatcher
 from motor.motor_asyncio import AsyncIOMotorCollection
 from app.models.document import FileDocument
 from app.database.connection import get_database
+from app.utils.ducment_matching import fuzzy_matching, subsequence_matching
+
+
+class MatchMethodBase:
+    pass
+
+
+class SubsequenceMatching(MatchMethodBase):
+    pass
+
+
+class FuzzyMatching(MatchMethodBase):
+    def __init__(self, threshold: float = 0.6):
+        self.threshold = threshold
 
 
 class FileDocumentRepository:
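For orientation, the classes added above are lightweight strategy markers: FuzzyMatching carries a similarity threshold, while MatchMethodBase and SubsequenceMatching hold no state, and the later hunk shows find_document_by_name simply branching on isinstance. A minimal, standalone sketch of that dispatch pattern (class names reused from this diff; the sketch itself is not part of the commit):

```python
class MatchMethodBase:
    pass

class SubsequenceMatching(MatchMethodBase):
    pass

class FuzzyMatching(MatchMethodBase):
    def __init__(self, threshold: float = 0.6):
        self.threshold = threshold

def describe(method: MatchMethodBase | None) -> str:
    # Mirrors the repository's branching: fuzzy matching when explicitly requested,
    # subsequence matching otherwise (including when method is None).
    if isinstance(method, FuzzyMatching):
        return f"fuzzy (threshold={method.threshold})"
    return "subsequence"

print(describe(FuzzyMatching(0.8)))  # fuzzy (threshold=0.8)
print(describe(None))                # subsequence
```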
@@ -28,6 +42,14 @@ class FileDocumentRepository:
         self.collection: AsyncIOMotorCollection = self.db.files
         self._ensure_indexes()
 
+    async def initialize(self):
+        """
+        Initialize repository by ensuring required indexes exist.
+
+        Should be called after repository instantiation to set up database indexes.
+        """
+        await self._ensure_indexes()
+
     async def _ensure_indexes(self):
         """
         Ensure required database indexes exist.
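The hunk above adds an explicit initialize() entry point; note that the bare self._ensure_indexes() call in __init__ creates a coroutine without awaiting it, which is presumably why the awaited initialize() path exists. What _ensure_indexes actually creates is not shown in this diff; a plausible sketch using Motor's create_index (the index fields here are assumptions, inferred only from the duplicate-key messages later in the diff):

```python
from motor.motor_asyncio import AsyncIOMotorCollection

async def _ensure_indexes(collection: AsyncIOMotorCollection) -> None:
    # Unique indexes so inserting the same file twice raises DuplicateKeyError
    # (field names are assumed, not taken from the commit).
    await collection.create_index("file_hash", unique=True)
    await collection.create_index("file_path", unique=True)
    # Non-unique index to keep filename lookups fast.
    await collection.create_index("filename")
```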
@@ -64,7 +86,7 @@ class FileDocumentRepository:
             return file_data
 
         except DuplicateKeyError as e:
-            raise DuplicateKeyError(f"File with same hash already exists: {e}")
+            raise DuplicateKeyError(f"File with same file path already exists: {e}")
         except PyMongoError as e:
             raise ValueError(f"Failed to create file document: {e}")
 
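Because the repository re-raises DuplicateKeyError with a friendlier message and wraps other PyMongoError cases into ValueError, callers only need to handle those two exception types. A hedged caller-side sketch (the create method's name and arguments are assumptions, not shown in this diff):

```python
from pymongo.errors import DuplicateKeyError

async def store_file(repo, file_data: dict):
    # repo is a FileDocumentRepository; create_file_document is a hypothetical name
    # for the create method whose error handling appears in the hunk above.
    try:
        return await repo.create_file_document(file_data)
    except DuplicateKeyError:
        # Same hash or same file path already stored; treat as a no-op here.
        return None
    except ValueError as exc:
        # Any other database failure was wrapped into ValueError by the repository.
        raise RuntimeError(f"Could not store file: {exc}") from exc
```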
@@ -128,13 +150,13 @@ class FileDocumentRepository:
         except PyMongoError:
             return None
 
-    async def find_document_by_name(self, filename: str, similarity_threshold: float = 0.6) -> List[FileDocument]:
+    async def find_document_by_name(self, filename: str, matching_method: MatchMethodBase = None) -> List[FileDocument]:
         """
         Find file documents by filename using fuzzy matching.
 
         Args:
             filename (str): Filename to search for
-            similarity_threshold (float): Minimum similarity ratio (0.0 to 1.0)
+            matching_method (MatchMethodBase): Matching strategy to apply; defaults to subsequence matching when not a FuzzyMatching instance
 
         Returns:
             List[FileDocument]: List of matching files sorted by similarity score
@@ -143,21 +165,12 @@ class FileDocumentRepository:
             # Get all files from database
             cursor = self.collection.find({})
             all_files = await cursor.to_list(length=None)
+            all_documents = [FileDocument(**file_doc) for file_doc in all_files]
 
-            matches = []
-            for file_doc in all_files:
-                file_obj = FileDocument(**file_doc)
-                # Calculate similarity between search term and filename
-                similarity = SequenceMatcher(None, filename.lower(), file_obj.filename.lower()).ratio()
-
-                if similarity >= similarity_threshold:
-                    matches.append((file_obj, similarity))
+            if isinstance(matching_method, FuzzyMatching):
+                return fuzzy_matching(filename, all_documents, matching_method.threshold)
 
-            # Sort by similarity score (highest first)
-            matches.sort(key=lambda x: x[1], reverse=True)
-
-            # Return only the FileDocument objects
-            return [match[0] for match in matches]
+            return subsequence_matching(filename, all_documents)
 
         except PyMongoError:
             return []
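With the refactor above, the repository delegates scoring to the new ducment_matching module and falls back to subsequence matching whenever no FuzzyMatching instance is passed. A hedged usage sketch (repository construction and the import of FuzzyMatching from the repository module are assumed; only the call shapes come from this diff):

```python
# Inside some async service code that already has a FileDocumentRepository instance
# and has imported FuzzyMatching from the repository module.
async def search_examples(repo):
    # Default path: subsequence matching, e.g. "rpt" can match "report.pdf".
    by_subsequence = await repo.find_document_by_name("rpt")

    # Explicit fuzzy matching with a custom similarity threshold.
    by_fuzzy = await repo.find_document_by_name("report", matching_method=FuzzyMatching(threshold=0.7))

    return by_subsequence, by_fuzzy
```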
@@ -34,6 +34,14 @@ class UserRepository:
         self.collection: AsyncIOMotorCollection = database.users
         self._ensure_indexes()
 
+    async def initialize(self):
+        """
+        Initialize repository by ensuring required indexes exist.
+
+        Should be called after repository instantiation to set up database indexes.
+        """
+        await self._ensure_indexes()
+
     async def _ensure_indexes(self):
         """
         Ensure required database indexes exist.
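Both repositories now expose the same initialize() hook, so index creation can be awaited once at application startup instead of relying on the un-awaited call in __init__. A minimal startup sketch (constructor arguments and the wiring function are assumptions, not shown in this diff):

```python
# Hypothetical application startup wiring; constructor signatures are assumed.
async def init_repositories(database):
    file_repo = FileDocumentRepository(database)
    user_repo = UserRepository(database)

    # Await index creation explicitly via the new initialize() hook.
    await file_repo.initialize()
    await user_repo.initialize()

    return file_repo, user_repo
```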
src/file-processor/app/utils/ducment_matching.py (new normal file, 60 additions)
@@ -0,0 +1,60 @@
+from difflib import SequenceMatcher
+
+from app.models.document import FileDocument
+
+
+def _is_subsequence(query: str, target: str) -> tuple[bool, float]:
+    """
+    Check if query is a subsequence of target (case-insensitive).
+    Returns (match, score).
+    Score is higher when the query letters are closer together in the target.
+    """
+    query = query.lower()
+    target = target.lower()
+
+    positions = []
+    idx = 0
+
+    for char in query:
+        idx = target.find(char, idx)
+        if idx == -1:
+            return False, 0.0
+        positions.append(idx)
+        idx += 1
+
+    # Smallest window containing all matched chars
+    window_size = positions[-1] - positions[0] + 1
+
+    # Score: ratio of query length vs window size (compactness)
+    score = len(query) / window_size
+
+    return True, score
+
+
+def fuzzy_matching(filename: str, documents: list[FileDocument], similarity_threshold: float = 0.7):
+    matches = []
+    for file_doc in documents:
+        # Calculate similarity between search term and filename
+        similarity = SequenceMatcher(None, filename.lower(), file_doc.filename.lower()).ratio()
+
+        if similarity >= similarity_threshold:
+            matches.append((file_doc, similarity))
+
+    # Sort by similarity score (highest first)
+    matches.sort(key=lambda x: x[1], reverse=True)
+
+    # Return only the FileDocument objects
+    return [match[0] for match in matches]
+
+
+def subsequence_matching(query: str, documents: list[FileDocument]):
+    matches = []
+    for file_doc in documents:
+        matched, score = _is_subsequence(query, file_doc.filename)
+        if matched:
+            matches.append((file_doc, score))
+
+    # Sort by score (highest first)
+    matches.sort(key=lambda x: x[1], reverse=True)
+
+    # Return only the FileDocument objects
+    return [match[0] for match in matches]
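To make the scoring concrete: for the query "rpt" against "report.pdf", _is_subsequence matches r at index 0, p at index 2 and t at index 5, so the window size is 6 and the score is 3 / 6 = 0.5; queries whose letters sit closer together in the target score higher. A small usage sketch of the new module (FileDocument's real fields are not shown in this diff, so a stand-in dataclass with just a filename attribute is used):

```python
from dataclasses import dataclass

from app.utils.ducment_matching import _is_subsequence, fuzzy_matching, subsequence_matching

@dataclass
class FakeDocument:
    # Stand-in for app.models.document.FileDocument; only .filename is read
    # by the matching helpers, the real model presumably has more fields.
    filename: str

docs = [FakeDocument("report.pdf"), FakeDocument("receipt.png"), FakeDocument("notes.txt")]

print(_is_subsequence("rpt", "report.pdf"))   # (True, 0.5)
print(subsequence_matching("rpt", docs))      # report.pdf ranks above receipt.png; notes.txt drops out
print(fuzzy_matching("report", docs, 0.5))    # only report.pdf clears the 0.5 threshold here
```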