Fixed unit tests

This commit is contained in:
2025-09-19 21:06:09 +02:00
parent c3ea80363f
commit e8b306ac4a
6 changed files with 195 additions and 66 deletions

View File

@@ -12,6 +12,20 @@ from difflib import SequenceMatcher
from motor.motor_asyncio import AsyncIOMotorCollection
from app.models.document import FileDocument
from app.database.connection import get_database
from app.utils.ducment_matching import fuzzy_matching, subsequence_matching
class MatchMethodBase:
    """
    Common base type for the objects that select a filename matching strategy.
    """


class SubsequenceMatching(MatchMethodBase):
    """
    Strategy marker for subsequence-based filename matching.

    Carries no configuration of its own.
    """


class FuzzyMatching(MatchMethodBase):
    """
    Strategy marker for fuzzy (similarity ratio) filename matching.

    Args:
        threshold (float): Minimum similarity ratio a filename has to reach, stored as-is
    """

    def __init__(self, threshold: float = 0.6):
        self.threshold = threshold
class FileDocumentRepository:
@@ -28,6 +42,14 @@ class FileDocumentRepository:
self.collection: AsyncIOMotorCollection = self.db.files
self._ensure_indexes()
async def initialize(self):
    """
    Finish repository setup by creating the required database indexes.

    Await this once after constructing the repository, before relying on the indexes.

    NOTE(review): the constructor appears to call `_ensure_indexes()` without awaiting it;
    because that method is a coroutine function, such a call only creates a coroutine object
    and never runs it, so this method is what actually performs the index setup - confirm.
    """
    await self._ensure_indexes()
async def _ensure_indexes(self):
"""
Ensure required database indexes exist.
@@ -64,7 +86,7 @@ class FileDocumentRepository:
return file_data
except DuplicateKeyError as e:
raise DuplicateKeyError(f"File with same hash already exists: {e}")
raise DuplicateKeyError(f"File with same file path already exists: {e}")
except PyMongoError as e:
raise ValueError(f"Failed to create file document: {e}")
@@ -128,13 +150,13 @@ class FileDocumentRepository:
except PyMongoError:
return None
async def find_document_by_name(self, filename: str, similarity_threshold: float = 0.6) -> List[FileDocument]:
async def find_document_by_name(self, filename: str, matching_method: MatchMethodBase = None) -> List[FileDocument]:
"""
Find file documents by filename using the selected matching method (subsequence matching by default).
Args:
filename (str): Filename to search for
similarity_threshold (float): Minimum similarity ratio (0.0 to 1.0)
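matching_method (MatchMethodBase): Matching strategy; a FuzzyMatching instance selects fuzzy matching with its threshold, anything else (including None) uses subsequence matching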
Returns:
List[FileDocument]: List of matching files sorted by similarity score
@@ -143,21 +165,12 @@ class FileDocumentRepository:
# Get all files from database
cursor = self.collection.find({})
all_files = await cursor.to_list(length=None)
all_documents = [FileDocument(**file_doc) for file_doc in all_files]
matches = []
for file_doc in all_files:
file_obj = FileDocument(**file_doc)
# Calculate similarity between search term and filename
similarity = SequenceMatcher(None, filename.lower(), file_obj.filename.lower()).ratio()
if similarity >= similarity_threshold:
matches.append((file_obj, similarity))
if isinstance(matching_method, FuzzyMatching):
return fuzzy_matching(filename, all_documents, matching_method.threshold)
# Sort by similarity score (highest first)
matches.sort(key=lambda x: x[1], reverse=True)
# Return only the FileDocument objects
return [match[0] for match in matches]
return subsequence_matching(filename, all_documents)
except PyMongoError:
return []

View File

@@ -34,6 +34,14 @@ class UserRepository:
self.collection: AsyncIOMotorCollection = database.users
self._ensure_indexes()
async def initialize(self):
    """
    Finish repository setup by creating the required database indexes.

    Await this once after constructing the repository, before relying on the indexes.

    NOTE(review): the constructor appears to call `_ensure_indexes()` without awaiting it;
    because that method is a coroutine function, such a call only creates a coroutine object
    and never runs it, so this method is what actually performs the index setup - confirm.
    """
    await self._ensure_indexes()
async def _ensure_indexes(self):
"""
Ensure required database indexes exist.

View File

@@ -0,0 +1,60 @@
from difflib import SequenceMatcher
from app.models.document import FileDocument
def _is_subsequence(query: str, target: str) -> tuple[bool, float]:
"""
Check if query is a subsequence of target (case-insensitive).
Returns (match, score).
Score is higher when the query letters are closer together in the target.
"""
query = query.lower()
target = target.lower()
positions = []
idx = 0
for char in query:
idx = target.find(char, idx)
if idx == -1:
return False, 0.0
positions.append(idx)
idx += 1
# Smallest window containing all matched chars
window_size = positions[-1] - positions[0] + 1
# Score: ratio of query length vs window size (compactness)
score = len(query) / window_size
return True, score
def fuzzy_matching(
    filename: str,
    documents: "list[FileDocument]",
    similarity_threshold: float = 0.7,
) -> "list[FileDocument]":
    """
    Return the documents whose filename is similar enough to the search term.

    Similarity is the difflib SequenceMatcher ratio between the lower-cased search term
    and the lower-cased document filename.

    Args:
        filename (str): Search term compared against every document filename
        documents (list[FileDocument]): Candidate documents; only their `filename` is read
        similarity_threshold (float): Minimum similarity ratio (0.0 to 1.0) to keep a document

    Returns:
        list[FileDocument]: Matching documents sorted by similarity (highest first);
        documents with equal similarity keep their input order
    """
    # NOTE(review): the FuzzyMatching strategy object defaults to 0.6 while this default
    # is 0.7 - confirm the difference is intended.

    # Lower-case the search term once instead of once per document
    needle = filename.lower()

    scored = []
    for file_doc in documents:
        # The search term stays the first sequence: ratio() may depend on argument order
        similarity = SequenceMatcher(None, needle, file_doc.filename.lower()).ratio()
        if similarity >= similarity_threshold:
            scored.append((file_doc, similarity))

    # Highest similarity first; the sort is stable, so ties keep input order
    scored.sort(key=lambda pair: pair[1], reverse=True)

    # Return only the FileDocument objects
    return [file_doc for file_doc, _ in scored]
def subsequence_matching(query: str, documents: "list[FileDocument]") -> "list[FileDocument]":
    """
    Return the documents whose filename contains query as a subsequence.

    Each filename is checked and scored with `_is_subsequence`; documents that do not
    match are dropped.

    Args:
        query (str): Characters that must appear, in order, in the filename
        documents (list[FileDocument]): Candidate documents; only their `filename` is read

    Returns:
        list[FileDocument]: Matching documents sorted by score (highest first);
        documents with equal score keep their input order. Empty when query is empty.
    """
    # An empty query has no characters to locate, so nothing can be scored; return early
    # rather than handing the empty string to the scoring helper.
    if not query:
        return []

    scored = []
    for file_doc in documents:
        matched, score = _is_subsequence(query, file_doc.filename)
        if matched:
            scored.append((file_doc, score))

    # Highest score first; the sort is stable, so ties keep input order
    scored.sort(key=lambda pair: pair[1], reverse=True)

    # Return only the FileDocument objects
    return [file_doc for file_doc, _ in scored]