Fixed unit tests
@@ -12,6 +12,20 @@ from difflib import SequenceMatcher
 from motor.motor_asyncio import AsyncIOMotorCollection
 from app.models.document import FileDocument
 from app.database.connection import get_database
+from app.utils.ducment_matching import fuzzy_matching, subsequence_matching
+
+
+class MatchMethodBase:
+    pass
+
+
+class SubsequenceMatching(MatchMethodBase):
+    pass
+
+
+class FuzzyMatching(MatchMethodBase):
+    def __init__(self, threshold: float = 0.6):
+        self.threshold = threshold
 
 
 class FileDocumentRepository:
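For orientation, the classes added above are lightweight strategy markers: FuzzyMatching carries a similarity threshold, while MatchMethodBase and SubsequenceMatching hold no state, and the later hunk shows find_document_by_name simply branching on isinstance. A minimal, standalone sketch of that dispatch pattern (class names reused from this diff; the sketch itself is not part of the commit):

```python
class MatchMethodBase:
    pass

class SubsequenceMatching(MatchMethodBase):
    pass

class FuzzyMatching(MatchMethodBase):
    def __init__(self, threshold: float = 0.6):
        self.threshold = threshold

def describe(method: MatchMethodBase | None) -> str:
    # Mirrors the repository's branching: fuzzy matching when explicitly requested,
    # subsequence matching otherwise (including when method is None).
    if isinstance(method, FuzzyMatching):
        return f"fuzzy (threshold={method.threshold})"
    return "subsequence"

print(describe(FuzzyMatching(0.8)))  # fuzzy (threshold=0.8)
print(describe(None))                # subsequence
```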
@@ -28,6 +42,14 @@ class FileDocumentRepository:
         self.collection: AsyncIOMotorCollection = self.db.files
         self._ensure_indexes()
 
+    async def initialize(self):
+        """
+        Initialize repository by ensuring required indexes exist.
+
+        Should be called after repository instantiation to set up database indexes.
+        """
+        await self._ensure_indexes()
+
     async def _ensure_indexes(self):
         """
         Ensure required database indexes exist.
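The hunk above adds an explicit initialize() entry point; note that the bare self._ensure_indexes() call in __init__ creates a coroutine without awaiting it, which is presumably why the awaited initialize() path exists. What _ensure_indexes actually creates is not shown in this diff; a plausible sketch using Motor's create_index (the index fields here are assumptions, inferred only from the duplicate-key messages later in the diff):

```python
from motor.motor_asyncio import AsyncIOMotorCollection

async def _ensure_indexes(collection: AsyncIOMotorCollection) -> None:
    # Unique indexes so inserting the same file twice raises DuplicateKeyError
    # (field names are assumed, not taken from the commit).
    await collection.create_index("file_hash", unique=True)
    await collection.create_index("file_path", unique=True)
    # Non-unique index to keep filename lookups fast.
    await collection.create_index("filename")
```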
@@ -64,7 +86,7 @@ class FileDocumentRepository:
             return file_data
 
         except DuplicateKeyError as e:
-            raise DuplicateKeyError(f"File with same hash already exists: {e}")
+            raise DuplicateKeyError(f"File with same file path already exists: {e}")
         except PyMongoError as e:
             raise ValueError(f"Failed to create file document: {e}")
 
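Because the repository re-raises DuplicateKeyError with a friendlier message and wraps other PyMongoError cases into ValueError, callers only need to handle those two exception types. A hedged caller-side sketch (the create method's name and arguments are assumptions, not shown in this diff):

```python
from pymongo.errors import DuplicateKeyError

async def store_file(repo, file_data: dict):
    # repo is a FileDocumentRepository; create_file_document is a hypothetical name
    # for the create method whose error handling appears in the hunk above.
    try:
        return await repo.create_file_document(file_data)
    except DuplicateKeyError:
        # Same hash or same file path already stored; treat as a no-op here.
        return None
    except ValueError as exc:
        # Any other database failure was wrapped into ValueError by the repository.
        raise RuntimeError(f"Could not store file: {exc}") from exc
```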
@@ -128,13 +150,13 @@ class FileDocumentRepository:
         except PyMongoError:
             return None
 
-    async def find_document_by_name(self, filename: str, similarity_threshold: float = 0.6) -> List[FileDocument]:
+    async def find_document_by_name(self, filename: str, matching_method: MatchMethodBase = None) -> List[FileDocument]:
         """
         Find file documents by filename using fuzzy matching.
 
         Args:
             filename (str): Filename to search for
-            similarity_threshold (float): Minimum similarity ratio (0.0 to 1.0)
+            matching_method (MatchMethodBase): Matching strategy to apply; defaults to subsequence matching when not a FuzzyMatching instance
 
         Returns:
             List[FileDocument]: List of matching files sorted by similarity score
@@ -143,21 +165,12 @@ class FileDocumentRepository:
             # Get all files from database
             cursor = self.collection.find({})
             all_files = await cursor.to_list(length=None)
+            all_documents = [FileDocument(**file_doc) for file_doc in all_files]
 
-            matches = []
-            for file_doc in all_files:
-                file_obj = FileDocument(**file_doc)
-                # Calculate similarity between search term and filename
-                similarity = SequenceMatcher(None, filename.lower(), file_obj.filename.lower()).ratio()
-
-                if similarity >= similarity_threshold:
-                    matches.append((file_obj, similarity))
+            if isinstance(matching_method, FuzzyMatching):
+                return fuzzy_matching(filename, all_documents, matching_method.threshold)
 
-            # Sort by similarity score (highest first)
-            matches.sort(key=lambda x: x[1], reverse=True)
-
-            # Return only the FileDocument objects
-            return [match[0] for match in matches]
+            return subsequence_matching(filename, all_documents)
 
         except PyMongoError:
             return []
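With the refactor above, the repository delegates scoring to the new ducment_matching module and falls back to subsequence matching whenever no FuzzyMatching instance is passed. A hedged usage sketch (repository construction and the import of FuzzyMatching from the repository module are assumed; only the call shapes come from this diff):

```python
# Inside some async service code that already has a FileDocumentRepository instance
# and has imported FuzzyMatching from the repository module.
async def search_examples(repo):
    # Default path: subsequence matching, e.g. "rpt" can match "report.pdf".
    by_subsequence = await repo.find_document_by_name("rpt")

    # Explicit fuzzy matching with a custom similarity threshold.
    by_fuzzy = await repo.find_document_by_name("report", matching_method=FuzzyMatching(threshold=0.7))

    return by_subsequence, by_fuzzy
```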
@@ -34,6 +34,14 @@ class UserRepository:
         self.collection: AsyncIOMotorCollection = database.users
         self._ensure_indexes()
 
+    async def initialize(self):
+        """
+        Initialize repository by ensuring required indexes exist.
+
+        Should be called after repository instantiation to set up database indexes.
+        """
+        await self._ensure_indexes()
+
     async def _ensure_indexes(self):
         """
         Ensure required database indexes exist.
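Both repositories now expose the same initialize() hook, so index creation can be awaited once at application startup instead of relying on the un-awaited call in __init__. A minimal startup sketch (constructor arguments and the wiring function are assumptions, not shown in this diff):

```python
# Hypothetical application startup wiring; constructor signatures are assumed.
async def init_repositories(database):
    file_repo = FileDocumentRepository(database)
    user_repo = UserRepository(database)

    # Await index creation explicitly via the new initialize() hook.
    await file_repo.initialize()
    await user_repo.initialize()

    return file_repo, user_repo
```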
src/file-processor/app/utils/ducment_matching.py (new normal file, 60 additions)
@@ -0,0 +1,60 @@
+from difflib import SequenceMatcher
+
+from app.models.document import FileDocument
+
+
+def _is_subsequence(query: str, target: str) -> tuple[bool, float]:
+    """
+    Check if query is a subsequence of target (case-insensitive).
+    Returns (match, score).
+    Score is higher when the query letters are closer together in the target.
+    """
+    query = query.lower()
+    target = target.lower()
+
+    positions = []
+    idx = 0
+
+    for char in query:
+        idx = target.find(char, idx)
+        if idx == -1:
+            return False, 0.0
+        positions.append(idx)
+        idx += 1
+
+    # Smallest window containing all matched chars
+    window_size = positions[-1] - positions[0] + 1
+
+    # Score: ratio of query length vs window size (compactness)
+    score = len(query) / window_size
+
+    return True, score
+
+
+def fuzzy_matching(filename: str, documents: list[FileDocument], similarity_threshold: float = 0.7):
+    matches = []
+    for file_doc in documents:
+        # Calculate similarity between search term and filename
+        similarity = SequenceMatcher(None, filename.lower(), file_doc.filename.lower()).ratio()
+
+        if similarity >= similarity_threshold:
+            matches.append((file_doc, similarity))
+
+    # Sort by similarity score (highest first)
+    matches.sort(key=lambda x: x[1], reverse=True)
+
+    # Return only the FileDocument objects
+    return [match[0] for match in matches]
+
+
+def subsequence_matching(query: str, documents: list[FileDocument]):
+    matches = []
+    for file_doc in documents:
+        matched, score = _is_subsequence(query, file_doc.filename)
+        if matched:
+            matches.append((file_doc, score))
+
+    # Sort by score (highest first)
+    matches.sort(key=lambda x: x[1], reverse=True)
+
+    # Return only the FileDocument objects
+    return [match[0] for match in matches]
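To make the scoring concrete: for the query "rpt" against "report.pdf", _is_subsequence matches r at index 0, p at index 2 and t at index 5, so the window size is 6 and the score is 3 / 6 = 0.5; queries whose letters sit closer together in the target score higher. A small usage sketch of the new module (FileDocument's real fields are not shown in this diff, so a stand-in dataclass with just a filename attribute is used):

```python
from dataclasses import dataclass

from app.utils.ducment_matching import _is_subsequence, fuzzy_matching, subsequence_matching

@dataclass
class FakeDocument:
    # Stand-in for app.models.document.FileDocument; only .filename is read
    # by the matching helpers, the real model presumably has more fields.
    filename: str

docs = [FakeDocument("report.pdf"), FakeDocument("receipt.png"), FakeDocument("notes.txt")]

print(_is_subsequence("rpt", "report.pdf"))   # (True, 0.5)
print(subsequence_matching("rpt", docs))      # report.pdf ranks above receipt.png; notes.txt drops out
print(fuzzy_matching("report", docs, 0.5))    # only report.pdf clears the 0.5 threshold here
```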