Files
MyDocManager/src/file-processor/app/utils/document_matching.py

61 lines
1.8 KiB
Python

from difflib import SequenceMatcher
from app.models.document import FileDocument
def _is_subsequence(query: str, target: str) -> tuple[bool, float]:
"""
Check if query is a subsequence of target (case-insensitive).
Returns (match, score).
Score is higher when the query letters are closer together in the target.
"""
query = query.lower()
target = target.lower()
positions = []
idx = 0
for char in query:
idx = target.find(char, idx)
if idx == -1:
return False, 0.0
positions.append(idx)
idx += 1
# Smallest window containing all matched chars
window_size = positions[-1] - positions[0] + 1
# Score: ratio of query length vs window size (compactness)
score = len(query) / window_size
return True, score
def fuzzy_matching(filename: str, documents: list[FileDocument], similarity_threshold: float = 0.7):
    """
    Return the documents whose filename is similar to the search term.

    Similarity is the difflib SequenceMatcher ratio between the lower-cased
    search term and each document's lower-cased filename. Documents scoring
    at or above similarity_threshold are kept and returned best match
    first; the scores themselves are not returned. Documents with equal
    scores keep their input order.
    """
    # Lazily pair every candidate with its similarity to the search term.
    scored = (
        (
            candidate,
            SequenceMatcher(None, filename.lower(), candidate.filename.lower()).ratio(),
        )
        for candidate in documents
    )

    # Drop everything below the threshold.
    kept = [pair for pair in scored if pair[1] >= similarity_threshold]

    # Highest similarity first (stable for ties).
    ranked = sorted(kept, key=lambda pair: pair[1], reverse=True)

    # Strip the scores: callers only get the documents.
    return [candidate for candidate, _ in ranked]
def subsequence_matching(query: str, documents: list[FileDocument]):
    """
    Return the documents whose filename contains query as a subsequence.

    Each filename is checked with _is_subsequence, which reports whether it
    matched and a score. Matching documents are returned ordered by that
    score, highest first; the scores themselves are not returned.
    Documents with equal scores keep their input order.
    """
    # Lazily pair every candidate with its (matched, score) result.
    evaluated = (
        (candidate, _is_subsequence(query, candidate.filename))
        for candidate in documents
    )

    # Keep only the candidates that matched, alongside their score.
    hits = [(candidate, score) for candidate, (matched, score) in evaluated if matched]

    # Highest score first (stable for ties), then drop the scores.
    ranked = sorted(hits, key=lambda hit: hit[1], reverse=True)
    return [candidate for candidate, _ in ranked]