61 lines
1.8 KiB
Python
61 lines
1.8 KiB
Python
from difflib import SequenceMatcher
|
|
|
|
from app.models.document import FileDocument
|
|
|
|
|
|
def _is_subsequence(query: str, target: str) -> tuple[bool, float]:
    """
    Check if query is a subsequence of target (case-insensitive).

    Returns (match, score).
    Score is len(query) divided by the length of the shortest stretch of
    target that contains query as a subsequence: 1.0 when the query letters
    are contiguous somewhere in target, approaching 0 as they spread apart.

    An empty query is a subsequence of every target; it is reported as
    (True, 0.0) because there is no window to measure compactness against.
    A query that is not a subsequence yields (False, 0.0).
    """
    query = query.lower()
    target = target.lower()

    # Guard: without this the window computation below has nothing to measure.
    if not query:
        return True, 0.0

    query_len = len(query)
    best_window = None
    search_from = 0

    while True:
        # Forward pass: leftmost match starting at search_from, which gives
        # the earliest possible end for a match beginning at or after it.
        idx = search_from
        for char in query:
            idx = target.find(char, idx)
            if idx == -1:
                break
            idx += 1
        if idx == -1:
            # No further occurrence of the query in the rest of target.
            break
        end = idx  # exclusive end of this match

        # Backward pass: walk the query in reverse from that end to pull the
        # start as far right as possible, giving the tightest window that
        # finishes at `end`. Every rfind succeeds because the forward pass
        # already proved a match exists inside target[search_from:end].
        start = end
        for char in reversed(query):
            start = target.rfind(char, 0, start)

        window_size = end - start
        if best_window is None or window_size < best_window:
            best_window = window_size
        if best_window == query_len:
            # Contiguous occurrence found; no window can be smaller.
            break

        # Any strictly better window must start after this one.
        search_from = start + 1

    if best_window is None:
        return False, 0.0

    # Score: ratio of query length vs window size (compactness)
    return True, query_len / best_window
|
|
|
|
def fuzzy_matching(filename: str, documents: list[FileDocument], similarity_threshold: float = 0.7):
    """
    Return the documents whose filename is similar to the search term.

    Similarity is the difflib SequenceMatcher ratio between the lowercased
    search term and the lowercased document filename. Documents scoring at
    or above similarity_threshold are kept and returned most similar first;
    documents with equal scores keep their input order.
    """
    scored = []
    for candidate in documents:
        # Search term goes first: ratio() may depend on argument order.
        matcher = SequenceMatcher(None, filename.lower(), candidate.filename.lower())
        ratio = matcher.ratio()
        if ratio >= similarity_threshold:
            scored.append((ratio, candidate))

    # Highest ratio first; the sort is stable, so ties stay in input order.
    ranked = sorted(scored, key=lambda pair: pair[0], reverse=True)

    # Drop the scores and hand back only the documents.
    return [candidate for _, candidate in ranked]
|
|
|
|
|
|
def subsequence_matching(query: str, documents: list[FileDocument]):
    """
    Return the documents whose filename contains query as a subsequence.

    Each filename is checked with _is_subsequence; documents it reports as
    non-matching are dropped. The rest are returned ordered by the score
    _is_subsequence gave them, highest first; documents with equal scores
    keep their input order.
    """
    scored = []
    for candidate in documents:
        hit, rank = _is_subsequence(query, candidate.filename)
        if not hit:
            continue
        scored.append((rank, candidate))

    # Highest score first; the sort is stable, so ties stay in input order.
    ranked = sorted(scored, key=lambda pair: pair[0], reverse=True)

    # Drop the scores and hand back only the documents.
    return [candidate for _, candidate in ranked]
|