from difflib import SequenceMatcher

from app.models.document import FileDocument


def _greedy_match_end(chars: str, target: str, start: int) -> int:
    """
    Match ``chars`` in order inside ``target``, beginning the search at ``start``.

    Each character is matched at its earliest possible position. Returns the
    index one past the last matched character, or -1 if some character cannot
    be found. An empty ``chars`` matches trivially and returns ``start``.
    """
    idx = start
    for char in chars:
        found = target.find(char, idx)
        if found == -1:
            return -1
        idx = found + 1
    return idx


def _is_subsequence(query: str, target: str) -> tuple[bool, float]:
    """
    Check if query is a subsequence of target (case-insensitive).

    Returns (match, score). The score is ``len(query) / window``, where
    ``window`` is the length of the smallest span of ``target`` that contains
    the query as a subsequence, so it is 1.0 when the query letters appear
    contiguously and shrinks as they spread apart.

    An empty query is reported as no match ``(False, 0.0)`` instead of raising.
    """
    query = query.lower()
    target = target.lower()

    # Nothing to search for; without this guard there is no first/last matched
    # position to measure a window from.
    if not query:
        return False, 0.0

    first_char, remainder = query[0], query[1:]
    query_len = len(query)
    best_window = None

    # Try every occurrence of the first query character as the window start.
    # For a fixed start, greedy matching yields the earliest possible end, so
    # the minimum over all starts is the true smallest window.
    start = target.find(first_char)
    while start != -1:
        end = _greedy_match_end(remainder, target, start + 1)
        if end == -1:
            # A later start only has a shorter suffix to match against, so it
            # cannot succeed where this one failed.
            break
        window = end - start
        if best_window is None or window < best_window:
            best_window = window
            if best_window == query_len:
                # Contiguous match; no tighter window exists.
                break
        start = target.find(first_char, start + 1)

    if best_window is None:
        return False, 0.0

    # Score: ratio of query length vs window size (compactness)
    return True, query_len / best_window


def fuzzy_matching(
    filename: str,
    documents: list[FileDocument],
    similarity_threshold: float = 0.7,
) -> list[FileDocument]:
    """
    Return the documents whose filename is similar to ``filename``.

    Similarity is ``difflib.SequenceMatcher(...).ratio()`` computed on the
    lower-cased search term and the lower-cased ``file_doc.filename``. Only
    documents with a ratio of at least ``similarity_threshold`` are kept, and
    they are returned ordered by similarity, highest first.
    """
    # Lower-case the search term once rather than once per document.
    needle = filename.lower()

    matches = []
    for file_doc in documents:
        # Argument order (search term first, filename second) is kept as-is
        # because SequenceMatcher's ratio can depend on it.
        similarity = SequenceMatcher(None, needle, file_doc.filename.lower()).ratio()
        if similarity >= similarity_threshold:
            matches.append((file_doc, similarity))

    # Sort by similarity score (highest first)
    matches.sort(key=lambda pair: pair[1], reverse=True)

    # Return only the FileDocument objects
    return [file_doc for file_doc, _ in matches]


def subsequence_matching(query: str, documents: list[FileDocument]) -> list[FileDocument]:
    """
    Return the documents whose filename contains ``query`` as a subsequence.

    Matching is case-insensitive. Results are ordered by compactness score
    (highest first), i.e. filenames where the query letters sit closest
    together come first. An empty query matches nothing and yields ``[]``.
    """
    matches = []
    for file_doc in documents:
        matched, score = _is_subsequence(query, file_doc.filename)
        if matched:
            matches.append((file_doc, score))

    # Sort by score (highest first)
    matches.sort(key=lambda pair: pair[1], reverse=True)

    # Return only the FileDocument objects
    return [file_doc for file_doc, _ in matches]