from difflib import SequenceMatcher from typing import Any def _is_subsequence(query: str, target: str) -> tuple[bool, float]: """ Determines if a query string is a subsequence of a target string and calculates a score based on the compactness of the match. The match is case-insensitive. The function iterates through each character of the query and checks if it exists in the target string while maintaining the order. If all characters of the query are found in order, it calculates a score based on the smallest window in the target that contains all the matched characters. :param query: The query string to check as a subsequence. :param target: The target string in which to find the subsequence. :return: A tuple where the first value is a boolean indicating if a valid subsequence exists, and the second value is a float representing the compactness score of the match. :rtype: tuple[bool, float] """ query = query.lower() target = target.lower() positions = [] idx = 0 for char in query: idx = target.find(char, idx) if idx == -1: return False, 0.0 positions.append(idx) idx += 1 # Smallest window containing all matched chars window_size = positions[-1] - positions[0] + 1 # Score: ratio of query length vs window size (compactness) score = len(query) / window_size return True, score def fuzzy_matching(query: str, choices: list[Any], similarity_threshold: float = 0.7, get_attr=None): """ Perform fuzzy matching on a list of items to find the items that are similar to the given query based on a similarity threshold. :param query: The search query to be matched, provided as a string. :param choices: A list of strings representing the items to be compared against the query. :param similarity_threshold: A float value representing the minimum similarity score (between 0 and 1) an item needs to achieve to be considered a match. Defaults to 0.7. :param get_attr: When choice is a object, give the property to use :return: A list of strings containing the items from the input list that meet or exceed the similarity threshold, sorted in descending order of similarity. """ get_attr = get_attr or (lambda x: x) matches = [] for file_doc in choices: # Calculate similarity between search term and filename similarity = SequenceMatcher(None, query.lower(), get_attr(file_doc).lower()).ratio() if similarity >= similarity_threshold: matches.append((file_doc, similarity)) # Sort by similarity score (highest first) matches.sort(key=lambda x: x[1], reverse=True) # Return only the FileDocument objects return [match[0] for match in matches] def subsequence_matching(query: str, choices: list[Any], get_attr=None): get_attr = get_attr or (lambda x: x) matches = [] for item in choices: matched, score = _is_subsequence(query, get_attr(item)) if matched: matches.append((item, score)) # Sort by score (highest first) matches.sort(key=lambda x: x[1], reverse=True) # Return only the FileDocument objects return [match[0] for match in matches]