86 lines
3.1 KiB
Python
86 lines
3.1 KiB
Python
from difflib import SequenceMatcher
|
|
from typing import Any
|
|
|
|
|
|
def _is_subsequence(query: str, target: str) -> tuple[bool, float]:
|
|
"""
|
|
Determines if a query string is a subsequence of a target string and calculates
|
|
a score based on the compactness of the match. The match is case-insensitive.
|
|
|
|
The function iterates through each character of the query and checks if it
|
|
exists in the target string while maintaining the order. If all characters of
|
|
the query are found in order, it calculates a score based on the smallest
|
|
window in the target that contains all the matched characters.
|
|
|
|
:param query: The query string to check as a subsequence.
|
|
:param target: The target string in which to find the subsequence.
|
|
:return: A tuple where the first value is a boolean indicating if a valid
|
|
subsequence exists, and the second value is a float representing the
|
|
compactness score of the match.
|
|
:rtype: tuple[bool, float]
|
|
"""
|
|
query = query.lower()
|
|
target = target.lower()
|
|
|
|
positions = []
|
|
idx = 0
|
|
|
|
for char in query:
|
|
idx = target.find(char, idx)
|
|
if idx == -1:
|
|
return False, 0.0
|
|
positions.append(idx)
|
|
idx += 1
|
|
|
|
# Smallest window containing all matched chars
|
|
window_size = positions[-1] - positions[0] + 1
|
|
|
|
# Score: ratio of query length vs window size (compactness)
|
|
score = len(query) / window_size
|
|
|
|
return True, score
|
|
|
|
|
|
def fuzzy_matching(query: str, choices: list[Any], similarity_threshold: float = 0.7, get_attr=None):
    """
    Rank the entries of ``choices`` by how closely their text resembles ``query``.

    Each entry is compared case-insensitively against the query using
    ``difflib.SequenceMatcher``; entries whose ratio falls below
    ``similarity_threshold`` are dropped.

    :param query: The search text to compare every choice against.
    :param choices: The candidate items. They may be plain strings, or arbitrary
        objects when ``get_attr`` is supplied.
    :param similarity_threshold: Minimum ratio (between 0 and 1) a choice must
        reach to be kept. Defaults to 0.7.
    :param get_attr: Optional callable that extracts the string to compare from
        a choice. When omitted (or falsy) the choice itself is used.
    :return: The original items from ``choices`` that meet or exceed the
        threshold, ordered from most to least similar. Items with equal ratios
        keep their relative input order.
    """
    extract = get_attr if get_attr else (lambda value: value)

    # Keep (item, ratio) pairs only for candidates that clear the threshold.
    kept = [
        (candidate, ratio)
        for candidate in choices
        if (
            ratio := SequenceMatcher(
                a=query.lower(),
                b=extract(candidate).lower(),
            ).ratio()
        )
        >= similarity_threshold
    ]

    # Highest ratio first; sorted() is stable, so ties stay in input order.
    ranked = sorted(kept, key=lambda pair: pair[1], reverse=True)

    # Strip the ratios and hand back just the items.
    return [candidate for candidate, _ in ranked]
|
|
|
|
|
|
def subsequence_matching(query: str, choices: list[Any], get_attr=None):
    """
    Select the entries of ``choices`` that ``_is_subsequence`` reports as a match.

    :param query: The text whose characters must be found, in order, in a choice.
    :param choices: The candidate items. They may be plain strings, or arbitrary
        objects when ``get_attr`` is supplied.
    :param get_attr: Optional callable that extracts the string to test from a
        choice. When omitted (or falsy) the choice itself is used.
    :return: The original items from ``choices`` that matched, ordered by the
        score returned from ``_is_subsequence`` (highest first). Items with
        equal scores keep their relative input order.
    """
    extract = get_attr if get_attr else (lambda value: value)

    # Keep (item, score) pairs for every candidate flagged as a match.
    hits = [
        (candidate, outcome[1])
        for candidate in choices
        if (outcome := _is_subsequence(query, extract(candidate)))[0]
    ]

    # Best score first; sorted() is stable, so ties stay in input order.
    ranked = sorted(hits, key=lambda pair: pair[1], reverse=True)

    # Strip the scores and hand back just the items.
    return [candidate for candidate, _ in ranked]
|