# MyDocManager/src/file-processor/app/utils/document_matching.py

from difflib import SequenceMatcher

from app.models.document import FileDocument


def _is_subsequence(query: str, target: str) -> tuple[bool, float]:
  """
  Check if query is a subsequence of target (case-insensitive).

  Characters of query are matched left to right, each one at the earliest
  position in target that comes after the previous match.

  Returns (match, score):
    - match is True when every character of query was found in order.
    - score is len(query) divided by the span from the first to the last
      matched position (inclusive), so 1.0 means the matched characters
      are contiguous. It is 0.0 when there is no match.

  An empty query is vacuously a subsequence of any target and yields
  (True, 0.0): it matches, but carries no compactness information.
  """
  query = query.lower()
  target = target.lower()

  # With nothing to locate there is no first/last position to measure a
  # window from, so answer directly.
  if not query:
    return True, 0.0

  first_pos = -1
  last_pos = -1
  search_from = 0

  for char in query:
    found_at = target.find(char, search_from)
    if found_at == -1:
      return False, 0.0
    if first_pos == -1:
      first_pos = found_at
    last_pos = found_at
    # Continue strictly after this match so characters stay in order
    search_from = found_at + 1

  # Span of this greedy (leftmost) match. A tighter occurrence may exist
  # further along in target; it is not searched for.
  window_size = last_pos - first_pos + 1

  # Score: ratio of query length vs window size (compactness)
  return True, len(query) / window_size

def fuzzy_matching(filename: str, documents: list[FileDocument], similarity_threshold: float = 0.7):
  """
  Rank documents whose filename is similar to the given filename.

  Similarity is the SequenceMatcher ratio between the lowercased search
  term and each document's lowercased filename. Documents scoring below
  similarity_threshold are dropped; the remaining FileDocument objects
  are returned best match first, with ties kept in their input order.
  """
  # Pair every document with its similarity to the search term
  scored = [
    (candidate, SequenceMatcher(None, filename.lower(), candidate.filename.lower()).ratio())
    for candidate in documents
  ]

  # Keep only the pairs that reach the threshold
  kept = [pair for pair in scored if pair[1] >= similarity_threshold]

  # Highest similarity first (sorted is stable, so ties keep input order)
  ranked = sorted(kept, key=lambda pair: pair[1], reverse=True)

  # Strip the scores and hand back the documents only
  return [candidate for candidate, _ in ranked]


def subsequence_matching(query: str, documents: list[FileDocument]):
  """
  Rank documents whose filename contains query as a subsequence.

  Each filename is checked with _is_subsequence. Documents that do not
  match are dropped; the remaining FileDocument objects are returned
  highest score first, with ties kept in their input order.
  """
  # Evaluate every document lazily, keeping the (matched, score) result
  evaluated = ((candidate, _is_subsequence(query, candidate.filename)) for candidate in documents)

  # Retain the matching documents together with their score
  hits = [(candidate, score) for candidate, (matched, score) in evaluated if matched]

  # Highest score first (sorted is stable, so ties keep input order)
  ranked = sorted(hits, key=lambda hit: hit[1], reverse=True)

  # Strip the scores and hand back the documents only
  return [candidate for candidate, _ in ranked]