# MyObsidianAI/obsidian_rag/markdown_parser.py

"""Markdown parser for Obsidian vault files.

This module provides functionality to parse markdown files and extract
their structure (sections, line numbers) for semantic search indexing.
"""

import re
from dataclasses import dataclass
from pathlib import Path
from typing import List, Optional


@dataclass
class MarkdownSection:
  """Represents a section in a markdown document.

  Attributes:
      level: Header level (0 for no header, 1 for #, 2 for ##, etc.)
      title: Section title (empty string if level=0)
      content: Text content without the header line
      parents: List of strings attached to the section by the parser --
          presumably the titles of the enclosing (ancestor) headers;
          TODO confirm against the code that populates it
      start_line: Line number where section starts (1-indexed); for a
          section with a header this is the header line itself
      end_line: Line number where section ends (1-indexed, inclusive)
  """
  level: int
  title: str
  content: str
  parents: list[str]
  start_line: int
  end_line: int


@dataclass
class ParsedDocument:
  """Represents a parsed markdown document.

  Attributes:
      file_path: Path to the markdown file
      title: Document title; `parse_markdown_file` derives it from the file
          name, or from the vault-relative path when a vault path is given
      sections: List of sections extracted from the document
      raw_content: Full file content as string
  """
  file_path: Path
  title: str
  sections: List[MarkdownSection]
  raw_content: str


def _compute_parents(current_parents, previous_level, previous_title, current_level):
  """Return the parents list to attach to the section that is about to start.

  Placeholder implementation: the incoming `current_parents` object is handed
  back untouched. `previous_level`, `previous_title` and `current_level` are
  accepted but not consulted.
  """
  # No hierarchy is derived here; the caller's list is passed through as-is.
  unchanged_parents = current_parents
  return unchanged_parents


# A fence delimiter of a fenced code block: up to three spaces of indent, a run
# of at least three backticks or tildes, then the remainder of the line.
_FENCE_PATTERN = re.compile(r"^ {0,3}(`{3,}|~{3,})(.*)$")


def _next_fence_state(open_fence, line):
  """Track whether the parser is inside a fenced code block.

  Args:
      open_fence: The fence run (e.g. "```" or "~~~~") that opened the code
          block currently being read, or None when outside any code block.
      line: The line being consumed.

  Returns:
      The fence run that is still open after `line`, or None when the parser
      is outside any fenced code block after `line`.
  """
  match = _FENCE_PATTERN.match(line)
  if match is None:
    return open_fence

  run, rest = match.groups()
  if open_fence is None:
    # A backtick fence may not have backticks in its info string; such a line
    # is inline code rather than an opening fence.
    if run[0] == "`" and "`" in rest:
      return None
    return run

  # A closing fence uses the same character as the opening one, is at least
  # as long, and is followed by nothing but whitespace.
  if run[0] == open_fence[0] and len(run) >= len(open_fence) and not rest.strip():
    return None
  return open_fence


def parse_markdown_file(file_path: Path, vault_path=None) -> ParsedDocument:
  """Parse a markdown file and extract its structure.

  This function reads a markdown file, identifies all header sections,
  and extracts their content with precise line number tracking.
  Files without headers are treated as a single section with level 0.

  Lines inside fenced code blocks (``` or ~~~) are never treated as headers,
  so a `# comment` in a code sample stays part of the enclosing section.

  A header that is directly followed by another header (no lines in between)
  is not emitted as a section. The last section of the file is always
  emitted, even when its content is empty.

  Args:
      file_path: Path to the markdown file to parse
      vault_path: Base path of the vault. When given (and truthy), the
          document title is the path of `file_path` relative to it, without
          the trailing ".md" and with path separators replaced by spaces.
          Otherwise the title is the file name without its extension.

  Returns:
      ParsedDocument containing the file structure and content

  Raises:
      FileNotFoundError: If the file does not exist
      ValueError: If `vault_path` is given and `file_path` is not located
          under it

  Example:
      >>> doc = parse_markdown_file(Path("notes/example.md"))
      >>> print(f"Found {len(doc.sections)} sections")
      >>> print(doc.sections[0].title)
  """
  if not file_path.exists():
    raise FileNotFoundError(f"File not found: {file_path}")

  if vault_path:
    # Strip only the trailing extension; ".md" occurring elsewhere in the
    # relative path (e.g. "my.mdnotes.md") must survive.
    relative = str(file_path.relative_to(vault_path)).removesuffix(".md")
    title = relative.replace("\\", " ").replace("/", " ")
  else:
    title = file_path.stem

  raw_content = file_path.read_text(encoding="utf-8")
  lines = raw_content.splitlines()

  sections: List[MarkdownSection] = []
  current_section_start = 1
  current_level = 0
  current_title = ""
  current_parents = []
  current_content_lines: List[str] = []
  open_fence = None

  header_pattern = re.compile(r"^(#{1,6})\s+(.+)$")

  for line_num, line in enumerate(lines, start=1):
    # Fence delimiter lines can never match the header pattern themselves, so
    # only the state *before* this line decides whether headers are allowed.
    was_in_fence = open_fence is not None
    open_fence = _next_fence_state(open_fence, line)
    match = None if was_in_fence else header_pattern.match(line)

    if match is None:
      current_content_lines.append(line)
      continue

    # Save the previous section only if it has at least one content line.
    if current_content_lines:
      sections.append(
        MarkdownSection(
          level=current_level,
          title=current_title,
          content="\n".join(current_content_lines),
          # Copy so sections never share one mutable list.
          parents=list(current_parents),
          start_line=current_section_start,
          end_line=line_num - 1,
        )
      )

    # Start a new section with the detected header.
    previous_level = current_level
    previous_title = current_title
    current_level = len(match.group(1))
    current_title = match.group(2).strip()
    current_section_start = line_num
    current_parents = _compute_parents(current_parents, previous_level, previous_title, current_level)
    current_content_lines = []

  # Emit the trailing section. This single append covers every case: a file
  # without headers (level 0, start line 1), the section opened by the last
  # header, and an empty file (no lines, reported as spanning line 1 only).
  sections.append(
    MarkdownSection(
      level=current_level,
      title=current_title,
      content="\n".join(current_content_lines),
      parents=list(current_parents),
      start_line=current_section_start,
      end_line=max(len(lines), 1),
    )
  )

  return ParsedDocument(
    file_path=file_path,
    title=title,
    sections=sections,
    raw_content=raw_content,
  )


def find_section_at_line(
    document: ParsedDocument,
    line_number: int,
) -> Optional[MarkdownSection]:
  """Return the section whose line span covers a given line number.

  Sections are examined in document order and the first one whose
  inclusive `start_line`..`end_line` range contains the line wins.

  Args:
      document: Parsed markdown document
      line_number: Line number to look up (1-indexed)

  Returns:
      The matching MarkdownSection, or None when the line number is below 1
      or no section covers it

  Example:
      >>> section = find_section_at_line(doc, 42)
      >>> if section:
      ...     print(f"Line 42 is in section: {section.title}")
  """
  # Line numbers are 1-indexed; anything lower cannot belong to a section.
  if line_number < 1:
    return None

  covering = (
    candidate
    for candidate in document.sections
    if candidate.start_line <= line_number <= candidate.end_line
  )
  return next(covering, None)