"""Markdown parser for Obsidian vault files. This module provides functionality to parse markdown files and extract their structure (sections, line numbers) for semantic search indexing. """ import re from dataclasses import dataclass from pathlib import Path from typing import List, Optional @dataclass class MarkdownSection: """Represents a section in a markdown document. Attributes: level: Header level (0 for no header, 1 for #, 2 for ##, etc.) title: Section title (empty string if level=0) content: Text content without the header line start_line: Line number where section starts (1-indexed) end_line: Line number where section ends (1-indexed, inclusive) """ level: int title: str content: str parents: list[str] start_line: int end_line: int @dataclass class ParsedDocument: """Represents a parsed markdown document. Attributes: file_path: Path to the markdown file sections: List of sections extracted from the document raw_content: Full file content as string """ file_path: Path title: str sections: List[MarkdownSection] raw_content: str def _compute_parents(current_parents, previous_level, previous_title, current_level): """Computes the parents of `current_parents`.""" return current_parents def parse_markdown_file(file_path: Path, vault_path=None) -> ParsedDocument: """Parse a markdown file and extract its structure. This function reads a markdown file, identifies all header sections, and extracts their content with precise line number tracking. Files without headers are treated as a single section with level 0. Args: file_path: Path to the markdown file to parse vault_path: Path to the vault file. Returns: ParsedDocument containing the file structure and content Raises: FileNotFoundError: If the file does not exist Example: >>> doc = parse_markdown_file(Path("notes/example.md")) >>> print(f"Found {len(doc.sections)} sections") >>> print(doc.sections[0].title) """ if not file_path.exists(): raise FileNotFoundError(f"File not found: {file_path}") if vault_path: title = str(file_path.relative_to(vault_path)).replace(".md", "") title = title.replace("\\", " ").replace("/", " ") else: title = file_path.stem raw_content = file_path.read_text(encoding="utf-8") lines = raw_content.splitlines() sections: List[MarkdownSection] = [] current_section_start = 1 current_level = 0 current_title = "" current_parents = [] current_content_lines: List[str] = [] header_pattern = re.compile(r"^(#{1,6})\s+(.+)$") for line_num, line in enumerate(lines, start=1): match = header_pattern.match(line) if match: # Save the previous section only if it actually has content. if current_content_lines: content = "\n".join(current_content_lines) sections.append( MarkdownSection( level=current_level, title=current_title, content=content, parents=current_parents, start_line=current_section_start, end_line=line_num - 1, ) ) # Start a new section with the detected header. previous_level = current_level previous_title = current_title current_level = len(match.group(1)) current_title = match.group(2).strip() current_section_start = line_num current_parents = _compute_parents(current_parents, previous_level, previous_title, current_level) current_content_lines = [] else: current_content_lines.append(line) # Handle the final section (or whole file if no headers were found). if lines: content = "\n".join(current_content_lines) end_line = len(lines) # Case 1 – no header was ever found. if not sections and current_level == 0: sections.append( MarkdownSection( level=0, title="", content=content, parents=current_parents, start_line=1, end_line=end_line, ) ) # Case 2 – a single header was found (sections empty but we have a title). elif not sections: sections.append( MarkdownSection( level=current_level, title=current_title, content=content, parents=current_parents, start_line=current_section_start, end_line=end_line, ) ) # Case 3 – multiple headers were found (sections already contains earlier ones). else: sections.append( MarkdownSection( level=current_level, title=current_title, content=content, parents=current_parents, start_line=current_section_start, end_line=end_line, ) ) else: # Empty file: create a single empty level‑0 section. sections.append( MarkdownSection( level=0, title="", content="", parents=[], start_line=1, end_line=1, ) ) return ParsedDocument( file_path=file_path, title=title, sections=sections, raw_content=raw_content, ) def find_section_at_line( document: ParsedDocument, line_number: int, ) -> Optional[MarkdownSection]: """Find which section contains a given line number. This function searches through the document's sections to find which section contains the specified line number. Args: document: Parsed markdown document line_number: Line number to search for (1-indexed) Returns: MarkdownSection containing the line, or None if line number is invalid or out of range Example: >>> section = find_section_at_line(doc, 42) >>> if section: ... print(f"Line 42 is in section: {section.title}") """ if line_number < 1: return None for section in document.sections: if section.start_line <= line_number <= section.end_line: return section