Files
MyObsidianAI/obsidian_rag/markdown_parser.py
Kodjo Sossouvi d4925f7969 Initial commit
2025-12-12 11:31:44 +01:00

214 lines
5.9 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""Markdown parser for Obsidian vault files.
This module provides functionality to parse markdown files and extract
their structure (sections, line numbers) for semantic search indexing.
"""
import re
from dataclasses import dataclass
from pathlib import Path
from typing import List, Optional
@dataclass
class MarkdownSection:
    """Represents a section in a markdown document.

    Attributes:
        level: Header level (0 for no header, 1 for #, 2 for ##, etc.)
        title: Section title (empty string if level=0)
        content: Text content without the header line
        parents: Titles associated with the section's enclosing headers,
            as supplied by whoever builds the section
        start_line: Line number where section starts (1-indexed)
        end_line: Line number where section ends (1-indexed, inclusive)
    """

    level: int
    title: str
    content: str
    # typing.List keeps this annotation consistent with the rest of the module
    # and importable on interpreters that predate builtin generics.
    parents: List[str]
    start_line: int
    end_line: int
@dataclass
class ParsedDocument:
    """Result of parsing one markdown file.

    Attributes:
        file_path: Location of the markdown file that was parsed
        title: Human-readable title for the document
        sections: Sections extracted from the document, in file order
        raw_content: Complete text of the file as a single string
    """

    file_path: Path
    title: str
    sections: List[MarkdownSection]
    raw_content: str
def _compute_parents(current_parents, previous_level, previous_title, current_level):
"""Computes the parents of `current_parents`."""
return current_parents
def parse_markdown_file(file_path: Path, vault_path: Optional[Path] = None) -> ParsedDocument:
    """Parse a markdown file and extract its structure.

    This function reads a markdown file, identifies all header sections,
    and extracts their content with precise line number tracking.
    Files without headers are treated as a single section with level 0.

    A section that is closed by a following header is kept only if it has at
    least one non-header line; the last section of the file is always kept.
    Lines inside fenced code blocks (``` or ~~~) are always treated as
    content, so a ``# comment`` inside a code sample does not start a section.

    Args:
        file_path: Path to the markdown file to parse
        vault_path: Optional root of the vault. When given, the document
            title is the path relative to it, without a trailing ".md" and
            with path separators replaced by spaces; otherwise the title is
            the file name without its extension.

    Returns:
        ParsedDocument containing the file structure and content

    Raises:
        FileNotFoundError: If the file does not exist
        ValueError: If vault_path is given and file_path is not located
            under it (raised by Path.relative_to)

    Example:
        >>> doc = parse_markdown_file(Path("notes/example.md"))
        >>> print(f"Found {len(doc.sections)} sections")
        >>> print(doc.sections[0].title)
    """
    if not file_path.exists():
        raise FileNotFoundError(f"File not found: {file_path}")

    if vault_path:
        title = str(file_path.relative_to(vault_path))
        # Strip only the trailing extension: a blanket replace(".md", "")
        # would also mangle names such as "notes.md.old/topic.md".
        if title.endswith(".md"):
            title = title[:-3]
        title = title.replace("\\", " ").replace("/", " ")
    else:
        title = file_path.stem

    raw_content = file_path.read_text(encoding="utf-8")
    lines = raw_content.splitlines()

    sections: List[MarkdownSection] = []
    current_section_start = 1
    current_level = 0
    current_title = ""
    current_parents: List[str] = []
    current_content_lines: List[str] = []

    header_pattern = re.compile(r"^(#{1,6})\s+(.+)$")
    # A fence is a run of 3+ backticks or tildes, indented at most 3 spaces.
    fence_pattern = re.compile(r"^ {0,3}(`{3,}|~{3,})")
    open_fence = ""  # marker that opened the current code block, "" if none

    for line_num, line in enumerate(lines, start=1):
        fence_match = fence_pattern.match(line)

        if open_fence:
            # Inside a code block: only a matching closing fence ends it
            # (same character, at least as long, nothing else on the line).
            if fence_match:
                marker = fence_match.group(1)
                if (
                    marker[0] == open_fence[0]
                    and len(marker) >= len(open_fence)
                    and not line[fence_match.end():].strip()
                ):
                    open_fence = ""
            current_content_lines.append(line)
            continue

        if fence_match:
            marker = fence_match.group(1)
            # A backtick fence may not carry backticks in its info string;
            # such a line is inline code, not the start of a block.
            if marker[0] != "`" or "`" not in line[fence_match.end():]:
                open_fence = marker
            current_content_lines.append(line)
            continue

        match = header_pattern.match(line)
        if not match:
            current_content_lines.append(line)
            continue

        # Save the previous section only if it actually has content.
        if current_content_lines:
            sections.append(
                MarkdownSection(
                    level=current_level,
                    title=current_title,
                    content="\n".join(current_content_lines),
                    # Copy so sections never share one mutable list.
                    parents=list(current_parents),
                    start_line=current_section_start,
                    end_line=line_num - 1,
                )
            )

        # Start a new section with the detected header.
        previous_level = current_level
        previous_title = current_title
        current_level = len(match.group(1))
        current_title = match.group(2).strip()
        current_section_start = line_num
        current_parents = _compute_parents(current_parents, previous_level, previous_title, current_level)
        current_content_lines = []

    if lines:
        # The trailing section covers every case: no header at all (level 0,
        # starting at line 1), a single header, or the last of several.
        sections.append(
            MarkdownSection(
                level=current_level,
                title=current_title,
                content="\n".join(current_content_lines),
                parents=list(current_parents),
                start_line=current_section_start,
                end_line=len(lines),
            )
        )
    else:
        # Empty file: create a single empty level-0 section.
        sections.append(
            MarkdownSection(
                level=0,
                title="",
                content="",
                parents=[],
                start_line=1,
                end_line=1,
            )
        )

    return ParsedDocument(
        file_path=file_path,
        title=title,
        sections=sections,
        raw_content=raw_content,
    )
def find_section_at_line(
    document: ParsedDocument,
    line_number: int,
) -> Optional[MarkdownSection]:
    """Locate the section of a parsed document that covers a line.

    Sections are checked in document order and the first one whose
    inclusive [start_line, end_line] range contains the line is returned.

    Args:
        document: Parsed markdown document
        line_number: Line number to look up (1-indexed)

    Returns:
        The MarkdownSection covering the line, or None when the line
        number is below 1 or no section spans it

    Example:
        >>> section = find_section_at_line(doc, 42)
        >>> if section:
        ...     print(f"Line 42 is in section: {section.title}")
    """
    # Line numbers are 1-indexed, so anything lower can never match.
    if line_number < 1:
        return None

    covering = (
        candidate
        for candidate in document.sections
        if candidate.start_line <= line_number <= candidate.end_line
    )
    return next(covering, None)