214 lines
5.9 KiB
Python
214 lines
5.9 KiB
Python
"""Markdown parser for Obsidian vault files.
|
||
|
||
This module provides functionality to parse markdown files and extract
|
||
their structure (sections, line numbers) for semantic search indexing.
|
||
"""
|
||
|
||
import re
|
||
from dataclasses import dataclass
|
||
from pathlib import Path
|
||
from typing import List, Optional
|
||
|
||
|
||
@dataclass
class MarkdownSection:
    """One contiguous section of a markdown document.

    A section is either the text that follows a single header, or
    (``level == 0``) text that does not sit under any header.

    Attributes:
        level: Header level (0 for no header, 1 for #, 2 for ##, etc.).
        title: Section title (empty string if level=0).
        content: Text content without the header line.
        parents: Header titles recorded as this section's parents. The
            list is filled in by the code that builds the section
            (presumably outermost first -- confirm against the producer).
        start_line: Line number where the section starts (1-indexed).
        end_line: Line number where the section ends (1-indexed, inclusive).
    """

    level: int
    title: str
    content: str
    # Spelled with typing.List like the other annotations in this module,
    # which also keeps the class importable on interpreters without
    # subscriptable builtin generics.
    parents: List[str]
    start_line: int
    end_line: int
|
||
|
||
|
||
@dataclass
class ParsedDocument:
    """Represents a parsed markdown document.

    Attributes:
        file_path: Path to the markdown file.
        title: Title of the document. ``parse_markdown_file`` derives it
            from the file's path rather than from the file's content.
        sections: List of sections extracted from the document.
        raw_content: Full file content as string.
    """

    file_path: Path
    title: str
    sections: List[MarkdownSection]
    raw_content: str
|
||
|
||
|
||
def _compute_parents(current_parents, previous_level, previous_title, current_level):
|
||
"""Computes the parents of `current_parents`."""
|
||
return current_parents
|
||
|
||
|
||
# ATX header: one to six '#', whitespace, then the title text.
_HEADER_PATTERN = re.compile(r"^(#{1,6})\s+(.+)$")
# Code-fence line: up to three spaces of indent, then a run of >= 3 backticks
# or tildes, then an optional info string.
_FENCE_PATTERN = re.compile(r"^ {0,3}(`{3,}|~{3,})(.*)$")


def _advance_fence(line, open_fence):
    """Return the code-fence marker that is still open after ``line``.

    Args:
        line: The line being scanned (without its line ending).
        open_fence: Marker text of the fence opened earlier, or None when
            the scan is currently outside any fenced code block.

    Returns:
        The marker of the fence that remains open, or None if no fence is
        open once ``line`` has been consumed.
    """
    found = _FENCE_PATTERN.match(line)
    if found is None:
        return open_fence

    marker, trailer = found.group(1), found.group(2)
    if open_fence is None:
        # A backtick run followed by more backticks on the same line is an
        # inline code span, not the start of a fenced block.
        if marker[0] == "`" and "`" in trailer:
            return None
        return marker

    # A fence closes only on the same character, at least as long as the
    # opener, with nothing but whitespace after it.
    closes = (
        marker[0] == open_fence[0]
        and len(marker) >= len(open_fence)
        and not trailer.strip()
    )
    return None if closes else open_fence


def parse_markdown_file(file_path: Path, vault_path=None) -> ParsedDocument:
    """Parse a markdown file and extract its structure.

    The file is split at its ATX headers (``#`` .. ``######``). Each section
    records its header level, title, body text and 1-indexed line span.
    Header-looking lines inside fenced code blocks (``` or ~~~) are treated
    as ordinary content. Text before the first header becomes a level-0
    section; a file without headers is a single level-0 section, and an
    empty file yields one empty level-0 section spanning line 1.

    A section that has no lines at all between its header and the next
    header is not emitted; the last section of the file is always emitted.

    Args:
        file_path: Path to the markdown file to parse.
        vault_path: Optional vault root. When given (and truthy) the
            document title is the file's path relative to it, without the
            trailing ``.md`` and with path separators replaced by spaces.
            Otherwise the title is the file name without its suffix.

    Returns:
        ParsedDocument containing the file structure and content.

    Raises:
        FileNotFoundError: If the file does not exist.
        ValueError: If ``vault_path`` is given and ``file_path`` is not
            located under it.
        UnicodeDecodeError: If the file is not valid UTF-8.

    Example:
        >>> doc = parse_markdown_file(Path("notes/example.md"))
        >>> print(f"Found {len(doc.sections)} sections")
        >>> print(doc.sections[0].title)
    """
    if not file_path.exists():
        raise FileNotFoundError(f"File not found: {file_path}")

    if vault_path:
        relative = str(file_path.relative_to(vault_path))
        # Strip only the trailing extension: a blanket replace would also
        # remove ".md" from the middle of a folder or file name.
        title = relative.removesuffix(".md")
        title = title.replace("\\", " ").replace("/", " ")
    else:
        title = file_path.stem

    raw_content = file_path.read_text(encoding="utf-8")
    lines = raw_content.splitlines()

    sections: List[MarkdownSection] = []
    current_section_start = 1
    current_level = 0
    current_title = ""
    current_parents = []
    current_content_lines: List[str] = []
    open_fence = None

    for line_num, line in enumerate(lines, start=1):
        # Decide with the fence state from *before* this line, so the
        # closing fence line itself still counts as code.
        inside_code = open_fence is not None
        open_fence = _advance_fence(line, open_fence)
        match = None if inside_code else _HEADER_PATTERN.match(line)

        if match is None:
            current_content_lines.append(line)
            continue

        # Save the previous section only if it actually has content.
        if current_content_lines:
            sections.append(
                MarkdownSection(
                    level=current_level,
                    title=current_title,
                    content="\n".join(current_content_lines),
                    # Copy so that sections never share one mutable list.
                    parents=list(current_parents),
                    start_line=current_section_start,
                    end_line=line_num - 1,
                )
            )

        # Start a new section with the detected header.
        previous_level, previous_title = current_level, current_title
        current_level = len(match.group(1))
        current_title = match.group(2).strip()
        current_section_start = line_num
        current_parents = _compute_parents(
            current_parents, previous_level, previous_title, current_level
        )
        current_content_lines = []

    if lines:
        # Final section. When no header was ever seen the tracking variables
        # still hold their initial values (level 0, empty title, start line
        # 1), so the header-less file needs no separate branch.
        sections.append(
            MarkdownSection(
                level=current_level,
                title=current_title,
                content="\n".join(current_content_lines),
                parents=list(current_parents),
                start_line=current_section_start,
                end_line=len(lines),
            )
        )
    else:
        # Empty file: create a single empty level-0 section.
        sections.append(
            MarkdownSection(
                level=0,
                title="",
                content="",
                parents=[],
                start_line=1,
                end_line=1,
            )
        )

    return ParsedDocument(
        file_path=file_path,
        title=title,
        sections=sections,
        raw_content=raw_content,
    )
|
||
|
||
|
||
def find_section_at_line(
    document: ParsedDocument,
    line_number: int,
) -> Optional[MarkdownSection]:
    """Look up the section whose line span covers ``line_number``.

    Sections are examined in document order and the first one whose
    ``start_line``..``end_line`` range (both inclusive) contains the line
    is returned.

    Args:
        document: Parsed markdown document to search.
        line_number: Line number to locate (1-indexed).

    Returns:
        The matching MarkdownSection, or None when ``line_number`` is
        below 1 or no section covers it.

    Example:
        >>> section = find_section_at_line(doc, 42)
        >>> if section:
        ...     print(f"Line 42 is in section: {section.title}")
    """
    # Line numbers are 1-indexed, so anything below 1 cannot match.
    if line_number < 1:
        return None

    covering = (
        candidate
        for candidate in document.sections
        if candidate.start_line <= line_number <= candidate.end_line
    )
    return next(covering, None)
|