Initial commit
This commit is contained in:
213
obsidian_rag/markdown_parser.py
Normal file
213
obsidian_rag/markdown_parser.py
Normal file
@@ -0,0 +1,213 @@
|
||||
"""Markdown parser for Obsidian vault files.
|
||||
|
||||
This module provides functionality to parse markdown files and extract
|
||||
their structure (sections, line numbers) for semantic search indexing.
|
||||
"""
|
||||
|
||||
import re
|
||||
from dataclasses import dataclass
|
||||
from pathlib import Path
|
||||
from typing import List, Optional
|
||||
|
||||
|
||||
@dataclass
class MarkdownSection:
    """Represents a section in a markdown document.

    Attributes:
        level: Header level (0 for no header, 1 for #, 2 for ##, etc.)
        title: Section title (empty string if level=0)
        content: Text content without the header line
        parents: List of strings associated with the enclosing headers.
            NOTE(review): presumably the titles of ancestor headers; the
            parser visible in this module never adds entries to it, so it
            is empty for parsed files -- confirm the intended contents.
        start_line: Line number where section starts (1-indexed)
        end_line: Line number where section ends (1-indexed, inclusive)
    """

    level: int
    title: str
    content: str
    parents: list[str]
    start_line: int
    end_line: int
|
||||
|
||||
|
||||
@dataclass
class ParsedDocument:
    """Represents a parsed markdown document.

    Attributes:
        file_path: Path to the markdown file
        title: Document title. ``parse_markdown_file`` derives it from the
            file path: the vault-relative path with path separators turned
            into spaces when a vault path is supplied, otherwise the file
            stem.
        sections: List of sections extracted from the document
        raw_content: Full file content as string
    """

    file_path: Path
    title: str
    sections: List[MarkdownSection]
    raw_content: str
|
||||
|
||||
|
||||
def _compute_parents(current_parents, previous_level, previous_title, current_level):
|
||||
"""Computes the parents of `current_parents`."""
|
||||
return current_parents
|
||||
|
||||
|
||||
def parse_markdown_file(file_path: Path, vault_path=None) -> ParsedDocument:
    """Parse a markdown file and extract its structure.

    This function reads a markdown file, identifies all header sections,
    and extracts their content with precise line number tracking.
    Files without headers are treated as a single section with level 0.

    A section that is followed by another header is recorded only when at
    least one non-header line was collected for it; the last section of the
    file (or the single level-0 section of a header-less or empty file) is
    always recorded.

    NOTE(review): every line matching ``#{1,6}`` followed by whitespace and
    text is treated as a header, including such lines inside fenced code
    blocks -- confirm whether that is intended.

    Args:
        file_path: Path to the markdown file to parse
        vault_path: Optional base path. When given, the document title is
            built from ``file_path`` relative to it (trailing ".md" removed,
            path separators replaced by spaces); otherwise the file stem is
            used.

    Returns:
        ParsedDocument containing the file structure and content

    Raises:
        FileNotFoundError: If the file does not exist
        ValueError: If ``vault_path`` is given and ``file_path`` is not
            located under it (raised by ``Path.relative_to``)

    Example:
        >>> doc = parse_markdown_file(Path("notes/example.md"))
        >>> print(f"Found {len(doc.sections)} sections")
        >>> print(doc.sections[0].title)
    """
    if not file_path.exists():
        raise FileNotFoundError(f"File not found: {file_path}")

    if vault_path:
        title = str(file_path.relative_to(vault_path))
        # Strip only the trailing extension. A blanket replace(".md", "")
        # would also remove ".md" from the middle of directory or file names.
        if title.endswith(".md"):
            title = title[: -len(".md")]
        title = title.replace("\\", " ").replace("/", " ")
    else:
        title = file_path.stem

    raw_content = file_path.read_text(encoding="utf-8")
    lines = raw_content.splitlines()

    sections: List[MarkdownSection] = []
    current_section_start = 1
    current_level = 0
    current_title = ""
    current_parents = []
    current_content_lines: List[str] = []

    header_pattern = re.compile(r"^(#{1,6})\s+(.+)$")

    for line_num, line in enumerate(lines, start=1):
        match = header_pattern.match(line)

        if not match:
            current_content_lines.append(line)
            continue

        # Save the previous section only if it actually has content.
        if current_content_lines:
            sections.append(
                MarkdownSection(
                    level=current_level,
                    title=current_title,
                    content="\n".join(current_content_lines),
                    # Copy so that sections never share one mutable list.
                    parents=list(current_parents),
                    start_line=current_section_start,
                    end_line=line_num - 1,
                )
            )

        # Start a new section with the detected header.
        previous_level = current_level
        previous_title = current_title
        current_level = len(match.group(1))
        current_title = match.group(2).strip()
        current_section_start = line_num
        current_parents = _compute_parents(
            current_parents, previous_level, previous_title, current_level
        )
        current_content_lines = []

    # Final section. The tracked state already describes every case:
    #   * no header seen  -> level 0, empty title, start line 1
    #   * headers seen    -> the last header's level/title/start line
    #   * empty file      -> level 0, empty content; end_line is clamped to 1
    sections.append(
        MarkdownSection(
            level=current_level,
            title=current_title,
            content="\n".join(current_content_lines),
            parents=list(current_parents),
            start_line=current_section_start,
            end_line=max(len(lines), 1),
        )
    )

    return ParsedDocument(
        file_path=file_path,
        title=title,
        sections=sections,
        raw_content=raw_content,
    )
|
||||
|
||||
|
||||
def find_section_at_line(
    document: ParsedDocument,
    line_number: int,
) -> Optional[MarkdownSection]:
    """Return the section whose line range covers ``line_number``.

    Sections are checked in the order they appear in ``document.sections``
    and the first one whose inclusive ``start_line``..``end_line`` range
    contains the line is returned.

    Args:
        document: Parsed markdown document
        line_number: Line number to look up (1-indexed)

    Returns:
        The matching MarkdownSection, or None when ``line_number`` is less
        than 1 or no section's range contains it

    Example:
        >>> section = find_section_at_line(doc, 42)
        >>> if section:
        ...     print(f"Line 42 is in section: {section.title}")
    """
    # Line numbers are 1-indexed, so anything below 1 can never match.
    if line_number < 1:
        return None

    covering = (
        candidate
        for candidate in document.sections
        if candidate.start_line <= line_number <= candidate.end_line
    )
    # First hit wins; fall back to None when nothing covers the line.
    return next(covering, None)
|
||||
Reference in New Issue
Block a user