Initial commit

2025-12-12 11:31:44 +01:00
commit d4925f7969
21 changed files with 2957 additions and 0 deletions
--- a/obsidian_rag/markdown_parser.py
+++ b/obsidian_rag/markdown_parser.py
@@ -0,0 +1,213 @@
+"""Markdown parser for Obsidian vault files.
+
+This module provides functionality to parse markdown files and extract
+their structure (sections, line numbers) for semantic search indexing.
+"""
+
+import re
+from dataclasses import dataclass
+from pathlib import Path
+from typing import List, Optional
+
+
+@dataclass
+class MarkdownSection:
+  """Represents a section in a markdown document.
+
+  A section is either the text that precedes the first header of a file
+  (level 0) or a header together with the text that follows it, up to the
+  next header.
+
+  Attributes:
+      level: Header level (0 for no header, 1 for #, 2 for ##, etc.)
+      title: Section title (empty string if level=0)
+      content: Text content without the header line
+      parents: List of strings describing the ancestors of this section;
+          presumably the titles of the enclosing headers, outermost first
+          (TODO confirm against the code that fills this field in)
+      start_line: Line number where section starts (1-indexed); for a
+          section with a header this is the header line itself
+      end_line: Line number where section ends (1-indexed, inclusive)
+  """
+  level: int
+  title: str
+  content: str
+  parents: list[str]
+  start_line: int
+  end_line: int
+
+
+@dataclass
+class ParsedDocument:
+  """Represents a parsed markdown document.
+
+  Attributes:
+      file_path: Path to the markdown file
+      title: Document title derived from the file name: the file stem, or,
+          when a vault path is known, the vault-relative path without its
+          ".md" extension and with path separators turned into spaces
+      sections: List of sections extracted from the document, in the order
+          they appear in the file
+      raw_content: Full file content as string
+  """
+  file_path: Path
+  title: str
+  sections: List[MarkdownSection]
+  raw_content: str
+
+
+def _compute_parents(
+    current_parents: List[str],
+    previous_level: int,
+    previous_title: str,
+    current_level: int,
+) -> List[str]:
+  """Compute the ancestor titles of the section that is about to start.
+
+  Headers are visited in document order, so the ancestors of a new section
+  can be derived from the section that immediately precedes it:
+
+  * deeper header  -> the previous section becomes the innermost parent;
+  * same level     -> the new section is a sibling and shares the parents;
+  * shallower      -> one ancestor is dropped per level climbed.
+
+  Args:
+      current_parents: Ancestor titles of the previous section, outermost
+          first. The list is never modified; a new list is returned.
+      previous_level: Header level of the previous section (0 when the
+          previous section is header-less text at the top of the file).
+      previous_title: Title of the previous section.
+      current_level: Header level of the section being started.
+
+  Returns:
+      A new list holding the ancestor titles of the new section,
+      outermost first.
+
+  Note:
+      Only titles are tracked, not the level of each ancestor. The result is
+      exact when headers are nested one level at a time; when a document
+      skips levels (e.g. "#" followed directly by "###") and then climbs
+      back to an intermediate level, an ancestor may be dropped too early.
+  """
+  # Header-less leading text is not a heading, so it can never be a parent.
+  if previous_level <= 0:
+    return []
+
+  # Going deeper: the previous section encloses the new one.
+  if current_level > previous_level:
+    return [*current_parents, previous_title]
+
+  # Same level (drop nothing) or climbing back up (drop one per level).
+  levels_up = previous_level - current_level
+  kept = max(0, len(current_parents) - levels_up)
+  return list(current_parents[:kept])
+
+
+def parse_markdown_file(file_path: Path, vault_path: Optional[Path] = None) -> ParsedDocument:
+  """Parse a markdown file and extract its structure.
+
+  This function reads a markdown file, identifies all header sections,
+  and extracts their content with precise line number tracking.
+  Files without headers are treated as a single section with level 0.
+
+  Lines inside fenced code blocks (``` or ~~~) are always treated as
+  content, so a "# comment" in a code sample does not start a new section.
+  A header that is immediately followed by another header (no content in
+  between) does not produce a section of its own; the last section of the
+  file is always emitted, even when it is empty.
+
+  Args:
+      file_path: Path to the markdown file to parse
+      vault_path: Directory that contains `file_path`. When given, the
+          document title is the path relative to it, without the trailing
+          ".md" and with path separators replaced by spaces; otherwise the
+          title is the file stem.
+
+  Returns:
+      ParsedDocument containing the file structure and content
+
+  Raises:
+      FileNotFoundError: If the file does not exist
+      ValueError: If `vault_path` is given and `file_path` is not located
+          inside it
+
+  Example:
+      >>> doc = parse_markdown_file(Path("notes/example.md"))
+      >>> print(f"Found {len(doc.sections)} sections")
+      >>> print(doc.sections[0].title)
+  """
+  if not file_path.exists():
+    raise FileNotFoundError(f"File not found: {file_path}")
+
+  if vault_path:
+    # Strip only the trailing extension: a blanket replace(".md", "") would
+    # also eat ".md" appearing inside directory or file names.
+    relative = str(file_path.relative_to(vault_path)).removesuffix(".md")
+    title = relative.replace("\\", " ").replace("/", " ")
+  else:
+    title = file_path.stem
+
+  raw_content = file_path.read_text(encoding="utf-8")
+  lines = raw_content.splitlines()
+
+  header_pattern = re.compile(r"^(#{1,6})\s+(.+)$")
+  # Opening code fence: up to three spaces of indent, then a run of at least
+  # three backticks (whose info string may not contain a backtick) or tildes.
+  fence_pattern = re.compile(r"^ {0,3}(?:(`{3,})[^`]*|(~{3,}).*)$")
+
+  sections: List[MarkdownSection] = []
+  current_section_start = 1
+  current_level = 0
+  current_title = ""
+  current_parents: List[str] = []
+  current_content_lines: List[str] = []
+  open_fence = ""  # Marker that opened the current code block, "" if none.
+
+  for line_num, line in enumerate(lines, start=1):
+    if open_fence:
+      # Inside a code block everything is content. The block ends on a line
+      # made only of the fence character, at least as long as the opener.
+      stripped = line.strip()
+      if stripped.startswith(open_fence) and not stripped.strip(open_fence[0]):
+        open_fence = ""
+      current_content_lines.append(line)
+      continue
+
+    fence = fence_pattern.match(line)
+    if fence:
+      open_fence = fence.group(1) or fence.group(2)
+      current_content_lines.append(line)
+      continue
+
+    match = header_pattern.match(line)
+    if not match:
+      current_content_lines.append(line)
+      continue
+
+    # Save the previous section only if it actually has content.
+    if current_content_lines:
+      sections.append(
+        MarkdownSection(
+          level=current_level,
+          title=current_title,
+          content="\n".join(current_content_lines),
+          # Copy so that sections never share one mutable list.
+          parents=list(current_parents),
+          start_line=current_section_start,
+          end_line=line_num - 1,
+        )
+      )
+
+    # Start a new section with the detected header.
+    previous_level, previous_title = current_level, current_title
+    current_level = len(match.group(1))
+    current_title = match.group(2).strip()
+    current_section_start = line_num
+    current_parents = _compute_parents(
+      current_parents, previous_level, previous_title, current_level
+    )
+    current_content_lines = []
+
+  # The section still open at end of file is always emitted. This one append
+  # covers a file without headers (level 0, starts at line 1), the last
+  # headed section, and the empty file, for which the state above is still
+  # the initial level-0 state and end_line is clamped to 1.
+  sections.append(
+    MarkdownSection(
+      level=current_level,
+      title=current_title,
+      content="\n".join(current_content_lines),
+      parents=list(current_parents),
+      start_line=current_section_start,
+      end_line=max(1, len(lines)),
+    )
+  )
+
+  return ParsedDocument(
+    file_path=file_path,
+    title=title,
+    sections=sections,
+    raw_content=raw_content,
+  )
+
+
+def find_section_at_line(
+    document: ParsedDocument,
+    line_number: int,
+) -> Optional[MarkdownSection]:
+  """Return the section of `document` that covers a given line.
+
+  Sections are checked in the order they are stored and the first one
+  whose inclusive [start_line, end_line] range contains the line wins.
+
+  Args:
+      document: Parsed markdown document
+      line_number: Line number to look up (1-indexed)
+
+  Returns:
+      The matching MarkdownSection, or None when `line_number` is smaller
+      than 1 or no section covers it
+
+  Example:
+      >>> section = find_section_at_line(doc, 42)
+      >>> if section:
+      ...     print(f"Line 42 is in section: {section.title}")
+  """
+  if line_number < 1:
+    return None
+
+  covering = (
+    candidate
+    for candidate in document.sections
+    if candidate.start_line <= line_number <= candidate.end_line
+  )
+  return next(covering, None)