Working on Formatting DSL completion

2026-01-31 19:09:14 +01:00
parent 778e5ac69d
commit d7ec99c3d9
77 changed files with 7563 additions and 63 deletions

View File

@@ -0,0 +1,84 @@
"""
Base class for DSL definitions.
DSLDefinition provides the interface for defining domain-specific languages
that can be used with the DslEditor control and CodeMirror.
"""
from abc import ABC, abstractmethod
from functools import cached_property
from typing import List, Dict, Any
from myfasthtml.core.dsl.lark_to_lezer import (
lark_to_lezer_grammar,
extract_completions_from_grammar,
)
class DSLDefinition(ABC):
"""
Base class for DSL definitions.
Subclasses must implement get_grammar() to provide the Lark grammar.
The Lezer grammar and completions are automatically derived.
Attributes:
name: Human-readable name of the DSL.
"""
name: str = "DSL"
@abstractmethod
def get_grammar(self) -> str:
"""
Return the Lark grammar string for this DSL.
Returns:
The Lark grammar as a string.
"""
pass
@cached_property
def lezer_grammar(self) -> str:
"""
Return the Lezer grammar derived from the Lark grammar.
This is cached after first computation.
Returns:
The Lezer grammar as a string.
"""
return lark_to_lezer_grammar(self.get_grammar())
@cached_property
def completions(self) -> Dict[str, List[str]]:
"""
Return completion items extracted from the grammar.
This is cached after first computation.
Returns:
Dictionary with completion categories:
- 'keywords': Language keywords (if, not, and, etc.)
- 'operators': Comparison and arithmetic operators
- 'functions': Function-like constructs (style, format, etc.)
- 'types': Type names (number, date, boolean, etc.)
- 'literals': Literal values (True, False, etc.)
"""
return extract_completions_from_grammar(self.get_grammar())
def get_editor_config(self) -> Dict[str, Any]:
"""
Return the configuration for the DslEditor JavaScript initialization.
Returns:
Dictionary with:
- 'lezerGrammar': The Lezer grammar string
- 'completions': The completion items
- 'name': The DSL name
"""
return {
"name": self.name,
"lezerGrammar": self.lezer_grammar,
"completions": self.completions,
}
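
# Illustrative sketch (not part of this commit): a concrete definition might look like the
# following. The subclass, its grammar, and the import path are assumptions for illustration.
from myfasthtml.core.dsl.dsl_definition import DSLDefinition  # assumed module path

class ConditionDSL(DSLDefinition):
    name = "Conditions"

    def get_grammar(self) -> str:
        # Tiny made-up Lark grammar; real grammars would be richer.
        return '''
            scope: "if" NAME OPERATOR value
            value: NAME | NUMBER
            OPERATOR: "==" | "!=" | "contains"
            NAME: /[a-z_]+/
            NUMBER: /[0-9]+/
        '''

dsl = ConditionDSL()
config = dsl.get_editor_config()
# config["name"] == "Conditions"; config["lezerGrammar"] and config["completions"]
# are derived from the grammar above and cached after the first access.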

View File

@@ -0,0 +1,172 @@
"""
Base completion engine for DSL autocompletion.
Provides an abstract base class that specific DSL implementations
can extend to provide context-aware autocompletion.
"""
from abc import ABC, abstractmethod
from typing import Any
from . import utils
from .base_provider import BaseMetadataProvider
from .types import Position, Suggestion, CompletionResult
class BaseCompletionEngine(ABC):
"""
Abstract base class for DSL completion engines.
Subclasses must implement:
- detect_scope(): Find the current scope from previous lines
- detect_context(): Determine what kind of completion is expected
- get_suggestions(): Generate suggestions for the detected context
The main entry point is get_completions(), which orchestrates the flow.
"""
def __init__(self, provider: BaseMetadataProvider):
"""
Initialize the completion engine.
Args:
provider: Metadata provider for context-aware suggestions
"""
self.provider = provider
def get_completions(self, text: str, cursor: Position) -> CompletionResult:
"""
Get autocompletion suggestions for the given cursor position.
This is the main entry point. It:
1. Checks if cursor is in a comment (no suggestions)
2. Detects the current scope (e.g., which column)
3. Detects the completion context (what kind of token is expected)
4. Generates and filters suggestions
Args:
text: The full DSL document text
cursor: Cursor position
Returns:
CompletionResult with suggestions and replacement range
"""
# Get the current line
line = utils.get_line_at(text, cursor.line)
# Check if in comment - no suggestions
if utils.is_in_comment(line, cursor.ch):
return self._empty_result(cursor)
# Find word boundaries for replacement range
word_range = utils.find_word_boundaries(line, cursor.ch)
prefix = line[word_range.start: cursor.ch]
# Detect scope from previous lines
scope = self.detect_scope(text, cursor.line)
# Detect completion context
context = self.detect_context(text, cursor, scope)
# Get suggestions for this context
suggestions = self.get_suggestions(context, scope, prefix)
# Filter suggestions by prefix
if prefix:
suggestions = self._filter_suggestions(suggestions, prefix)
# Build result with correct positions
from_pos = Position(line=cursor.line, ch=word_range.start)
to_pos = Position(line=cursor.line, ch=word_range.end)
return CompletionResult(
from_pos=from_pos,
to_pos=to_pos,
suggestions=suggestions,
)
@abstractmethod
def detect_scope(self, text: str, current_line: int) -> Any:
"""
Detect the current scope by scanning previous lines.
The scope determines which data context we're in (e.g., which column
for column-value suggestions).
Args:
text: The full document text
current_line: Current line number (0-based)
Returns:
Scope object (type depends on the specific DSL)
"""
pass
@abstractmethod
def detect_context(self, text: str, cursor: Position, scope: Any) -> Any:
"""
Detect the completion context at the cursor position.
Analyzes the current line to determine what kind of token
is expected (e.g., keyword, preset name, operator).
Args:
text: The full document text
cursor: Cursor position
scope: The detected scope
Returns:
Context identifier (type depends on the specific DSL)
"""
pass
@abstractmethod
def get_suggestions(self, context: Any, scope: Any, prefix: str) -> list[Suggestion]:
"""
Generate suggestions for the given context.
Args:
context: The detected completion context
scope: The detected scope
prefix: The current word prefix (for filtering)
Returns:
List of suggestions
"""
pass
def _filter_suggestions(
self, suggestions: list[Suggestion], prefix: str
) -> list[Suggestion]:
"""
Filter suggestions by prefix (case-insensitive).
Args:
suggestions: List of suggestions
prefix: Prefix to filter by
Returns:
Filtered list of suggestions
"""
prefix_lower = prefix.lower()
return [s for s in suggestions if s.label.lower().startswith(prefix_lower)]
def _empty_result(self, cursor: Position) -> CompletionResult:
"""
Return an empty completion result.
Args:
cursor: Cursor position
Returns:
CompletionResult with no suggestions
"""
return CompletionResult(
from_pos=cursor,
to_pos=cursor,
suggestions=[],
)
def get_id(self):
return type(self).__name__
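
# Minimal concrete engine sketch (illustrative; the class name, import paths, and the
# context strings below are assumptions, not part of this commit).
from myfasthtml.core.dsl.base_completion_engine import BaseCompletionEngine  # assumed paths
from myfasthtml.core.dsl.types import Position, Suggestion

class StylePresetEngine(BaseCompletionEngine):
    def detect_scope(self, text, current_line):
        return None  # this toy engine needs no scope information

    def detect_context(self, text, cursor, scope):
        line_to_cursor = text.split("\n")[cursor.line][: cursor.ch]
        return "style_preset" if "style(" in line_to_cursor else "keyword"

    def get_suggestions(self, context, scope, prefix):
        if context == "style_preset":
            return [Suggestion(label=p, kind="preset") for p in self.provider.get_style_presets()]
        return [Suggestion(label=k, kind="keyword") for k in ("if", "and", "or", "not")]

# engine = StylePresetEngine(provider=some_metadata_provider)
# engine.get_completions("style(", Position(line=0, ch=6)).to_dict()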

View File

@@ -0,0 +1,38 @@
"""
Base provider protocol for DSL autocompletion.
Defines the minimal interface that metadata providers must implement
to support context-aware autocompletion.
"""
from typing import Protocol
class BaseMetadataProvider(Protocol):
"""
Protocol defining the interface for metadata providers.
Metadata providers give the autocompletion engine access to
context-specific data (e.g., column names, available values).
This is a minimal interface. Specific DSL implementations
can extend this with additional methods.
"""
def get_style_presets(self) -> list[str]:
"""
Return the list of available style preset names.
Returns:
List of style preset names (e.g., ["primary", "error", "success"])
"""
...
def get_format_presets(self) -> list[str]:
"""
Return the list of available format preset names.
Returns:
List of format preset names (e.g., ["EUR", "USD", "percentage"])
"""
...
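
# Since this is a typing.Protocol, any class with matching methods satisfies it structurally;
# no inheritance is required. A minimal in-memory provider (illustrative, not part of this
# commit) might look like:
class StaticMetadataProvider:
    """Serves fixed preset lists, e.g. for tests or simple DSLs."""

    def __init__(self, style_presets: list[str], format_presets: list[str]):
        self._style_presets = list(style_presets)
        self._format_presets = list(format_presets)

    def get_style_presets(self) -> list[str]:
        return list(self._style_presets)

    def get_format_presets(self) -> list[str]:
        return list(self._format_presets)

provider = StaticMetadataProvider(["primary", "error", "success"], ["EUR", "USD", "percentage"])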

View File

@@ -0,0 +1,256 @@
"""
Utilities for converting Lark grammars to Lezer format and extracting completions.
This module provides functions to:
1. Transform a Lark grammar to a Lezer grammar for CodeMirror
2. Extract completion items (keywords, operators, etc.) from a Lark grammar
"""
import re
from typing import Dict, List, Set
def lark_to_lezer_grammar(lark_grammar: str) -> str:
"""
Convert a Lark grammar to a Lezer grammar.
This is a simplified converter that handles common Lark patterns.
Complex grammars may require manual adjustment.
Args:
lark_grammar: The Lark grammar string.
Returns:
The Lezer grammar string.
"""
lines = lark_grammar.strip().split("\n")
lezer_rules = []
tokens = []
for line in lines:
line = line.strip()
# Skip empty lines and comments
if not line or line.startswith("//") or line.startswith("#"):
continue
# Skip Lark-specific directives
if line.startswith("%"):
continue
# Parse rule definitions (lowercase names only)
rule_match = re.match(r"^([a-z_][a-z0-9_]*)\s*:\s*(.+)$", line)
if rule_match:
name, body = rule_match.groups()
lezer_rule = _convert_rule(name, body)
if lezer_rule:
lezer_rules.append(lezer_rule)
continue
# Parse terminal definitions (uppercase names)
terminal_match = re.match(r"^([A-Z_][A-Z0-9_]*)\s*:\s*(.+)$", line)
if terminal_match:
name, pattern = terminal_match.groups()
token = _convert_terminal(name, pattern)
if token:
tokens.append(token)
# Build the Lezer grammar; the @top rule assumes the DSL's entry rule is named `scope`
# (emitted as `Scope` by the rule conversion below)
lezer_output = ["@top Start { Scope+ }", ""]
# Add rules
for rule in lezer_rules:
lezer_output.append(rule)
lezer_output.append("")
lezer_output.append("@tokens {")
# Add tokens
for token in tokens:
lezer_output.append(f" {token}")
# Add common tokens
lezer_output.extend([
' whitespace { $[ \\t]+ }',
' newline { $[\\n\\r] }',
' Comment { "#" ![$\\n]* }',
])
lezer_output.append("}")
lezer_output.append("")
lezer_output.append("@skip { whitespace | Comment }")
return "\n".join(lezer_output)
def _convert_rule(name: str, body: str) -> str:
"""Convert a single Lark rule to Lezer format."""
# Skip internal rules (starting with _)
if name.startswith("_"):
return ""
# Convert rule name to PascalCase for Lezer
lezer_name = _to_pascal_case(name)
# Convert body
lezer_body = _convert_body(body)
if lezer_body:
return f"{lezer_name} {{ {lezer_body} }}"
return ""
def _convert_terminal(name: str, pattern: str) -> str:
"""Convert a Lark terminal to Lezer token format."""
pattern = pattern.strip()
# Handle regex patterns
if pattern.startswith("/") and pattern.endswith("/"):
regex = pattern[1:-1]
# Wrap the pattern naively; complex regexes may need manual adjustment in the generated grammar
return f'{name} {{ ${regex}$ }}'
# Handle string literals
if pattern.startswith('"') or pattern.startswith("'"):
return f'{name} {{ {pattern} }}'
# Handle alternatives (literal strings separated by |)
if "|" in pattern:
alternatives = [alt.strip() for alt in pattern.split("|")]
if all(alt.startswith('"') or alt.startswith("'") for alt in alternatives):
return f'{name} {{ {" | ".join(alternatives)} }}'
return ""
def _convert_body(body: str) -> str:
"""Convert the body of a Lark rule to Lezer format."""
# Remove inline transformations (-> name)
body = re.sub(r"\s*->\s*\w+", "", body)
# Convert alternatives
parts = []
for alt in body.split("|"):
alt = alt.strip()
if alt:
converted = _convert_sequence(alt)
if converted:
parts.append(converted)
return " | ".join(parts)
def _convert_sequence(seq: str) -> str:
"""Convert a sequence of items in a rule."""
items = []
# Tokenize the sequence
tokens = re.findall(
r'"[^"]*"|\'[^\']*\'|/[^/]+/|\([^)]+\)|\[[^\]]+\]|[a-zA-Z_][a-zA-Z0-9_]*|\?|\*|\+',
seq
)
for token in tokens:
if token.startswith('"') or token.startswith("'"):
# String literal
items.append(token)
elif token.startswith("("):
# Group
inner = token[1:-1]
items.append(f"({_convert_body(inner)})")
elif token.startswith("["):
# Optional group in Lark
inner = token[1:-1]
items.append(f"({_convert_body(inner)})?")
elif token in ("?", "*", "+"):
# Quantifiers - attach to previous item
if items:
items[-1] = items[-1] + token
elif token.isupper() or token.startswith("_"):
# Terminal reference
items.append(token)
elif token.islower() or "_" in token:
# Rule reference - convert to PascalCase
items.append(_to_pascal_case(token))
return " ".join(items)
def _to_pascal_case(name: str) -> str:
"""Convert snake_case to PascalCase."""
return "".join(word.capitalize() for word in name.split("_"))
def extract_completions_from_grammar(lark_grammar: str) -> Dict[str, List[str]]:
"""
Extract completion items from a Lark grammar.
Parses the grammar to find:
- Keywords (reserved words like if, not, and)
- Operators (==, !=, contains, etc.)
- Functions (style, format, etc.)
- Types (number, date, boolean, etc.)
- Literals (True, False, etc.)
Args:
lark_grammar: The Lark grammar string.
Returns:
Dictionary with completion categories.
"""
keywords: Set[str] = set()
operators: Set[str] = set()
functions: Set[str] = set()
types: Set[str] = set()
literals: Set[str] = set()
# Find all quoted strings (potential keywords/operators)
quoted_strings = re.findall(r'"([^"]+)"', lark_grammar)
# Also look for terminal definitions with string alternatives (e.g., BOOLEAN: "True" | "False")
terminal_literals = re.findall(r'[A-Z_]+:\s*"([^"]+)"(?:\s*\|\s*"([^"]+)")*', lark_grammar)
for match in terminal_literals:
for literal in match:
if literal:
quoted_strings.append(literal)
for s in quoted_strings:
s_lower = s.lower()
# Classify based on pattern
if s in ("==", "!=", "<=", "<", ">=", ">", "+", "-", "*", "/"):
operators.add(s)
elif s_lower in ("contains", "startswith", "endswith", "in", "between", "isempty", "isnotempty"):
operators.add(s_lower)
elif s_lower in ("if", "not", "and", "or"):
keywords.add(s_lower)
elif s_lower in ("true", "false"):
literals.add(s)
elif s_lower in ("style", "format"):
functions.add(s_lower)
elif s_lower in ("column", "row", "cell", "value", "col"):
keywords.add(s_lower)
elif s_lower in ("number", "date", "boolean", "text", "enum"):
types.add(s_lower)
elif s_lower == "case":
keywords.add(s_lower)
# Find function-like patterns: word "("
function_patterns = re.findall(r'"(\w+)"\s*"?\("', lark_grammar)
for func in function_patterns:
if func.lower() not in ("true", "false"):
functions.add(func.lower())
# Find type patterns from format_type rule
type_match = re.search(r'format_type\s*:\s*(.+?)(?:\n\n|\Z)', lark_grammar, re.DOTALL)
if type_match:
type_strings = re.findall(r'"(\w+)"', type_match.group(1))
types.update(t.lower() for t in type_strings)
return {
"keywords": sorted(keywords),
"operators": sorted(operators),
"functions": sorted(functions),
"types": sorted(types),
"literals": sorted(literals),
}
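
# Illustrative call (the grammar snippet is made up for this example); with this input the
# result should be roughly the dictionary shown below.
from myfasthtml.core.dsl.lark_to_lezer import extract_completions_from_grammar

grammar = '''
scope: "if" NAME OPERATOR value
OPERATOR: "==" | "!=" | "contains"
BOOLEAN: "True" | "False"
'''
extract_completions_from_grammar(grammar)
# {'keywords': ['if'], 'operators': ['!=', '==', 'contains'],
#  'functions': [], 'types': [], 'literals': ['False', 'True']}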

View File

@@ -0,0 +1,103 @@
"""
Base types for DSL autocompletion.
Provides dataclasses for cursor position, suggestions, and completion results
compatible with CodeMirror 5.
"""
from dataclasses import dataclass, field
from typing import Any
@dataclass(frozen=True)
class Position:
"""
Cursor position in a document.
Compatible with CodeMirror 5 position format.
Attributes:
line: 0-based line number
ch: 0-based character position in the line
"""
line: int
ch: int
def to_dict(self) -> dict[str, int]:
"""Convert to CodeMirror-compatible dictionary."""
return {"line": self.line, "ch": self.ch}
@dataclass(frozen=True)
class Suggestion:
"""
A single autocompletion suggestion.
Attributes:
label: The text to display and insert
detail: Optional description shown next to the label
kind: Optional category (e.g., "keyword", "preset", "value")
"""
label: str
detail: str = ""
kind: str = ""
def to_dict(self) -> dict[str, str]:
"""Convert to dictionary for JSON serialization."""
result = {"label": self.label}
if self.detail:
result["detail"] = self.detail
if self.kind:
result["kind"] = self.kind
return result
@dataclass
class CompletionResult:
"""
Result of an autocompletion request.
Compatible with CodeMirror 5 hint format.
Attributes:
from_pos: Start position of the text to replace
to_pos: End position of the text to replace
suggestions: List of completion suggestions
"""
from_pos: Position
to_pos: Position
suggestions: list[Suggestion] = field(default_factory=list)
def to_dict(self) -> dict[str, Any]:
"""Convert to CodeMirror-compatible dictionary."""
return {
"from": self.from_pos.to_dict(),
"to": self.to_pos.to_dict(),
"suggestions": [s.to_dict() for s in self.suggestions],
}
@property
def is_empty(self) -> bool:
"""Return True if there are no suggestions."""
return len(self.suggestions) == 0
@dataclass(frozen=True)
class WordRange:
"""
Range of a word in a line.
Used for determining what text to replace when applying a suggestion.
Attributes:
start: Start character position (inclusive)
end: End character position (exclusive)
text: The word text
"""
start: int
end: int
text: str = ""
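
# Illustrative round-trip (values are arbitrary; import path assumed):
from myfasthtml.core.dsl.types import Position, Suggestion, CompletionResult

result = CompletionResult(
    from_pos=Position(line=0, ch=3),
    to_pos=Position(line=0, ch=8),
    suggestions=[Suggestion(label="style", detail="styling function", kind="function")],
)
result.to_dict()
# {'from': {'line': 0, 'ch': 3}, 'to': {'line': 0, 'ch': 8},
#  'suggestions': [{'label': 'style', 'detail': 'styling function', 'kind': 'function'}]}
result.is_empty  # False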

View File

@@ -0,0 +1,226 @@
"""
Shared utilities for DSL autocompletion.
Provides helper functions for text analysis, word boundary detection,
and other common operations used by completion engines.
"""
from .types import Position, WordRange
# Delimiters used to detect word boundaries
DELIMITERS = set('"\' ()[]{}=,:<>!\t\n\r')
def get_line_at(text: str, line_number: int) -> str:
"""
Get the content of a specific line.
Args:
text: The full document text
line_number: 0-based line number
Returns:
The line content, or empty string if line doesn't exist
"""
lines = text.split("\n")
if 0 <= line_number < len(lines):
return lines[line_number]
return ""
def get_line_up_to_cursor(text: str, cursor: Position) -> str:
"""
Get the content of the current line up to the cursor position.
Args:
text: The full document text
cursor: Cursor position
Returns:
The line content from start to cursor position
"""
line = get_line_at(text, cursor.line)
return line[: cursor.ch]
def get_lines_up_to(text: str, line_number: int) -> list[str]:
"""
Get all lines from start up to and including the specified line.
Args:
text: The full document text
line_number: 0-based line number (inclusive)
Returns:
List of lines from 0 to line_number
"""
lines = text.split("\n")
return lines[: line_number + 1]
def find_word_boundaries(line: str, cursor_ch: int) -> WordRange:
"""
Find the word boundaries around the cursor position.
Uses delimiters to detect where a word starts and ends.
The cursor can be anywhere within the word.
Args:
line: The line content
cursor_ch: Cursor character position in the line
Returns:
WordRange with start, end positions and the word text
"""
if not line or cursor_ch < 0:
return WordRange(start=cursor_ch, end=cursor_ch, text="")
# Clamp cursor position to line length
cursor_ch = min(cursor_ch, len(line))
# Find start of word (scan backwards from cursor)
start = cursor_ch
while start > 0 and line[start - 1] not in DELIMITERS:
start -= 1
# Find end of word (scan forwards from cursor)
end = cursor_ch
while end < len(line) and line[end] not in DELIMITERS:
end += 1
word = line[start:end]
return WordRange(start=start, end=end, text=word)
def get_prefix(line: str, cursor_ch: int) -> str:
"""
Get the word prefix before the cursor.
This is the text from the start of the current word to the cursor.
Args:
line: The line content
cursor_ch: Cursor character position in the line
Returns:
The prefix text
"""
word_range = find_word_boundaries(line, cursor_ch)
# Prefix is from word start to cursor
return line[word_range.start: cursor_ch]
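
# For instance (import path assumed), with the cursor in the middle of a word:
from myfasthtml.core.dsl.utils import find_word_boundaries, get_prefix

line = "if price >= 100"
find_word_boundaries(line, 5)  # WordRange(start=3, end=8, text='price')
get_prefix(line, 5)            # 'pr'
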
def is_in_comment(line: str, cursor_ch: int) -> bool:
"""
Check if the cursor is inside a comment.
A comment starts with # and extends to the end of the line.
Args:
line: The line content
cursor_ch: Cursor character position in the line
Returns:
True if cursor is after a # character
"""
# Find first # that's not inside a string
in_string = False
string_char = None
for i, char in enumerate(line):
if i >= cursor_ch:
break
if char in ('"', "'") and (i == 0 or line[i - 1] != "\\"):
if not in_string:
in_string = True
string_char = char
elif char == string_char:
in_string = False
string_char = None
elif char == "#" and not in_string:
return True
return False
def is_in_string(line: str, cursor_ch: int) -> tuple[bool, str | None]:
"""
Check if the cursor is inside a string literal.
Args:
line: The line content
cursor_ch: Cursor character position in the line
Returns:
Tuple of (is_in_string, quote_char)
quote_char is '"' or "'" if inside a string, None otherwise
"""
in_string = False
string_char = None
for i, char in enumerate(line):
if i >= cursor_ch:
break
if char in ('"', "'") and (i == 0 or line[i - 1] != "\\"):
if not in_string:
in_string = True
string_char = char
elif char == string_char:
in_string = False
string_char = None
return in_string, string_char if in_string else None
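
# Examples (import path assumed):
from myfasthtml.core.dsl.utils import is_in_comment, is_in_string

is_in_comment("style(primary)  # fallback", 20)  # True: the cursor sits after the '#'
is_in_comment('name == "#tag"', 11)              # False: the '#' is inside a string
is_in_string('name == "val', 10)                 # (True, '"'): unterminated string at the cursor
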
def get_indentation(line: str) -> int:
"""
Get the indentation level of a line.
Counts leading spaces (a tab counts as 4 spaces).
Args:
line: The line content
Returns:
Number of leading spaces
"""
count = 0
for char in line:
if char == " ":
count += 1
elif char == "\t":
count += 4
else:
break
return count
def is_indented(line: str) -> bool:
"""
Check if a line is indented (has leading whitespace).
Args:
line: The line content
Returns:
True if line starts with whitespace
"""
return len(line) > 0 and line[0] in (" ", "\t")
def strip_quotes(text: str) -> str:
"""
Remove surrounding quotes from a string.
Args:
text: Text that may be quoted
Returns:
Text without surrounding quotes
"""
if len(text) >= 2:
if (text[0] == '"' and text[-1] == '"') or (text[0] == "'" and text[-1] == "'"):
return text[1:-1]
return text
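
# Examples (import path assumed):
from myfasthtml.core.dsl.utils import get_indentation, is_indented, strip_quotes

get_indentation("\tstyle(primary)")  # 4 (a tab counts as four spaces)
is_indented("if value > 10")         # False
strip_quotes('"EUR"')                # 'EUR'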