Added syntax colorization
This commit is contained in:
@@ -9,9 +9,9 @@ from abc import ABC, abstractmethod
|
||||
from functools import cached_property
|
||||
from typing import List, Dict, Any
|
||||
|
||||
# TODO: Replace with lark_to_simple_mode when implemented
|
||||
from myfasthtml.core.dsl.lark_to_lezer import (
|
||||
lark_to_lezer_grammar,
|
||||
extract_completions_from_grammar,
|
||||
extract_completions_from_grammar, # Will be moved to utils.py
|
||||
)
|
||||
from myfasthtml.core.utils import make_safe_id
|
||||
|
||||
@@ -39,18 +39,6 @@ class DSLDefinition(ABC):
|
||||
"""
|
||||
pass
|
||||
|
||||
@cached_property
def lezer_grammar(self) -> str:
    """
    Lezer grammar derived from this definition's Lark grammar.

    Computed once from ``get_grammar()`` and memoized by ``cached_property``.

    Returns:
        The Lezer grammar as a string.
    """
    grammar_source = self.get_grammar()
    return lark_to_lezer_grammar(grammar_source)
|
||||
|
||||
@cached_property
|
||||
def completions(self) -> Dict[str, List[str]]:
|
||||
"""
|
||||
@@ -68,6 +56,26 @@ class DSLDefinition(ABC):
|
||||
"""
|
||||
return extract_completions_from_grammar(self.get_grammar())
|
||||
|
||||
@cached_property
def simple_mode_config(self) -> Dict[str, Any]:
    """
    CodeMirror 5 Simple Mode configuration for syntax highlighting.

    Computed once from the Lark grammar and memoized by ``cached_property``.

    Returns:
        Dictionary with Simple Mode rules of the form::

            {
                "start": [
                    {"regex": "...", "token": "keyword"},
                    {"regex": "...", "token": "string"},
                    ...
                ]
            }
    """
    # Imported locally: the converter module is only needed here.
    from myfasthtml.core.dsl.lark_to_simple_mode import lark_to_simple_mode

    return lark_to_simple_mode(self.get_grammar())
|
||||
|
||||
def get_editor_config(self) -> Dict[str, Any]:
|
||||
"""
|
||||
Return the configuration for the DslEditor JavaScript initialization.
|
||||
|
||||
@@ -1,256 +1,267 @@
|
||||
"""
|
||||
Utilities for converting Lark grammars to Lezer format and extracting completions.
|
||||
|
||||
This module provides functions to:
|
||||
1. Transform a Lark grammar to a Lezer grammar for CodeMirror
|
||||
2. Extract completion items (keywords, operators, etc.) from a Lark grammar
|
||||
"""
|
||||
|
||||
import re
|
||||
from typing import Dict, List, Set
|
||||
|
||||
|
||||
def lark_to_lezer_grammar(lark_grammar: str) -> str:
    """
    Translate a Lark grammar into a Lezer grammar string.

    Only common Lark constructs are handled; intricate grammars may still
    need manual touch-up after conversion.

    Args:
        lark_grammar: Source grammar in Lark syntax.

    Returns:
        The Lezer grammar text.
    """
    rule_def = re.compile(r"^([a-z_][a-z0-9_]*)\s*:\s*(.+)$")
    terminal_def = re.compile(r"^([A-Z_][A-Z0-9_]*)\s*:\s*(.+)$")

    converted_rules = []
    converted_tokens = []

    for raw_line in lark_grammar.strip().split("\n"):
        stripped = raw_line.strip()

        # Blank lines, comments and Lark directives (%...) carry no
        # grammar content to convert.
        if not stripped or stripped.startswith(("//", "#", "%")):
            continue

        # Rule definitions use lowercase names.
        matched = rule_def.match(stripped)
        if matched:
            rule_text = _convert_rule(*matched.groups())
            if rule_text:
                converted_rules.append(rule_text)
            continue

        # Terminal definitions use uppercase names.
        matched = terminal_def.match(stripped)
        if matched:
            token_text = _convert_terminal(*matched.groups())
            if token_text:
                converted_tokens.append(token_text)

    # Assemble the Lezer grammar: top rule, named rules, token block, skip set.
    output = ["@top Start { scope+ }", ""]
    output.extend(converted_rules)
    output.extend(["", "@tokens {"])
    output.extend(f"  {tok}" for tok in converted_tokens)
    output.extend([
        ' whitespace { $[ \\t]+ }',
        ' newline { $[\\n\\r] }',
        ' Comment { "#" ![$\\n]* }',
    ])
    output.extend(["}", "", "@skip { whitespace | Comment }"])

    return "\n".join(output)
|
||||
|
||||
|
||||
def _convert_rule(name: str, body: str) -> str:
    """Translate one Lark rule into a Lezer rule definition, or '' if skipped."""
    # Underscore-prefixed rules are internal/inlined in Lark; emit nothing.
    if name.startswith("_"):
        return ""

    converted_body = _convert_body(body)
    if not converted_body:
        return ""

    # Lezer rule names are conventionally PascalCase.
    return f"{_to_pascal_case(name)} {{ {converted_body} }}"
|
||||
|
||||
|
||||
def _convert_terminal(name: str, pattern: str) -> str:
|
||||
"""Convert a Lark terminal to Lezer token format."""
|
||||
pattern = pattern.strip()
|
||||
|
||||
# Handle regex patterns
|
||||
if pattern.startswith("/") and pattern.endswith("/"):
|
||||
regex = pattern[1:-1]
|
||||
# Convert to Lezer regex format
|
||||
return f'{name} {{ ${regex}$ }}'
|
||||
|
||||
# Handle string literals
|
||||
if pattern.startswith('"') or pattern.startswith("'"):
|
||||
return f'{name} {{ {pattern} }}'
|
||||
|
||||
# Handle alternatives (literal strings separated by |)
|
||||
if "|" in pattern:
|
||||
alternatives = [alt.strip() for alt in pattern.split("|")]
|
||||
if all(alt.startswith('"') or alt.startswith("'") for alt in alternatives):
|
||||
return f'{name} {{ {" | ".join(alternatives)} }}'
|
||||
|
||||
return ""
|
||||
|
||||
|
||||
def _convert_body(body: str) -> str:
    """Translate the right-hand side of a Lark rule to Lezer syntax."""
    # Lark alias annotations ("-> name") have no Lezer counterpart.
    body = re.sub(r"\s*->\s*\w+", "", body)

    # Convert each non-empty alternative, keeping only non-empty results.
    converted = [
        _convert_sequence(alternative.strip())
        for alternative in body.split("|")
        if alternative.strip()
    ]
    return " | ".join(part for part in converted if part)
|
||||
|
||||
|
||||
def _convert_sequence(seq: str) -> str:
    """Translate one alternative (a sequence of items) to Lezer syntax."""
    # Split into literals, regexes, groups, optional groups, names and
    # standalone quantifiers.
    pieces = re.findall(
        r'"[^"]*"|\'[^\']*\'|/[^/]+/|\([^)]+\)|\[[^\]]+\]|[a-zA-Z_][a-zA-Z0-9_]*|\?|\*|\+',
        seq
    )

    out = []
    for piece in pieces:
        if piece[:1] in ('"', "'"):
            # Quoted literal: emitted verbatim.
            out.append(piece)
        elif piece.startswith("("):
            # Parenthesised group: convert its contents recursively.
            out.append(f"({_convert_body(piece[1:-1])})")
        elif piece.startswith("["):
            # Lark's [...] is an optional group; Lezer spells that (...)?.
            out.append(f"({_convert_body(piece[1:-1])})?")
        elif piece in ("?", "*", "+"):
            # Quantifier: fold onto the preceding item, if any.
            if out:
                out[-1] += piece
        elif piece.isupper() or piece.startswith("_"):
            # Terminal reference: kept as-is.
            out.append(piece)
        elif piece.islower() or "_" in piece:
            # Rule reference: Lezer rule names are PascalCase.
            out.append(_to_pascal_case(piece))
        # Pieces matching none of the branches are dropped, as before.

    return " ".join(out)
|
||||
|
||||
|
||||
def _to_pascal_case(name: str) -> str:
|
||||
"""Convert snake_case to PascalCase."""
|
||||
return "".join(word.capitalize() for word in name.split("_"))
|
||||
|
||||
|
||||
def extract_completions_from_grammar(lark_grammar: str) -> Dict[str, List[str]]:
    """
    Extract completion items from a Lark grammar.

    Scans every double-quoted literal in the grammar and classifies it as a
    keyword, operator, function, type or literal based on a known DSL
    vocabulary, then supplements the result with function-call patterns and
    the ``format_type`` rule, if present.

    Args:
        lark_grammar: The Lark grammar string.

    Returns:
        Dictionary with sorted completion lists under the keys
        "keywords", "operators", "functions", "types" and "literals".
    """
    keywords: Set[str] = set()
    operators: Set[str] = set()
    functions: Set[str] = set()
    types: Set[str] = set()
    literals: Set[str] = set()

    # Every double-quoted string anywhere in the grammar is a candidate.
    # This already covers terminal definitions such as
    # BOOLEAN: "True" | "False", so no separate per-terminal scan is needed
    # (the previous one was redundant, and its repeated regex group only
    # captured the last alternative of each definition anyway).
    quoted_strings = re.findall(r'"([^"]+)"', lark_grammar)

    for s in quoted_strings:
        s_lower = s.lower()

        # Classify based on known DSL vocabulary.
        if s in ("==", "!=", "<=", "<", ">=", ">", "+", "-", "*", "/"):
            operators.add(s)
        elif s_lower in ("contains", "startswith", "endswith", "in", "between", "isempty", "isnotempty"):
            operators.add(s_lower)
        elif s_lower in ("if", "not", "and", "or"):
            keywords.add(s_lower)
        elif s_lower in ("true", "false"):
            # Boolean literals keep their original casing (e.g. "True").
            literals.add(s)
        elif s_lower in ("style", "format"):
            functions.add(s_lower)
        elif s_lower in ("column", "row", "cell", "value", "col"):
            keywords.add(s_lower)
        elif s_lower in ("number", "date", "boolean", "text", "enum"):
            types.add(s_lower)
        elif s_lower == "case":
            keywords.add(s_lower)

    # Function-like patterns: a quoted word immediately followed by "(".
    function_patterns = re.findall(r'"(\w+)"\s*"?\("', lark_grammar)
    for func in function_patterns:
        if func.lower() not in ("true", "false"):
            functions.add(func.lower())

    # Types declared in a dedicated format_type rule, if present.
    type_match = re.search(r'format_type\s*:\s*(.+?)(?:\n\n|\Z)', lark_grammar, re.DOTALL)
    if type_match:
        type_strings = re.findall(r'"(\w+)"', type_match.group(1))
        types.update(t.lower() for t in type_strings)

    return {
        "keywords": sorted(keywords),
        "operators": sorted(operators),
        "functions": sorted(functions),
        "types": sorted(types),
        "literals": sorted(literals),
    }
|
||||
# """
|
||||
# DEPRECATED: Utilities for converting Lark grammars to Lezer format.
|
||||
#
|
||||
# ⚠️ WARNING: This module is deprecated and will be removed in a future version.
|
||||
#
|
||||
# Original purpose:
|
||||
# - Transform a Lark grammar to a Lezer grammar for CodeMirror 6
|
||||
# - Extract completion items (keywords, operators, etc.) from a Lark grammar
|
||||
#
|
||||
# Deprecation reason:
|
||||
# - CodeMirror 6 requires a bundler (Webpack, Rollup, etc.)
|
||||
# - Incompatible with FastHTML's direct script inclusion approach
|
||||
# - Replaced by CodeMirror 5 Simple Mode (see lark_to_simple_mode.py)
|
||||
#
|
||||
# Migration path:
|
||||
# - Use lark_to_simple_mode.py for CodeMirror 5 syntax highlighting
|
||||
# - extract_completions_from_grammar() is still used and will be moved to utils.py
|
||||
# """
|
||||
#
|
||||
# import re
|
||||
# from typing import Dict, List, Set
|
||||
#
|
||||
#
|
||||
# def lark_to_lezer_grammar(lark_grammar: str) -> str:
|
||||
# """
|
||||
# Convert a Lark grammar to a Lezer grammar.
|
||||
#
|
||||
# This is a simplified converter that handles common Lark patterns.
|
||||
# Complex grammars may require manual adjustment.
|
||||
#
|
||||
# Args:
|
||||
# lark_grammar: The Lark grammar string.
|
||||
#
|
||||
# Returns:
|
||||
# The Lezer grammar string.
|
||||
# """
|
||||
# lines = lark_grammar.strip().split("\n")
|
||||
# lezer_rules = []
|
||||
# tokens = []
|
||||
#
|
||||
# for line in lines:
|
||||
# line = line.strip()
|
||||
#
|
||||
# # Skip empty lines and comments
|
||||
# if not line or line.startswith("//") or line.startswith("#"):
|
||||
# continue
|
||||
#
|
||||
# # Skip Lark-specific directives
|
||||
# if line.startswith("%"):
|
||||
# continue
|
||||
#
|
||||
# # Parse rule definitions (lowercase names only)
|
||||
# rule_match = re.match(r"^([a-z_][a-z0-9_]*)\s*:\s*(.+)$", line)
|
||||
# if rule_match:
|
||||
# name, body = rule_match.groups()
|
||||
# lezer_rule = _convert_rule(name, body)
|
||||
# if lezer_rule:
|
||||
# lezer_rules.append(lezer_rule)
|
||||
# continue
|
||||
#
|
||||
# # Parse terminal definitions (uppercase names)
|
||||
# terminal_match = re.match(r"^([A-Z_][A-Z0-9_]*)\s*:\s*(.+)$", line)
|
||||
# if terminal_match:
|
||||
# name, pattern = terminal_match.groups()
|
||||
# token = _convert_terminal(name, pattern)
|
||||
# if token:
|
||||
# tokens.append(token)
|
||||
#
|
||||
# # Build Lezer grammar
|
||||
# lezer_output = ["@top Start { scope+ }", ""]
|
||||
#
|
||||
# # Add rules
|
||||
# for rule in lezer_rules:
|
||||
# lezer_output.append(rule)
|
||||
#
|
||||
# lezer_output.append("")
|
||||
# lezer_output.append("@tokens {")
|
||||
#
|
||||
# # Add tokens
|
||||
# for token in tokens:
|
||||
# lezer_output.append(f" {token}")
|
||||
#
|
||||
# # Add common tokens
|
||||
# lezer_output.extend([
|
||||
# ' whitespace { $[ \\t]+ }',
|
||||
# ' newline { $[\\n\\r] }',
|
||||
# ' Comment { "#" ![$\\n]* }',
|
||||
# ])
|
||||
#
|
||||
# lezer_output.append("}")
|
||||
# lezer_output.append("")
|
||||
# lezer_output.append("@skip { whitespace | Comment }")
|
||||
#
|
||||
# return "\n".join(lezer_output)
|
||||
#
|
||||
#
|
||||
# def _convert_rule(name: str, body: str) -> str:
|
||||
# """Convert a single Lark rule to Lezer format."""
|
||||
# # Skip internal rules (starting with _)
|
||||
# if name.startswith("_"):
|
||||
# return ""
|
||||
#
|
||||
# # Convert rule name to PascalCase for Lezer
|
||||
# lezer_name = _to_pascal_case(name)
|
||||
#
|
||||
# # Convert body
|
||||
# lezer_body = _convert_body(body)
|
||||
#
|
||||
# if lezer_body:
|
||||
# return f"{lezer_name} {{ {lezer_body} }}"
|
||||
# return ""
|
||||
#
|
||||
#
|
||||
# def _convert_terminal(name: str, pattern: str) -> str:
|
||||
# """Convert a Lark terminal to Lezer token format."""
|
||||
# pattern = pattern.strip()
|
||||
#
|
||||
# # Handle regex patterns
|
||||
# if pattern.startswith("/") and pattern.endswith("/"):
|
||||
# regex = pattern[1:-1]
|
||||
# # Convert to Lezer regex format
|
||||
# return f'{name} {{ ${regex}$ }}'
|
||||
#
|
||||
# # Handle string literals
|
||||
# if pattern.startswith('"') or pattern.startswith("'"):
|
||||
# return f'{name} {{ {pattern} }}'
|
||||
#
|
||||
# # Handle alternatives (literal strings separated by |)
|
||||
# if "|" in pattern:
|
||||
# alternatives = [alt.strip() for alt in pattern.split("|")]
|
||||
# if all(alt.startswith('"') or alt.startswith("'") for alt in alternatives):
|
||||
# return f'{name} {{ {" | ".join(alternatives)} }}'
|
||||
#
|
||||
# return ""
|
||||
#
|
||||
#
|
||||
# def _convert_body(body: str) -> str:
|
||||
# """Convert the body of a Lark rule to Lezer format."""
|
||||
# # Remove inline transformations (-> name)
|
||||
# body = re.sub(r"\s*->\s*\w+", "", body)
|
||||
#
|
||||
# # Convert alternatives
|
||||
# parts = []
|
||||
# for alt in body.split("|"):
|
||||
# alt = alt.strip()
|
||||
# if alt:
|
||||
# converted = _convert_sequence(alt)
|
||||
# if converted:
|
||||
# parts.append(converted)
|
||||
#
|
||||
# return " | ".join(parts)
|
||||
#
|
||||
#
|
||||
# def _convert_sequence(seq: str) -> str:
|
||||
# """Convert a sequence of items in a rule."""
|
||||
# items = []
|
||||
#
|
||||
# # Tokenize the sequence
|
||||
# tokens = re.findall(
|
||||
# r'"[^"]*"|\'[^\']*\'|/[^/]+/|\([^)]+\)|\[[^\]]+\]|[a-zA-Z_][a-zA-Z0-9_]*|\?|\*|\+',
|
||||
# seq
|
||||
# )
|
||||
#
|
||||
# for token in tokens:
|
||||
# if token.startswith('"') or token.startswith("'"):
|
||||
# # String literal
|
||||
# items.append(token)
|
||||
# elif token.startswith("("):
|
||||
# # Group
|
||||
# inner = token[1:-1]
|
||||
# items.append(f"({_convert_body(inner)})")
|
||||
# elif token.startswith("["):
|
||||
# # Optional group in Lark
|
||||
# inner = token[1:-1]
|
||||
# items.append(f"({_convert_body(inner)})?")
|
||||
# elif token in ("?", "*", "+"):
|
||||
# # Quantifiers - attach to previous item
|
||||
# if items:
|
||||
# items[-1] = items[-1] + token
|
||||
# elif token.isupper() or token.startswith("_"):
|
||||
# # Terminal reference
|
||||
# items.append(token)
|
||||
# elif token.islower() or "_" in token:
|
||||
# # Rule reference - convert to PascalCase
|
||||
# items.append(_to_pascal_case(token))
|
||||
#
|
||||
# return " ".join(items)
|
||||
#
|
||||
#
|
||||
# def _to_pascal_case(name: str) -> str:
|
||||
# """Convert snake_case to PascalCase."""
|
||||
# return "".join(word.capitalize() for word in name.split("_"))
|
||||
#
|
||||
#
|
||||
# def extract_completions_from_grammar(lark_grammar: str) -> Dict[str, List[str]]:
|
||||
# """
|
||||
# Extract completion items from a Lark grammar.
|
||||
#
|
||||
# Parses the grammar to find:
|
||||
# - Keywords (reserved words like if, not, and)
|
||||
# - Operators (==, !=, contains, etc.)
|
||||
# - Functions (style, format, etc.)
|
||||
# - Types (number, date, boolean, etc.)
|
||||
# - Literals (True, False, etc.)
|
||||
#
|
||||
# Args:
|
||||
# lark_grammar: The Lark grammar string.
|
||||
#
|
||||
# Returns:
|
||||
# Dictionary with completion categories.
|
||||
# """
|
||||
# keywords: Set[str] = set()
|
||||
# operators: Set[str] = set()
|
||||
# functions: Set[str] = set()
|
||||
# types: Set[str] = set()
|
||||
# literals: Set[str] = set()
|
||||
#
|
||||
# # Find all quoted strings (potential keywords/operators)
|
||||
# quoted_strings = re.findall(r'"([^"]+)"', lark_grammar)
|
||||
#
|
||||
# # Also look for terminal definitions with string alternatives (e.g., BOOLEAN: "True" | "False")
|
||||
# terminal_literals = re.findall(r'[A-Z_]+:\s*"([^"]+)"(?:\s*\|\s*"([^"]+)")*', lark_grammar)
|
||||
# for match in terminal_literals:
|
||||
# for literal in match:
|
||||
# if literal:
|
||||
# quoted_strings.append(literal)
|
||||
#
|
||||
# for s in quoted_strings:
|
||||
# s_lower = s.lower()
|
||||
#
|
||||
# # Classify based on pattern
|
||||
# if s in ("==", "!=", "<=", "<", ">=", ">", "+", "-", "*", "/"):
|
||||
# operators.add(s)
|
||||
# elif s_lower in ("contains", "startswith", "endswith", "in", "between", "isempty", "isnotempty"):
|
||||
# operators.add(s_lower)
|
||||
# elif s_lower in ("if", "not", "and", "or"):
|
||||
# keywords.add(s_lower)
|
||||
# elif s_lower in ("true", "false"):
|
||||
# literals.add(s)
|
||||
# elif s_lower in ("style", "format"):
|
||||
# functions.add(s_lower)
|
||||
# elif s_lower in ("column", "row", "cell", "value", "col"):
|
||||
# keywords.add(s_lower)
|
||||
# elif s_lower in ("number", "date", "boolean", "text", "enum"):
|
||||
# types.add(s_lower)
|
||||
# elif s_lower == "case":
|
||||
# keywords.add(s_lower)
|
||||
#
|
||||
# # Find function-like patterns: word "("
|
||||
# function_patterns = re.findall(r'"(\w+)"\s*"?\("', lark_grammar)
|
||||
# for func in function_patterns:
|
||||
# if func.lower() not in ("true", "false"):
|
||||
# functions.add(func.lower())
|
||||
#
|
||||
# # Find type patterns from format_type rule
|
||||
# type_match = re.search(r'format_type\s*:\s*(.+?)(?:\n\n|\Z)', lark_grammar, re.DOTALL)
|
||||
# if type_match:
|
||||
# type_strings = re.findall(r'"(\w+)"', type_match.group(1))
|
||||
# types.update(t.lower() for t in type_strings)
|
||||
#
|
||||
# return {
|
||||
# "keywords": sorted(keywords),
|
||||
# "operators": sorted(operators),
|
||||
# "functions": sorted(functions),
|
||||
# "types": sorted(types),
|
||||
# "literals": sorted(literals),
|
||||
# }
|
||||
|
||||
240
src/myfasthtml/core/dsl/lark_to_simple_mode.py
Normal file
240
src/myfasthtml/core/dsl/lark_to_simple_mode.py
Normal file
@@ -0,0 +1,240 @@
|
||||
"""
|
||||
Utilities for converting Lark grammars to CodeMirror 5 Simple Mode format.
|
||||
|
||||
This module provides functions to extract regex patterns from Lark grammar
|
||||
terminals and generate a CodeMirror Simple Mode configuration for syntax highlighting.
|
||||
"""
|
||||
|
||||
import re
|
||||
from typing import Dict, List, Any
|
||||
|
||||
|
||||
def lark_to_simple_mode(lark_grammar: str) -> Dict[str, Any]:
    """
    Build a CodeMirror 5 Simple Mode configuration from a Lark grammar.

    Keywords are harvested from quoted literals in rules, and a fixed set of
    well-known terminals is mapped onto CodeMirror token classes.

    Args:
        lark_grammar: The Lark grammar string.

    Returns:
        A dictionary of the form ``{"start": [{"regex": ..., "token": ...}]}``
        suitable for CodeMirror's defineSimpleMode.
    """
    rules: List[Dict[str, Any]] = []

    # Comments come first so they win over every other pattern.
    rules.append({"regex": r"#.*", "token": "comment"})

    # One alternation rule covering every keyword literal found in the grammar.
    keywords = _extract_keywords(lark_grammar)
    if keywords:
        alternation = "|".join(re.escape(word) for word in keywords)
        rules.append({
            "regex": r"\b(?:" + alternation + r")\b",
            "token": "keyword",
        })

    # Well-known terminal names and the token class they highlight as.
    token_class_by_terminal = {
        "QUOTED_STRING": "string",
        "SIGNED_NUMBER": "number",
        "INTEGER": "number",
        "BOOLEAN": "atom",
        "CELL_ID": "variable-3",
        "NAME": "variable",
    }

    for terminal_name, lark_pattern in _extract_terminals(lark_grammar).items():
        token_class = token_class_by_terminal.get(terminal_name)
        if token_class is None:
            continue
        js_regex = _lark_regex_to_js(lark_pattern)
        if js_regex:
            rules.append({"regex": js_regex, "token": token_class})

    return {"start": rules}
|
||||
|
||||
|
||||
def _extract_keywords(grammar: str) -> List[str]:
|
||||
"""
|
||||
Extract keyword literals from grammar rules.
|
||||
|
||||
Looks for quoted string literals in rules (e.g., "column", "if", "style").
|
||||
|
||||
Args:
|
||||
grammar: The Lark grammar string.
|
||||
|
||||
Returns:
|
||||
List of keyword strings.
|
||||
"""
|
||||
keywords = set()
|
||||
|
||||
# Match quoted literals in rules (not in terminal definitions)
|
||||
# Pattern: "keyword" but not in lines like: TERMINAL: "pattern"
|
||||
lines = grammar.split("\n")
|
||||
for line in lines:
|
||||
# Skip terminal definitions (uppercase name followed by colon)
|
||||
if re.match(r'\s*[A-Z_]+\s*:', line):
|
||||
continue
|
||||
|
||||
# Skip comments
|
||||
if line.strip().startswith("//") or line.strip().startswith("#"):
|
||||
continue
|
||||
|
||||
# Find quoted strings in rules
|
||||
matches = re.findall(r'"([a-z_]+)"', line)
|
||||
for match in matches:
|
||||
# Filter out regex-like patterns, keep only identifiers
|
||||
if re.match(r'^[a-z_]+$', match):
|
||||
keywords.add(match)
|
||||
|
||||
return sorted(keywords)
|
||||
|
||||
|
||||
def _extract_terminals(grammar: str) -> Dict[str, str]:
|
||||
"""
|
||||
Extract terminal definitions from Lark grammar.
|
||||
|
||||
Args:
|
||||
grammar: The Lark grammar string.
|
||||
|
||||
Returns:
|
||||
Dictionary mapping terminal names to their regex patterns.
|
||||
"""
|
||||
terminals = {}
|
||||
lines = grammar.split("\n")
|
||||
|
||||
for line in lines:
|
||||
# Match terminal definitions: NAME: /regex/ or NAME: "literal"
|
||||
match = re.match(r'\s*([A-Z_]+)\s*:\s*/([^/]+)/', line)
|
||||
if match:
|
||||
name, pattern = match.groups()
|
||||
terminals[name] = pattern
|
||||
continue
|
||||
|
||||
# Match literal alternatives: BOOLEAN: "True" | "False"
|
||||
match = re.match(r'\s*([A-Z_]+)\s*:\s*(.+)', line)
|
||||
if match:
|
||||
name, alternatives = match.groups()
|
||||
# Extract quoted literals
|
||||
literals = re.findall(r'"([^"]+)"', alternatives)
|
||||
if literals:
|
||||
# Build regex alternation
|
||||
pattern = "|".join(re.escape(lit) for lit in literals)
|
||||
terminals[name] = pattern
|
||||
|
||||
return terminals
|
||||
|
||||
|
||||
def _lark_regex_to_js(lark_pattern: str) -> str:
|
||||
"""
|
||||
Convert a Lark regex pattern to JavaScript regex.
|
||||
|
||||
This is a simplified converter that handles common patterns.
|
||||
Complex patterns may need manual adjustment.
|
||||
|
||||
Args:
|
||||
lark_pattern: Lark regex pattern.
|
||||
|
||||
Returns:
|
||||
JavaScript regex pattern string, or empty string if conversion fails.
|
||||
"""
|
||||
# Remove Lark-specific flags
|
||||
pattern = lark_pattern.strip()
|
||||
|
||||
# Handle common patterns
|
||||
conversions = [
|
||||
# Escape sequences
|
||||
(r'\[', r'['),
|
||||
(r'\]', r']'),
|
||||
|
||||
# Character classes are mostly compatible
|
||||
# Numbers: [0-9]+ or \d+
|
||||
# Letters: [a-zA-Z]
|
||||
# Whitespace: [ \t]
|
||||
]
|
||||
|
||||
result = pattern
|
||||
for lark_pat, js_pat in conversions:
|
||||
result = result.replace(lark_pat, js_pat)
|
||||
|
||||
# Wrap in word boundaries for identifier-like patterns
|
||||
# Example: [a-zA-Z_][a-zA-Z0-9_]* → \b[a-zA-Z_][a-zA-Z0-9_]*\b
|
||||
if re.match(r'\[[a-zA-Z_]+\]', result):
|
||||
result = r'\b' + result + r'\b'
|
||||
|
||||
return result
|
||||
|
||||
|
||||
def generate_formatting_dsl_mode() -> Dict[str, Any]:
    """
    Return the hand-tuned Simple Mode configuration for the Formatting DSL.

    Unlike lark_to_simple_mode(), the rules here are curated by hand for
    better highlighting quality. Rule order matters: earlier rules win.

    Returns:
        Simple Mode configuration dictionary.
    """
    def rule(regex: str, token: str) -> Dict[str, str]:
        # Tiny helper keeping the rule table compact.
        return {"regex": regex, "token": token}

    return {
        "start": [
            # Comments take precedence over everything else.
            rule(r"#.*", "comment"),
            # Scope keywords.
            rule(r"\b(?:column|row|cell)\b", "keyword"),
            # Condition keywords.
            rule(r"\b(?:if|not|and|or|in|between|case)\b", "keyword"),
            # Built-in functions.
            rule(r"\b(?:style|format)\b", "builtin"),
            # Format types.
            rule(r"\b(?:number|date|boolean|text|enum)\b", "builtin"),
            # Word-like string operators.
            rule(r"\b(?:contains|startswith|endswith|isempty|isnotempty)\b", "operator"),
            # Symbolic comparison operators.
            rule(r"==|!=|<=|>=|<|>", "operator"),
            # Special references.
            rule(r"\b(?:value|col|row|cell)\b", "variable-2"),
            # Boolean literals.
            rule(r"\b(?:True|False|true|false)\b", "atom"),
            # Numbers: optional sign, integer or decimal.
            rule(r"[+-]?\b\d+(?:\.\d+)?\b", "number"),
            # Double- and single-quoted strings with escapes.
            rule(r'"(?:[^\\"]|\\.)*"', "string"),
            rule(r"'(?:[^\\']|\\.)*'", "string"),
            # Table-cell identifiers.
            rule(r"\btcell_[a-zA-Z0-9_-]+\b", "variable-3"),
            # Bare identifiers -- lowest priority.
            rule(r"\b[a-zA-Z_][a-zA-Z0-9_]*\b", "variable"),
        ]
    }
|
||||
Reference in New Issue
Block a user