Added syntax colorization. Removed all references to the deprecated lark_to_lezer module.

This commit is contained in:
2026-02-07 11:08:34 +01:00
parent 1c1ced2a9f
commit ab4f251f0c
7 changed files with 106 additions and 461 deletions

View File

@@ -9,10 +9,7 @@ from abc import ABC, abstractmethod
from functools import cached_property
from typing import List, Dict, Any
# TODO: Replace with lark_to_simple_mode when implemented
from myfasthtml.core.dsl.lark_to_lezer import (
extract_completions_from_grammar, # Will be moved to utils.py
)
from myfasthtml.core.dsl.lark_to_simple_mode import extract_completions_from_grammar
from myfasthtml.core.utils import make_safe_id
@@ -82,13 +79,13 @@ class DSLDefinition(ABC):
Returns:
Dictionary with:
- 'lezerGrammar': The Lezer grammar string
- 'simpleModeConfig': The CodeMirror Simple Mode configuration
- 'completions': The completion items
- 'name': The DSL name
"""
return {
"name": self.name,
"lezerGrammar": self.lezer_grammar,
"simpleModeConfig": self.simple_mode_config,
"completions": self.completions,
}

View File

@@ -1,267 +0,0 @@
# """
# DEPRECATED: Utilities for converting Lark grammars to Lezer format.
#
# ⚠️ WARNING: This module is deprecated and will be removed in a future version.
#
# Original purpose:
# - Transform a Lark grammar to a Lezer grammar for CodeMirror 6
# - Extract completion items (keywords, operators, etc.) from a Lark grammar
#
# Deprecation reason:
# - CodeMirror 6 requires a bundler (Webpack, Rollup, etc.)
# - Incompatible with FastHTML's direct script inclusion approach
# - Replaced by CodeMirror 5 Simple Mode (see lark_to_simple_mode.py)
#
# Migration path:
# - Use lark_to_simple_mode.py for CodeMirror 5 syntax highlighting
# - extract_completions_from_grammar() is still used and will be moved to utils.py
# """
#
# import re
# from typing import Dict, List, Set
#
#
# def lark_to_lezer_grammar(lark_grammar: str) -> str:
# """
# Convert a Lark grammar to a Lezer grammar.
#
# This is a simplified converter that handles common Lark patterns.
# Complex grammars may require manual adjustment.
#
# Args:
# lark_grammar: The Lark grammar string.
#
# Returns:
# The Lezer grammar string.
# """
# lines = lark_grammar.strip().split("\n")
# lezer_rules = []
# tokens = []
#
# for line in lines:
# line = line.strip()
#
# # Skip empty lines and comments
# if not line or line.startswith("//") or line.startswith("#"):
# continue
#
# # Skip Lark-specific directives
# if line.startswith("%"):
# continue
#
# # Parse rule definitions (lowercase names only)
# rule_match = re.match(r"^([a-z_][a-z0-9_]*)\s*:\s*(.+)$", line)
# if rule_match:
# name, body = rule_match.groups()
# lezer_rule = _convert_rule(name, body)
# if lezer_rule:
# lezer_rules.append(lezer_rule)
# continue
#
# # Parse terminal definitions (uppercase names)
# terminal_match = re.match(r"^([A-Z_][A-Z0-9_]*)\s*:\s*(.+)$", line)
# if terminal_match:
# name, pattern = terminal_match.groups()
# token = _convert_terminal(name, pattern)
# if token:
# tokens.append(token)
#
# # Build Lezer grammar
# lezer_output = ["@top Start { scope+ }", ""]
#
# # Add rules
# for rule in lezer_rules:
# lezer_output.append(rule)
#
# lezer_output.append("")
# lezer_output.append("@tokens {")
#
# # Add tokens
# for token in tokens:
# lezer_output.append(f" {token}")
#
# # Add common tokens
# lezer_output.extend([
# ' whitespace { $[ \\t]+ }',
# ' newline { $[\\n\\r] }',
# ' Comment { "#" ![$\\n]* }',
# ])
#
# lezer_output.append("}")
# lezer_output.append("")
# lezer_output.append("@skip { whitespace | Comment }")
#
# return "\n".join(lezer_output)
#
#
# def _convert_rule(name: str, body: str) -> str:
# """Convert a single Lark rule to Lezer format."""
# # Skip internal rules (starting with _)
# if name.startswith("_"):
# return ""
#
# # Convert rule name to PascalCase for Lezer
# lezer_name = _to_pascal_case(name)
#
# # Convert body
# lezer_body = _convert_body(body)
#
# if lezer_body:
# return f"{lezer_name} {{ {lezer_body} }}"
# return ""
#
#
# def _convert_terminal(name: str, pattern: str) -> str:
# """Convert a Lark terminal to Lezer token format."""
# pattern = pattern.strip()
#
# # Handle regex patterns
# if pattern.startswith("/") and pattern.endswith("/"):
# regex = pattern[1:-1]
# # Convert to Lezer regex format
# return f'{name} {{ ${regex}$ }}'
#
# # Handle string literals
# if pattern.startswith('"') or pattern.startswith("'"):
# return f'{name} {{ {pattern} }}'
#
# # Handle alternatives (literal strings separated by |)
# if "|" in pattern:
# alternatives = [alt.strip() for alt in pattern.split("|")]
# if all(alt.startswith('"') or alt.startswith("'") for alt in alternatives):
# return f'{name} {{ {" | ".join(alternatives)} }}'
#
# return ""
#
#
# def _convert_body(body: str) -> str:
# """Convert the body of a Lark rule to Lezer format."""
# # Remove inline transformations (-> name)
# body = re.sub(r"\s*->\s*\w+", "", body)
#
# # Convert alternatives
# parts = []
# for alt in body.split("|"):
# alt = alt.strip()
# if alt:
# converted = _convert_sequence(alt)
# if converted:
# parts.append(converted)
#
# return " | ".join(parts)
#
#
# def _convert_sequence(seq: str) -> str:
# """Convert a sequence of items in a rule."""
# items = []
#
# # Tokenize the sequence
# tokens = re.findall(
# r'"[^"]*"|\'[^\']*\'|/[^/]+/|\([^)]+\)|\[[^\]]+\]|[a-zA-Z_][a-zA-Z0-9_]*|\?|\*|\+',
# seq
# )
#
# for token in tokens:
# if token.startswith('"') or token.startswith("'"):
# # String literal
# items.append(token)
# elif token.startswith("("):
# # Group
# inner = token[1:-1]
# items.append(f"({_convert_body(inner)})")
# elif token.startswith("["):
# # Optional group in Lark
# inner = token[1:-1]
# items.append(f"({_convert_body(inner)})?")
# elif token in ("?", "*", "+"):
# # Quantifiers - attach to previous item
# if items:
# items[-1] = items[-1] + token
# elif token.isupper() or token.startswith("_"):
# # Terminal reference
# items.append(token)
# elif token.islower() or "_" in token:
# # Rule reference - convert to PascalCase
# items.append(_to_pascal_case(token))
#
# return " ".join(items)
#
#
# def _to_pascal_case(name: str) -> str:
# """Convert snake_case to PascalCase."""
# return "".join(word.capitalize() for word in name.split("_"))
#
#
# def extract_completions_from_grammar(lark_grammar: str) -> Dict[str, List[str]]:
# """
# Extract completion items from a Lark grammar.
#
# Parses the grammar to find:
# - Keywords (reserved words like if, not, and)
# - Operators (==, !=, contains, etc.)
# - Functions (style, format, etc.)
# - Types (number, date, boolean, etc.)
# - Literals (True, False, etc.)
#
# Args:
# lark_grammar: The Lark grammar string.
#
# Returns:
# Dictionary with completion categories.
# """
# keywords: Set[str] = set()
# operators: Set[str] = set()
# functions: Set[str] = set()
# types: Set[str] = set()
# literals: Set[str] = set()
#
# # Find all quoted strings (potential keywords/operators)
# quoted_strings = re.findall(r'"([^"]+)"', lark_grammar)
#
# # Also look for terminal definitions with string alternatives (e.g., BOOLEAN: "True" | "False")
# terminal_literals = re.findall(r'[A-Z_]+:\s*"([^"]+)"(?:\s*\|\s*"([^"]+)")*', lark_grammar)
# for match in terminal_literals:
# for literal in match:
# if literal:
# quoted_strings.append(literal)
#
# for s in quoted_strings:
# s_lower = s.lower()
#
# # Classify based on pattern
# if s in ("==", "!=", "<=", "<", ">=", ">", "+", "-", "*", "/"):
# operators.add(s)
# elif s_lower in ("contains", "startswith", "endswith", "in", "between", "isempty", "isnotempty"):
# operators.add(s_lower)
# elif s_lower in ("if", "not", "and", "or"):
# keywords.add(s_lower)
# elif s_lower in ("true", "false"):
# literals.add(s)
# elif s_lower in ("style", "format"):
# functions.add(s_lower)
# elif s_lower in ("column", "row", "cell", "value", "col"):
# keywords.add(s_lower)
# elif s_lower in ("number", "date", "boolean", "text", "enum"):
# types.add(s_lower)
# elif s_lower == "case":
# keywords.add(s_lower)
#
# # Find function-like patterns: word "("
# function_patterns = re.findall(r'"(\w+)"\s*"?\("', lark_grammar)
# for func in function_patterns:
# if func.lower() not in ("true", "false"):
# functions.add(func.lower())
#
# # Find type patterns from format_type rule
# type_match = re.search(r'format_type\s*:\s*(.+?)(?:\n\n|\Z)', lark_grammar, re.DOTALL)
# if type_match:
# type_strings = re.findall(r'"(\w+)"', type_match.group(1))
# types.update(t.lower() for t in type_strings)
#
# return {
# "keywords": sorted(keywords),
# "operators": sorted(operators),
# "functions": sorted(functions),
# "types": sorted(types),
# "literals": sorted(literals),
# }

View File

@@ -1,12 +1,14 @@
"""
Utilities for converting Lark grammars to CodeMirror 5 Simple Mode format.
This module provides functions to extract regex patterns from Lark grammar
terminals and generate a CodeMirror Simple Mode configuration for syntax highlighting.
This module provides functions to:
1. Extract regex patterns from Lark grammar terminals
2. Generate CodeMirror Simple Mode configuration for syntax highlighting
3. Extract completion items from Lark grammar (keywords, operators, etc.)
"""
import re
from typing import Dict, List, Any
from typing import Dict, List, Any, Set
def lark_to_simple_mode(lark_grammar: str) -> Dict[str, Any]:
@@ -238,3 +240,85 @@ def generate_formatting_dsl_mode() -> Dict[str, Any]:
{"regex": r"\b[a-zA-Z_][a-zA-Z0-9_]*\b", "token": "variable"},
]
}
def extract_completions_from_grammar(lark_grammar: str) -> Dict[str, List[str]]:
    """
    Extract completion items from a Lark grammar.

    Scans the grammar text for quoted string literals and classifies them
    into completion categories:
    - Keywords (reserved words like if, not, and)
    - Operators (==, !=, contains, etc.)
    - Functions (style, format, etc.)
    - Types (number, date, boolean, etc.)
    - Literals (True, False, etc.)

    Args:
        lark_grammar: The Lark grammar string.

    Returns:
        Dictionary with completion categories:
        {
            "keywords": [...],
            "operators": [...],
            "functions": [...],
            "types": [...],
            "literals": [...]
        }
        Each list is sorted alphabetically.
    """
    # Symbolic operators are matched case-sensitively.
    symbol_operators = ("==", "!=", "<=", "<", ">=", ">", "+", "-", "*", "/")
    # Lower-cased word -> target category; the word sets are disjoint, so a
    # single lookup table reproduces the original classification chain.
    word_categories: Dict[str, str] = {}
    for word in ("contains", "startswith", "endswith", "in", "between", "isempty", "isnotempty"):
        word_categories[word] = "operators"
    for word in ("if", "not", "and", "or", "column", "row", "cell", "value", "col", "case"):
        word_categories[word] = "keywords"
    for word in ("style", "format"):
        word_categories[word] = "functions"
    for word in ("number", "date", "boolean", "text", "enum"):
        word_categories[word] = "types"

    buckets: Dict[str, Set[str]] = {
        "keywords": set(),
        "operators": set(),
        "functions": set(),
        "types": set(),
        "literals": set(),
    }

    # Every quoted string is a candidate keyword/operator/literal.
    candidates = re.findall(r'"([^"]+)"', lark_grammar)

    # Also scan terminal definitions with string alternatives
    # (e.g. BOOLEAN: "True" | "False").
    for groups in re.findall(r'[A-Z_]+:\s*"([^"]+)"(?:\s*\|\s*"([^"]+)")*', lark_grammar):
        candidates.extend(literal for literal in groups if literal)

    for candidate in candidates:
        lowered = candidate.lower()
        if candidate in symbol_operators:
            buckets["operators"].add(candidate)
        elif lowered in ("true", "false"):
            # Boolean literals keep their original casing.
            buckets["literals"].add(candidate)
        elif lowered in word_categories:
            buckets[word_categories[lowered]].add(lowered)

    # Function-like patterns: a quoted word immediately followed by "(".
    for func_name in re.findall(r'"(\w+)"\s*"?\("', lark_grammar):
        if func_name.lower() not in ("true", "false"):
            buckets["functions"].add(func_name.lower())

    # Every alternative of the format_type rule is treated as a type name.
    format_rule = re.search(r'format_type\s*:\s*(.+?)(?:\n\n|\Z)', lark_grammar, re.DOTALL)
    if format_rule:
        buckets["types"].update(
            name.lower() for name in re.findall(r'"(\w+)"', format_rule.group(1))
        )

    return {category: sorted(items) for category, items in buckets.items()}

View File

@@ -283,9 +283,11 @@ def _get_column_value_suggestions(
"""Get column value suggestions based on the current scope."""
if not scope.column_name:
return []
try:
values = provider.list_column_values(scope.column_name)
# Use table_name from scope, or empty string as fallback
table_name = scope.table_name or ""
values = provider.list_column_values(table_name, scope.column_name)
suggestions = []
for value in values:
if value is None:

View File

@@ -1,172 +0,0 @@
"""Tests for lark_to_lezer module."""
import pytest
from myfasthtml.core.dsl.lark_to_lezer import (
extract_completions_from_grammar,
lark_to_lezer_grammar,
)
# Sample grammars for testing
# Minimal grammar exercising symbolic/word operators plus a quoted-string
# terminal (regex form) and a boolean terminal with string alternatives.
SIMPLE_GRAMMAR = r'''
start: rule+
rule: "if" condition
condition: "value" operator literal
operator: "==" -> op_eq
| "!=" -> op_ne
| "contains" -> op_contains
literal: QUOTED_STRING -> string_literal
| BOOLEAN -> boolean_literal
QUOTED_STRING: /"[^"]*"/
BOOLEAN: "True" | "False"
'''
# Grammar covering scope keywords (column/row/cell), boolean logic
# (and/or/not) and a function-style construct: "style" "(" ... ")".
GRAMMAR_WITH_KEYWORDS = r'''
start: scope+
scope: "column" NAME ":" rule
| "row" INTEGER ":" rule
| "cell" cell_ref ":" rule
rule: style_expr condition?
condition: "if" "not"? comparison
comparison: operand "and" operand
| operand "or" operand
style_expr: "style" "(" args ")"
operand: "value" | literal
'''
# Grammar containing only a format_type rule; used to verify type extraction.
GRAMMAR_WITH_TYPES = r'''
format_type: "number" -> fmt_number
| "date" -> fmt_date
| "boolean" -> fmt_boolean
| "text" -> fmt_text
| "enum" -> fmt_enum
'''
class TestExtractCompletions:
    """Tests for extract_completions_from_grammar function."""

    def test_i_can_extract_keywords_from_grammar(self):
        """Test that keywords like if, not, and are extracted."""
        result = extract_completions_from_grammar(GRAMMAR_WITH_KEYWORDS)
        for word in ("if", "not", "column", "row", "cell", "value"):
            assert word in result["keywords"]

    @pytest.mark.parametrize("operator", ["==", "!=", "contains"])
    def test_i_can_extract_operators_from_grammar(self, operator):
        """Test that operators are extracted from grammar."""
        result = extract_completions_from_grammar(SIMPLE_GRAMMAR)
        assert operator in result["operators"]

    def test_i_can_extract_functions_from_grammar(self):
        """Test that function-like constructs are extracted."""
        result = extract_completions_from_grammar(GRAMMAR_WITH_KEYWORDS)
        assert "style" in result["functions"]

    @pytest.mark.parametrize(
        "type_name",
        ["number", "date", "boolean", "text", "enum"],
    )
    def test_i_can_extract_types_from_grammar(self, type_name):
        """Test that type names are extracted from format_type rule."""
        result = extract_completions_from_grammar(GRAMMAR_WITH_TYPES)
        assert type_name in result["types"]

    @pytest.mark.parametrize("literal", ["True", "False"])
    def test_i_can_extract_literals_from_grammar(self, literal):
        """Test that literal values like True/False are extracted."""
        result = extract_completions_from_grammar(SIMPLE_GRAMMAR)
        assert literal in result["literals"]

    def test_i_can_extract_completions_returns_all_categories(self):
        """Test that all completion categories are present in result."""
        result = extract_completions_from_grammar(SIMPLE_GRAMMAR)
        for category in ("keywords", "operators", "functions", "types", "literals"):
            assert category in result

    def test_i_can_extract_completions_returns_sorted_lists(self):
        """Test that completion lists are sorted alphabetically."""
        result = extract_completions_from_grammar(SIMPLE_GRAMMAR)
        for items in result.values():
            assert items == sorted(items)
class TestLarkToLezerConversion:
    """Tests for lark_to_lezer_grammar function."""

    def test_i_can_convert_simple_grammar_to_lezer(self):
        """Test that a simple Lark grammar is converted to Lezer format."""
        output = lark_to_lezer_grammar(SIMPLE_GRAMMAR)
        # The generated grammar must contain the top rule, the token
        # block, and the skip directive.
        for marker in ("@top Start", "@tokens {", "@skip {"):
            assert marker in output

    def test_i_can_convert_rule_names_to_pascal_case(self):
        """Test that snake_case rule names become PascalCase."""
        source = r'''
my_rule: other_rule
other_rule: "test"
'''
        output = lark_to_lezer_grammar(source)
        assert "MyRule" in output
        assert "OtherRule" in output

    def test_i_cannot_include_internal_rules_in_lezer(self):
        """Test that rules starting with _ are not included."""
        source = r'''
start: rule _NL
rule: "test"
_NL: /\n/
'''
        output = lark_to_lezer_grammar(source)
        # Internal rules should not appear as Lezer rules
        assert "Nl {" not in output

    def test_i_can_convert_terminal_regex_to_lezer(self):
        """Test that terminal regex patterns are converted."""
        source = r'''
NAME: /[a-zA-Z_][a-zA-Z0-9_]*/
'''
        output = lark_to_lezer_grammar(source)
        assert "NAME" in output

    @pytest.mark.parametrize(
        "terminal,pattern",
        [
            ('BOOLEAN: "True" | "False"', "BOOLEAN"),
            ('KEYWORD: "if"', "KEYWORD"),
        ],
    )
    def test_i_can_convert_terminal_strings_to_lezer(self, terminal, pattern):
        """Test that terminal string literals are converted."""
        source = f"start: test\n{terminal}"
        output = lark_to_lezer_grammar(source)
        assert pattern in output

View File

@@ -34,13 +34,13 @@ class MockProvider:
Provides predefined data for columns, values, and presets.
"""
def get_tables(self) -> list[str]:
def list_tables(self) -> list[str]:
return ["app.orders"]
def get_columns(self, table: str) -> list[str]:
def list_columns(self, table: str) -> list[str]:
return ["id", "amount", "status"]
def get_column_values(self, column: str) -> list[Any]:
def list_column_values(self, table: str, column: str) -> list[Any]:
if column == "status":
return ["draft", "pending", "approved"]
if column == "amount":
@@ -50,10 +50,10 @@ class MockProvider:
def get_row_count(self, table: str) -> int:
return 150
def get_style_presets(self) -> list[str]:
def list_style_presets(self) -> list[str]:
return ["custom_highlight"]
def get_format_presets(self) -> list[str]:
def list_format_presets(self) -> list[str]:
return ["CHF"]

View File

@@ -84,14 +84,14 @@ class TestFormattingDSL:
assert completions1 is completions2
def test_i_can_get_lezer_grammar_is_cached(self):
"""Test that lezer_grammar property is cached (same object returned)."""
def test_i_can_get_simple_mode_config_is_cached(self):
"""Test that simple_mode_config property is cached (same object returned)."""
dsl = FormattingDSL()
lezer1 = dsl.lezer_grammar
lezer2 = dsl.lezer_grammar
config1 = dsl.simple_mode_config
config2 = dsl.simple_mode_config
assert lezer1 is lezer2
assert config1 is config2
def test_i_can_get_editor_config(self):
"""Test that get_editor_config() returns expected structure."""
@@ -100,6 +100,7 @@ class TestFormattingDSL:
config = dsl.get_editor_config()
assert "name" in config
assert "lezerGrammar" in config
assert "simpleModeConfig" in config
assert "completions" in config
assert config["name"] == "Formatting DSL"
assert "start" in config["simpleModeConfig"] # Simple Mode structure