Add syntax colorization. Remove all references to the deprecated lark_to_lezer module.

2026-02-07 11:08:34 +01:00
parent 1c1ced2a9f
commit ab4f251f0c
7 changed files with 106 additions and 461 deletions
@@ -9,10 +9,7 @@ from abc import ABC, abstractmethod
 from functools import cached_property
 from typing import List, Dict, Any
-# TODO: Replace with lark_to_simple_mode when implemented
+from myfasthtml.core.dsl.lark_to_simple_mode import extract_completions_from_grammar
-from myfasthtml.core.dsl.lark_to_lezer import (
-  extract_completions_from_grammar,  # Will be moved to utils.py
-)
 from myfasthtml.core.utils import make_safe_id
@@ -82,13 +79,13 @@ class DSLDefinition(ABC):
    Returns:
        Dictionary with:
-        - 'lezerGrammar': The Lezer grammar string
+        - 'simpleModeConfig': The CodeMirror Simple Mode configuration
        - 'completions': The completion items
        - 'name': The DSL name
    """
    return {
        "name": self.name,
-        "lezerGrammar": self.lezer_grammar,
+        "simpleModeConfig": self.simple_mode_config,
        "completions": self.completions,
    }
@@ -1,267 +0,0 @@
 # """
 # DEPRECATED: Utilities for converting Lark grammars to Lezer format.
 #
 # ⚠️ WARNING: This module is deprecated and will be removed in a future version.
 #
 # Original purpose:
 # - Transform a Lark grammar to a Lezer grammar for CodeMirror 6
 # - Extract completion items (keywords, operators, etc.) from a Lark grammar
 #
 # Deprecation reason:
 # - CodeMirror 6 requires a bundler (Webpack, Rollup, etc.)
 # - Incompatible with FastHTML's direct script inclusion approach
 # - Replaced by CodeMirror 5 Simple Mode (see lark_to_simple_mode.py)
 #
 # Migration path:
 # - Use lark_to_simple_mode.py for CodeMirror 5 syntax highlighting
 # - extract_completions_from_grammar() is still used and will be moved to utils.py
 # """
 #
 # import re
 # from typing import Dict, List, Set
 #
 #
 # def lark_to_lezer_grammar(lark_grammar: str) -> str:
 #   """
 #   Convert a Lark grammar to a Lezer grammar.
 #
 #   This is a simplified converter that handles common Lark patterns.
 #   Complex grammars may require manual adjustment.
 #
 #   Args:
 #       lark_grammar: The Lark grammar string.
 #
 #   Returns:
 #       The Lezer grammar string.
 #   """
 #   lines = lark_grammar.strip().split("\n")
 #   lezer_rules = []
 #   tokens = []
 #
 #   for line in lines:
 #     line = line.strip()
 #
 #     # Skip empty lines and comments
 #     if not line or line.startswith("//") or line.startswith("#"):
 #       continue
 #
 #     # Skip Lark-specific directives
 #     if line.startswith("%"):
 #       continue
 #
 #     # Parse rule definitions (lowercase names only)
 #     rule_match = re.match(r"^([a-z_][a-z0-9_]*)\s*:\s*(.+)$", line)
 #     if rule_match:
 #       name, body = rule_match.groups()
 #       lezer_rule = _convert_rule(name, body)
 #       if lezer_rule:
 #         lezer_rules.append(lezer_rule)
 #       continue
 #
 #     # Parse terminal definitions (uppercase names)
 #     terminal_match = re.match(r"^([A-Z_][A-Z0-9_]*)\s*:\s*(.+)$", line)
 #     if terminal_match:
 #       name, pattern = terminal_match.groups()
 #       token = _convert_terminal(name, pattern)
 #       if token:
 #         tokens.append(token)
 #
 #   # Build Lezer grammar
 #   lezer_output = ["@top Start { scope+ }", ""]
 #
 #   # Add rules
 #   for rule in lezer_rules:
 #     lezer_output.append(rule)
 #
 #   lezer_output.append("")
 #   lezer_output.append("@tokens {")
 #
 #   # Add tokens
 #   for token in tokens:
 #     lezer_output.append(f"  {token}")
 #
 #   # Add common tokens
 #   lezer_output.extend([
 #       '  whitespace { $[ \\t]+ }',
 #       '  newline { $[\\n\\r] }',
 #       '  Comment { "#" ![$\\n]* }',
 #   ])
 #
 #   lezer_output.append("}")
 #   lezer_output.append("")
 #   lezer_output.append("@skip { whitespace | Comment }")
 #
 #   return "\n".join(lezer_output)
 #
 #
 # def _convert_rule(name: str, body: str) -> str:
 #   """Convert a single Lark rule to Lezer format."""
 #   # Skip internal rules (starting with _)
 #   if name.startswith("_"):
 #     return ""
 #
 #   # Convert rule name to PascalCase for Lezer
 #   lezer_name = _to_pascal_case(name)
 #
 #   # Convert body
 #   lezer_body = _convert_body(body)
 #
 #   if lezer_body:
 #     return f"{lezer_name} {{ {lezer_body} }}"
 #   return ""
 #
 #
 # def _convert_terminal(name: str, pattern: str) -> str:
 #   """Convert a Lark terminal to Lezer token format."""
 #   pattern = pattern.strip()
 #
 #   # Handle regex patterns
 #   if pattern.startswith("/") and pattern.endswith("/"):
 #     regex = pattern[1:-1]
 #     # Convert to Lezer regex format
 #     return f'{name} {{ ${regex}$ }}'
 #
 #   # Handle string literals
 #   if pattern.startswith('"') or pattern.startswith("'"):
 #     return f'{name} {{ {pattern} }}'
 #
 #   # Handle alternatives (literal strings separated by |)
 #   if "|" in pattern:
 #     alternatives = [alt.strip() for alt in pattern.split("|")]
 #     if all(alt.startswith('"') or alt.startswith("'") for alt in alternatives):
 #       return f'{name} {{ {" | ".join(alternatives)} }}'
 #
 #   return ""
 #
 #
 # def _convert_body(body: str) -> str:
 #   """Convert the body of a Lark rule to Lezer format."""
 #   # Remove inline transformations (-> name)
 #   body = re.sub(r"\s*->\s*\w+", "", body)
 #
 #   # Convert alternatives
 #   parts = []
 #   for alt in body.split("|"):
 #     alt = alt.strip()
 #     if alt:
 #       converted = _convert_sequence(alt)
 #       if converted:
 #         parts.append(converted)
 #
 #   return " | ".join(parts)
 #
 #
 # def _convert_sequence(seq: str) -> str:
 #   """Convert a sequence of items in a rule."""
 #   items = []
 #
 #   # Tokenize the sequence
 #   tokens = re.findall(
 #     r'"[^"]*"|\'[^\']*\'|/[^/]+/|\([^)]+\)|\[[^\]]+\]|[a-zA-Z_][a-zA-Z0-9_]*|\?|\*|\+',
 #     seq
 #   )
 #
 #   for token in tokens:
 #     if token.startswith('"') or token.startswith("'"):
 #       # String literal
 #       items.append(token)
 #     elif token.startswith("("):
 #       # Group
 #       inner = token[1:-1]
 #       items.append(f"({_convert_body(inner)})")
 #     elif token.startswith("["):
 #       # Optional group in Lark
 #       inner = token[1:-1]
 #       items.append(f"({_convert_body(inner)})?")
 #     elif token in ("?", "*", "+"):
 #       # Quantifiers - attach to previous item
 #       if items:
 #         items[-1] = items[-1] + token
 #     elif token.isupper() or token.startswith("_"):
 #       # Terminal reference
 #       items.append(token)
 #     elif token.islower() or "_" in token:
 #       # Rule reference - convert to PascalCase
 #       items.append(_to_pascal_case(token))
 #
 #   return " ".join(items)
 #
 #
 # def _to_pascal_case(name: str) -> str:
 #   """Convert snake_case to PascalCase."""
 #   return "".join(word.capitalize() for word in name.split("_"))
 #
 #
 # def extract_completions_from_grammar(lark_grammar: str) -> Dict[str, List[str]]:
 #   """
 #   Extract completion items from a Lark grammar.
 #
 #   Parses the grammar to find:
 #   - Keywords (reserved words like if, not, and)
 #   - Operators (==, !=, contains, etc.)
 #   - Functions (style, format, etc.)
 #   - Types (number, date, boolean, etc.)
 #   - Literals (True, False, etc.)
 #
 #   Args:
 #       lark_grammar: The Lark grammar string.
 #
 #   Returns:
 #       Dictionary with completion categories.
 #   """
 #   keywords: Set[str] = set()
 #   operators: Set[str] = set()
 #   functions: Set[str] = set()
 #   types: Set[str] = set()
 #   literals: Set[str] = set()
 #
 #   # Find all quoted strings (potential keywords/operators)
 #   quoted_strings = re.findall(r'"([^"]+)"', lark_grammar)
 #
 #   # Also look for terminal definitions with string alternatives (e.g., BOOLEAN: "True" | "False")
 #   terminal_literals = re.findall(r'[A-Z_]+:\s*"([^"]+)"(?:\s*\|\s*"([^"]+)")*', lark_grammar)
 #   for match in terminal_literals:
 #     for literal in match:
 #       if literal:
 #         quoted_strings.append(literal)
 #
 #   for s in quoted_strings:
 #     s_lower = s.lower()
 #
 #     # Classify based on pattern
 #     if s in ("==", "!=", "<=", "<", ">=", ">", "+", "-", "*", "/"):
 #       operators.add(s)
 #     elif s_lower in ("contains", "startswith", "endswith", "in", "between", "isempty", "isnotempty"):
 #       operators.add(s_lower)
 #     elif s_lower in ("if", "not", "and", "or"):
 #       keywords.add(s_lower)
 #     elif s_lower in ("true", "false"):
 #       literals.add(s)
 #     elif s_lower in ("style", "format"):
 #       functions.add(s_lower)
 #     elif s_lower in ("column", "row", "cell", "value", "col"):
 #       keywords.add(s_lower)
 #     elif s_lower in ("number", "date", "boolean", "text", "enum"):
 #       types.add(s_lower)
 #     elif s_lower == "case":
 #       keywords.add(s_lower)
 #
 #   # Find function-like patterns: word "("
 #   function_patterns = re.findall(r'"(\w+)"\s*"?\("', lark_grammar)
 #   for func in function_patterns:
 #     if func.lower() not in ("true", "false"):
 #       functions.add(func.lower())
 #
 #   # Find type patterns from format_type rule
 #   type_match = re.search(r'format_type\s*:\s*(.+?)(?:\n\n|\Z)', lark_grammar, re.DOTALL)
 #   if type_match:
 #     type_strings = re.findall(r'"(\w+)"', type_match.group(1))
 #     types.update(t.lower() for t in type_strings)
 #
 #   return {
 #       "keywords": sorted(keywords),
 #       "operators": sorted(operators),
 #       "functions": sorted(functions),
 #       "types": sorted(types),
 #       "literals": sorted(literals),
 #   }
@@ -1,12 +1,14 @@
 """
 Utilities for converting Lark grammars to CodeMirror 5 Simple Mode format.
-This module provides functions to extract regex patterns from Lark grammar
+This module provides functions to:
-terminals and generate a CodeMirror Simple Mode configuration for syntax highlighting.
+1. Extract regex patterns from Lark grammar terminals
 2. Generate CodeMirror Simple Mode configuration for syntax highlighting
 3. Extract completion items from Lark grammar (keywords, operators, etc.)
 """
 import re
-from typing import Dict, List, Any
+from typing import Dict, List, Any, Set
 def lark_to_simple_mode(lark_grammar: str) -> Dict[str, Any]:
@@ -238,3 +240,85 @@ def generate_formatting_dsl_mode() -> Dict[str, Any]:
            {"regex": r"\b[a-zA-Z_][a-zA-Z0-9_]*\b", "token": "variable"},
        ]
    }
 def extract_completions_from_grammar(lark_grammar: str) -> Dict[str, List[str]]:
    """
    Extract completion items from a Lark grammar.

    Scans the grammar text for double-quoted string literals and sorts the
    recognised ones into completion categories:
    - Keywords (if, not, and, or, column, row, cell, value, col, case)
    - Operators (==, !=, contains, etc.)
    - Functions (style, format, plus any quoted word followed by a quoted "(")
    - Types (number, date, boolean, text, enum, plus every quoted word found
      in the ``format_type`` rule)
    - Literals (True, False - the casing used in the grammar is kept)

    Quoted strings that match none of the categories are ignored.

    Args:
        lark_grammar: The Lark grammar string.

    Returns:
        Dictionary with completion categories, each value being a sorted list:
        {
            "keywords": [...],
            "operators": [...],
            "functions": [...],
            "types": [...],
            "literals": [...]
        }
    """
    # Known vocabulary; the word sets are disjoint, so lookup order is irrelevant.
    symbol_operators = ("==", "!=", "<=", "<", ">=", ">", "+", "-", "*", "/")
    word_operators = ("contains", "startswith", "endswith", "in", "between", "isempty", "isnotempty")
    keyword_words = ("if", "not", "and", "or", "column", "row", "cell", "value", "col", "case")
    literal_words = ("true", "false")
    function_words = ("style", "format")
    type_words = ("number", "date", "boolean", "text", "enum")

    keywords: Set[str] = set()
    operators: Set[str] = set()
    functions: Set[str] = set()
    types: Set[str] = set()
    literals: Set[str] = set()

    # Find all quoted strings (potential keywords/operators)
    quoted_strings = re.findall(r'"([^"]+)"', lark_grammar)

    # Terminal definitions made of string alternatives (e.g., BOOLEAN: "True" | "False")
    # are collected separately: a regex terminal containing '"' earlier in the
    # grammar can throw off the quote pairing of the scan above.
    # The whole alternative list is captured and re-scanned, because a repeated
    # capture group only keeps its last repetition and would silently drop the
    # middle alternatives of "a" | "b" | "c".
    terminal_bodies = re.findall(r'[A-Z_]+:\s*("[^"]+"(?:\s*\|\s*"[^"]+")*)', lark_grammar)
    for terminal_body in terminal_bodies:
        quoted_strings.extend(re.findall(r'"([^"]+)"', terminal_body))

    for s in quoted_strings:
        s_lower = s.lower()
        # Classify based on pattern
        if s in symbol_operators:
            operators.add(s)
        elif s_lower in word_operators:
            operators.add(s_lower)
        elif s_lower in keyword_words:
            keywords.add(s_lower)
        elif s_lower in literal_words:
            # Literals keep the casing written in the grammar (e.g. "True")
            literals.add(s)
        elif s_lower in function_words:
            functions.add(s_lower)
        elif s_lower in type_words:
            types.add(s_lower)

    # Find function-like patterns: word "("
    function_patterns = re.findall(r'"(\w+)"\s*"?\("', lark_grammar)
    for func in function_patterns:
        if func.lower() not in literal_words:
            functions.add(func.lower())

    # Find type patterns from format_type rule (the rule body runs up to the
    # first blank line or the end of the grammar)
    type_match = re.search(r'format_type\s*:\s*(.+?)(?:\n\n|\Z)', lark_grammar, re.DOTALL)
    if type_match:
        type_strings = re.findall(r'"(\w+)"', type_match.group(1))
        types.update(t.lower() for t in type_strings)

    return {
        "keywords": sorted(keywords),
        "operators": sorted(operators),
        "functions": sorted(functions),
        "types": sorted(types),
        "literals": sorted(literals),
    }
@@ -285,7 +285,9 @@ def _get_column_value_suggestions(
    return []
  try:
-    values = provider.list_column_values(scope.column_name)
+    # Use table_name from scope, or empty string as fallback
    table_name = scope.table_name or ""
    values = provider.list_column_values(table_name, scope.column_name)
    suggestions = []
    for value in values:
      if value is None:
@@ -1,172 +0,0 @@
 """Tests for lark_to_lezer module."""
 import pytest
 from myfasthtml.core.dsl.lark_to_lezer import (
  extract_completions_from_grammar,
  lark_to_lezer_grammar,
 )
 # Sample grammars for testing
 # SIMPLE_GRAMMAR: a small rule set with symbolic and word operators, a regex
 # terminal that itself contains double quotes, and a BOOLEAN terminal made of
 # string alternatives.
 SIMPLE_GRAMMAR = r'''
    start: rule+
    rule: "if" condition
    condition: "value" operator literal
    operator: "==" -> op_eq
           | "!=" -> op_ne
           | "contains" -> op_contains
    literal: QUOTED_STRING -> string_literal
          | BOOLEAN -> boolean_literal
    QUOTED_STRING: /"[^"]*"/
    BOOLEAN: "True" | "False"
 '''
 # GRAMMAR_WITH_KEYWORDS: scope/condition rules built from keyword literals
 # (column, row, cell, if, not, and, or, value) plus a function-like
 # "style" "(" ... ")" expression.
 GRAMMAR_WITH_KEYWORDS = r'''
    start: scope+
    scope: "column" NAME ":" rule
        | "row" INTEGER ":" rule
        | "cell" cell_ref ":" rule
    rule: style_expr condition?
    condition: "if" "not"? comparison
    comparison: operand "and" operand
             | operand "or" operand
    style_expr: "style" "(" args ")"
    operand: "value" | literal
 '''
 # GRAMMAR_WITH_TYPES: a lone format_type rule listing the type names.
 GRAMMAR_WITH_TYPES = r'''
    format_type: "number" -> fmt_number
              | "date" -> fmt_date
              | "boolean" -> fmt_boolean
              | "text" -> fmt_text
              | "enum" -> fmt_enum
 '''
 class TestExtractCompletions:
  """Checks for the extract_completions_from_grammar function."""

  def test_i_can_extract_keywords_from_grammar(self):
    """Keywords such as if, not and the scope words are reported."""
    result = extract_completions_from_grammar(GRAMMAR_WITH_KEYWORDS)
    for expected in ("if", "not", "column", "row", "cell", "value"):
      assert expected in result["keywords"]

  @pytest.mark.parametrize("operator", ["==", "!=", "contains"])
  def test_i_can_extract_operators_from_grammar(self, operator):
    """Every operator used by the simple grammar is reported."""
    result = extract_completions_from_grammar(SIMPLE_GRAMMAR)
    assert operator in result["operators"]

  def test_i_can_extract_functions_from_grammar(self):
    """Function-like constructs end up in the functions category."""
    result = extract_completions_from_grammar(GRAMMAR_WITH_KEYWORDS)
    assert "style" in result["functions"]

  @pytest.mark.parametrize("type_name", ["number", "date", "boolean", "text", "enum"])
  def test_i_can_extract_types_from_grammar(self, type_name):
    """Type names listed in the format_type rule are reported."""
    result = extract_completions_from_grammar(GRAMMAR_WITH_TYPES)
    assert type_name in result["types"]

  @pytest.mark.parametrize("literal", ["True", "False"])
  def test_i_can_extract_literals_from_grammar(self, literal):
    """Literal values like True/False are reported."""
    result = extract_completions_from_grammar(SIMPLE_GRAMMAR)
    assert literal in result["literals"]

  def test_i_can_extract_completions_returns_all_categories(self):
    """The result exposes every completion category."""
    result = extract_completions_from_grammar(SIMPLE_GRAMMAR)
    for category in ("keywords", "operators", "functions", "types", "literals"):
      assert category in result

  def test_i_can_extract_completions_returns_sorted_lists(self):
    """Each completion list comes back already sorted."""
    result = extract_completions_from_grammar(SIMPLE_GRAMMAR)
    for items in result.values():
      assert items == sorted(items)
 class TestLarkToLezerConversion:
  """Checks for the lark_to_lezer_grammar function."""

  def test_i_can_convert_simple_grammar_to_lezer(self):
    """A simple Lark grammar is converted to the Lezer layout."""
    output = lark_to_lezer_grammar(SIMPLE_GRAMMAR)
    # The @top directive, the @tokens block and the @skip directive must all be present
    for section in ("@top Start", "@tokens {", "@skip {"):
      assert section in output

  def test_i_can_convert_rule_names_to_pascal_case(self):
    """snake_case rule names are emitted as PascalCase."""
    source = r'''
            my_rule: other_rule
            other_rule: "test"
        '''
    output = lark_to_lezer_grammar(source)
    for rule_name in ("MyRule", "OtherRule"):
      assert rule_name in output

  def test_i_cannot_include_internal_rules_in_lezer(self):
    """Names starting with _ do not become Lezer rules."""
    source = r'''
            start: rule _NL
            rule: "test"
            _NL: /\n/
        '''
    output = lark_to_lezer_grammar(source)
    # Internal rules should not appear as Lezer rules
    assert "Nl {" not in output

  def test_i_can_convert_terminal_regex_to_lezer(self):
    """Regex terminals are carried over to the output."""
    source = r'''
            NAME: /[a-zA-Z_][a-zA-Z0-9_]*/
        '''
    output = lark_to_lezer_grammar(source)
    assert "NAME" in output

  @pytest.mark.parametrize(
    "terminal,pattern",
    [
        ('BOOLEAN: "True" | "False"', "BOOLEAN"),
        ('KEYWORD: "if"', "KEYWORD"),
    ],
  )
  def test_i_can_convert_terminal_strings_to_lezer(self, terminal, pattern):
    """String-literal terminals are carried over to the output."""
    output = lark_to_lezer_grammar(f"start: test\n{terminal}")
    assert pattern in output
@@ -34,13 +34,13 @@ class MockProvider:
    Provides predefined data for columns, values, and presets.
    """
-    def get_tables(self) -> list[str]:
+    def list_tables(self) -> list[str]:
        return ["app.orders"]
-    def get_columns(self, table: str) -> list[str]:
+    def list_columns(self, table: str) -> list[str]:
        return ["id", "amount", "status"]
-    def get_column_values(self, column: str) -> list[Any]:
+    def list_column_values(self, table: str, column: str) -> list[Any]:
        if column == "status":
            return ["draft", "pending", "approved"]
        if column == "amount":
@@ -50,10 +50,10 @@ class MockProvider:
    def get_row_count(self, table: str) -> int:
        return 150
-    def get_style_presets(self) -> list[str]:
+    def list_style_presets(self) -> list[str]:
        return ["custom_highlight"]
-    def get_format_presets(self) -> list[str]:
+    def list_format_presets(self) -> list[str]:
        return ["CHF"]
@@ -84,14 +84,14 @@ class TestFormattingDSL:
        assert completions1 is completions2
-    def test_i_can_get_lezer_grammar_is_cached(self):
+    def test_i_can_get_simple_mode_config_is_cached(self):
-        """Test that lezer_grammar property is cached (same object returned)."""
+        """Test that simple_mode_config property is cached (same object returned)."""
        dsl = FormattingDSL()
-        lezer1 = dsl.lezer_grammar
+        config1 = dsl.simple_mode_config
-        lezer2 = dsl.lezer_grammar
+        config2 = dsl.simple_mode_config
-        assert lezer1 is lezer2
+        assert config1 is config2
    def test_i_can_get_editor_config(self):
        """Test that get_editor_config() returns expected structure."""
@@ -100,6 +100,7 @@ class TestFormattingDSL:
        config = dsl.get_editor_config()
        assert "name" in config
-        assert "lezerGrammar" in config
+        assert "simpleModeConfig" in config
        assert "completions" in config
        assert config["name"] == "Formatting DSL"
        assert "start" in config["simpleModeConfig"]  # Simple Mode structure