Added syntax colorization. Removed all references to the deprecated lark_to_lezer module.

This commit is contained in:
2026-02-07 11:08:34 +01:00
parent 1c1ced2a9f
commit ab4f251f0c
7 changed files with 106 additions and 461 deletions

View File

@@ -9,10 +9,7 @@ from abc import ABC, abstractmethod
from functools import cached_property
from typing import List, Dict, Any
# TODO: Replace with lark_to_simple_mode when implemented
from myfasthtml.core.dsl.lark_to_lezer import (
extract_completions_from_grammar, # Will be moved to utils.py
)
from myfasthtml.core.dsl.lark_to_simple_mode import extract_completions_from_grammar
from myfasthtml.core.utils import make_safe_id
@@ -82,13 +79,13 @@ class DSLDefinition(ABC):
Returns:
Dictionary with:
- 'lezerGrammar': The Lezer grammar string
- 'simpleModeConfig': The CodeMirror Simple Mode configuration
- 'completions': The completion items
- 'name': The DSL name
"""
return {
"name": self.name,
"lezerGrammar": self.lezer_grammar,
"simpleModeConfig": self.simple_mode_config,
"completions": self.completions,
}

View File

@@ -1,267 +0,0 @@
# """
# DEPRECATED: Utilities for converting Lark grammars to Lezer format.
#
# ⚠️ WARNING: This module is deprecated and will be removed in a future version.
#
# Original purpose:
# - Transform a Lark grammar to a Lezer grammar for CodeMirror 6
# - Extract completion items (keywords, operators, etc.) from a Lark grammar
#
# Deprecation reason:
# - CodeMirror 6 requires a bundler (Webpack, Rollup, etc.)
# - Incompatible with FastHTML's direct script inclusion approach
# - Replaced by CodeMirror 5 Simple Mode (see lark_to_simple_mode.py)
#
# Migration path:
# - Use lark_to_simple_mode.py for CodeMirror 5 syntax highlighting
# - extract_completions_from_grammar() is still used and will be moved to utils.py
# """
#
# import re
# from typing import Dict, List, Set
#
#
# def lark_to_lezer_grammar(lark_grammar: str) -> str:
# """
# Convert a Lark grammar to a Lezer grammar.
#
# This is a simplified converter that handles common Lark patterns.
# Complex grammars may require manual adjustment.
#
# Args:
# lark_grammar: The Lark grammar string.
#
# Returns:
# The Lezer grammar string.
# """
# lines = lark_grammar.strip().split("\n")
# lezer_rules = []
# tokens = []
#
# for line in lines:
# line = line.strip()
#
# # Skip empty lines and comments
# if not line or line.startswith("//") or line.startswith("#"):
# continue
#
# # Skip Lark-specific directives
# if line.startswith("%"):
# continue
#
# # Parse rule definitions (lowercase names only)
# rule_match = re.match(r"^([a-z_][a-z0-9_]*)\s*:\s*(.+)$", line)
# if rule_match:
# name, body = rule_match.groups()
# lezer_rule = _convert_rule(name, body)
# if lezer_rule:
# lezer_rules.append(lezer_rule)
# continue
#
# # Parse terminal definitions (uppercase names)
# terminal_match = re.match(r"^([A-Z_][A-Z0-9_]*)\s*:\s*(.+)$", line)
# if terminal_match:
# name, pattern = terminal_match.groups()
# token = _convert_terminal(name, pattern)
# if token:
# tokens.append(token)
#
# # Build Lezer grammar
# lezer_output = ["@top Start { scope+ }", ""]
#
# # Add rules
# for rule in lezer_rules:
# lezer_output.append(rule)
#
# lezer_output.append("")
# lezer_output.append("@tokens {")
#
# # Add tokens
# for token in tokens:
# lezer_output.append(f" {token}")
#
# # Add common tokens
# lezer_output.extend([
# ' whitespace { $[ \\t]+ }',
# ' newline { $[\\n\\r] }',
# ' Comment { "#" ![$\\n]* }',
# ])
#
# lezer_output.append("}")
# lezer_output.append("")
# lezer_output.append("@skip { whitespace | Comment }")
#
# return "\n".join(lezer_output)
#
#
# def _convert_rule(name: str, body: str) -> str:
# """Convert a single Lark rule to Lezer format."""
# # Skip internal rules (starting with _)
# if name.startswith("_"):
# return ""
#
# # Convert rule name to PascalCase for Lezer
# lezer_name = _to_pascal_case(name)
#
# # Convert body
# lezer_body = _convert_body(body)
#
# if lezer_body:
# return f"{lezer_name} {{ {lezer_body} }}"
# return ""
#
#
# def _convert_terminal(name: str, pattern: str) -> str:
# """Convert a Lark terminal to Lezer token format."""
# pattern = pattern.strip()
#
# # Handle regex patterns
# if pattern.startswith("/") and pattern.endswith("/"):
# regex = pattern[1:-1]
# # Convert to Lezer regex format
# return f'{name} {{ ${regex}$ }}'
#
# # Handle string literals
# if pattern.startswith('"') or pattern.startswith("'"):
# return f'{name} {{ {pattern} }}'
#
# # Handle alternatives (literal strings separated by |)
# if "|" in pattern:
# alternatives = [alt.strip() for alt in pattern.split("|")]
# if all(alt.startswith('"') or alt.startswith("'") for alt in alternatives):
# return f'{name} {{ {" | ".join(alternatives)} }}'
#
# return ""
#
#
# def _convert_body(body: str) -> str:
# """Convert the body of a Lark rule to Lezer format."""
# # Remove inline transformations (-> name)
# body = re.sub(r"\s*->\s*\w+", "", body)
#
# # Convert alternatives
# parts = []
# for alt in body.split("|"):
# alt = alt.strip()
# if alt:
# converted = _convert_sequence(alt)
# if converted:
# parts.append(converted)
#
# return " | ".join(parts)
#
#
# def _convert_sequence(seq: str) -> str:
# """Convert a sequence of items in a rule."""
# items = []
#
# # Tokenize the sequence
# tokens = re.findall(
# r'"[^"]*"|\'[^\']*\'|/[^/]+/|\([^)]+\)|\[[^\]]+\]|[a-zA-Z_][a-zA-Z0-9_]*|\?|\*|\+',
# seq
# )
#
# for token in tokens:
# if token.startswith('"') or token.startswith("'"):
# # String literal
# items.append(token)
# elif token.startswith("("):
# # Group
# inner = token[1:-1]
# items.append(f"({_convert_body(inner)})")
# elif token.startswith("["):
# # Optional group in Lark
# inner = token[1:-1]
# items.append(f"({_convert_body(inner)})?")
# elif token in ("?", "*", "+"):
# # Quantifiers - attach to previous item
# if items:
# items[-1] = items[-1] + token
# elif token.isupper() or token.startswith("_"):
# # Terminal reference
# items.append(token)
# elif token.islower() or "_" in token:
# # Rule reference - convert to PascalCase
# items.append(_to_pascal_case(token))
#
# return " ".join(items)
#
#
# def _to_pascal_case(name: str) -> str:
# """Convert snake_case to PascalCase."""
# return "".join(word.capitalize() for word in name.split("_"))
#
#
# def extract_completions_from_grammar(lark_grammar: str) -> Dict[str, List[str]]:
# """
# Extract completion items from a Lark grammar.
#
# Parses the grammar to find:
# - Keywords (reserved words like if, not, and)
# - Operators (==, !=, contains, etc.)
# - Functions (style, format, etc.)
# - Types (number, date, boolean, etc.)
# - Literals (True, False, etc.)
#
# Args:
# lark_grammar: The Lark grammar string.
#
# Returns:
# Dictionary with completion categories.
# """
# keywords: Set[str] = set()
# operators: Set[str] = set()
# functions: Set[str] = set()
# types: Set[str] = set()
# literals: Set[str] = set()
#
# # Find all quoted strings (potential keywords/operators)
# quoted_strings = re.findall(r'"([^"]+)"', lark_grammar)
#
# # Also look for terminal definitions with string alternatives (e.g., BOOLEAN: "True" | "False")
# terminal_literals = re.findall(r'[A-Z_]+:\s*"([^"]+)"(?:\s*\|\s*"([^"]+)")*', lark_grammar)
# for match in terminal_literals:
# for literal in match:
# if literal:
# quoted_strings.append(literal)
#
# for s in quoted_strings:
# s_lower = s.lower()
#
# # Classify based on pattern
# if s in ("==", "!=", "<=", "<", ">=", ">", "+", "-", "*", "/"):
# operators.add(s)
# elif s_lower in ("contains", "startswith", "endswith", "in", "between", "isempty", "isnotempty"):
# operators.add(s_lower)
# elif s_lower in ("if", "not", "and", "or"):
# keywords.add(s_lower)
# elif s_lower in ("true", "false"):
# literals.add(s)
# elif s_lower in ("style", "format"):
# functions.add(s_lower)
# elif s_lower in ("column", "row", "cell", "value", "col"):
# keywords.add(s_lower)
# elif s_lower in ("number", "date", "boolean", "text", "enum"):
# types.add(s_lower)
# elif s_lower == "case":
# keywords.add(s_lower)
#
# # Find function-like patterns: word "("
# function_patterns = re.findall(r'"(\w+)"\s*"?\("', lark_grammar)
# for func in function_patterns:
# if func.lower() not in ("true", "false"):
# functions.add(func.lower())
#
# # Find type patterns from format_type rule
# type_match = re.search(r'format_type\s*:\s*(.+?)(?:\n\n|\Z)', lark_grammar, re.DOTALL)
# if type_match:
# type_strings = re.findall(r'"(\w+)"', type_match.group(1))
# types.update(t.lower() for t in type_strings)
#
# return {
# "keywords": sorted(keywords),
# "operators": sorted(operators),
# "functions": sorted(functions),
# "types": sorted(types),
# "literals": sorted(literals),
# }

View File

@@ -1,12 +1,14 @@
"""
Utilities for converting Lark grammars to CodeMirror 5 Simple Mode format.
This module provides functions to extract regex patterns from Lark grammar
terminals and generate a CodeMirror Simple Mode configuration for syntax highlighting.
This module provides functions to:
1. Extract regex patterns from Lark grammar terminals
2. Generate CodeMirror Simple Mode configuration for syntax highlighting
3. Extract completion items from Lark grammar (keywords, operators, etc.)
"""
import re
from typing import Dict, List, Any
from typing import Dict, List, Any, Set
def lark_to_simple_mode(lark_grammar: str) -> Dict[str, Any]:
@@ -238,3 +240,85 @@ def generate_formatting_dsl_mode() -> Dict[str, Any]:
{"regex": r"\b[a-zA-Z_][a-zA-Z0-9_]*\b", "token": "variable"},
]
}
def extract_completions_from_grammar(lark_grammar: str) -> Dict[str, List[str]]:
    """
    Extract completion items from a Lark grammar.

    Scans the grammar text for quoted string literals and classifies them
    into completion categories:
    - Keywords (reserved words like if, not, and)
    - Operators (==, !=, contains, etc.)
    - Functions (style, format, etc.)
    - Types (number, date, boolean, etc.)
    - Literals (True, False, etc.)

    Args:
        lark_grammar: The Lark grammar string.

    Returns:
        Dictionary with completion categories:
        {
            "keywords": [...],
            "operators": [...],
            "functions": [...],
            "types": [...],
            "literals": [...]
        }
        Each list is sorted alphabetically.
    """
    # Symbolic operators are matched case-sensitively.
    symbol_operators = ("==", "!=", "<=", "<", ">=", ">", "+", "-", "*", "/")
    # Lower-cased word -> target category; the word sets are disjoint, so a
    # single lookup table reproduces the original classification chain.
    word_categories: Dict[str, str] = {}
    for word in ("contains", "startswith", "endswith", "in", "between", "isempty", "isnotempty"):
        word_categories[word] = "operators"
    for word in ("if", "not", "and", "or", "column", "row", "cell", "value", "col", "case"):
        word_categories[word] = "keywords"
    for word in ("style", "format"):
        word_categories[word] = "functions"
    for word in ("number", "date", "boolean", "text", "enum"):
        word_categories[word] = "types"

    buckets: Dict[str, Set[str]] = {
        "keywords": set(),
        "operators": set(),
        "functions": set(),
        "types": set(),
        "literals": set(),
    }

    # Every quoted string is a candidate keyword/operator/literal.
    candidates = re.findall(r'"([^"]+)"', lark_grammar)

    # Also scan terminal definitions with string alternatives
    # (e.g. BOOLEAN: "True" | "False").
    for groups in re.findall(r'[A-Z_]+:\s*"([^"]+)"(?:\s*\|\s*"([^"]+)")*', lark_grammar):
        candidates.extend(literal for literal in groups if literal)

    for candidate in candidates:
        lowered = candidate.lower()
        if candidate in symbol_operators:
            buckets["operators"].add(candidate)
        elif lowered in ("true", "false"):
            # Boolean literals keep their original casing.
            buckets["literals"].add(candidate)
        elif lowered in word_categories:
            buckets[word_categories[lowered]].add(lowered)

    # Function-like patterns: a quoted word immediately followed by "(".
    for func_name in re.findall(r'"(\w+)"\s*"?\("', lark_grammar):
        if func_name.lower() not in ("true", "false"):
            buckets["functions"].add(func_name.lower())

    # Every alternative of the format_type rule is treated as a type name.
    format_rule = re.search(r'format_type\s*:\s*(.+?)(?:\n\n|\Z)', lark_grammar, re.DOTALL)
    if format_rule:
        buckets["types"].update(
            name.lower() for name in re.findall(r'"(\w+)"', format_rule.group(1))
        )

    return {category: sorted(items) for category, items in buckets.items()}

View File

@@ -283,9 +283,11 @@ def _get_column_value_suggestions(
"""Get column value suggestions based on the current scope."""
if not scope.column_name:
return []
try:
values = provider.list_column_values(scope.column_name)
# Use table_name from scope, or empty string as fallback
table_name = scope.table_name or ""
values = provider.list_column_values(table_name, scope.column_name)
suggestions = []
for value in values:
if value is None:

View File

@@ -1,172 +0,0 @@
"""Tests for lark_to_lezer module."""
import pytest
from myfasthtml.core.dsl.lark_to_lezer import (
extract_completions_from_grammar,
lark_to_lezer_grammar,
)
# Sample grammars for testing
# Minimal grammar exercising symbolic/word operators plus a quoted-string
# terminal (regex form) and a boolean terminal with string alternatives.
SIMPLE_GRAMMAR = r'''
start: rule+
rule: "if" condition
condition: "value" operator literal
operator: "==" -> op_eq
| "!=" -> op_ne
| "contains" -> op_contains
literal: QUOTED_STRING -> string_literal
| BOOLEAN -> boolean_literal
QUOTED_STRING: /"[^"]*"/
BOOLEAN: "True" | "False"
'''
# Grammar covering scope keywords (column/row/cell), boolean logic
# (and/or/not) and a function-style construct: "style" "(" ... ")".
GRAMMAR_WITH_KEYWORDS = r'''
start: scope+
scope: "column" NAME ":" rule
| "row" INTEGER ":" rule
| "cell" cell_ref ":" rule
rule: style_expr condition?
condition: "if" "not"? comparison
comparison: operand "and" operand
| operand "or" operand
style_expr: "style" "(" args ")"
operand: "value" | literal
'''
# Grammar containing only a format_type rule; used to verify type extraction.
GRAMMAR_WITH_TYPES = r'''
format_type: "number" -> fmt_number
| "date" -> fmt_date
| "boolean" -> fmt_boolean
| "text" -> fmt_text
| "enum" -> fmt_enum
'''
class TestExtractCompletions:
    """Tests for extract_completions_from_grammar function."""

    def test_i_can_extract_keywords_from_grammar(self):
        """Test that keywords like if, not, and are extracted."""
        result = extract_completions_from_grammar(GRAMMAR_WITH_KEYWORDS)
        for word in ("if", "not", "column", "row", "cell", "value"):
            assert word in result["keywords"]

    @pytest.mark.parametrize("operator", ["==", "!=", "contains"])
    def test_i_can_extract_operators_from_grammar(self, operator):
        """Test that operators are extracted from grammar."""
        result = extract_completions_from_grammar(SIMPLE_GRAMMAR)
        assert operator in result["operators"]

    def test_i_can_extract_functions_from_grammar(self):
        """Test that function-like constructs are extracted."""
        result = extract_completions_from_grammar(GRAMMAR_WITH_KEYWORDS)
        assert "style" in result["functions"]

    @pytest.mark.parametrize(
        "type_name",
        ["number", "date", "boolean", "text", "enum"],
    )
    def test_i_can_extract_types_from_grammar(self, type_name):
        """Test that type names are extracted from format_type rule."""
        result = extract_completions_from_grammar(GRAMMAR_WITH_TYPES)
        assert type_name in result["types"]

    @pytest.mark.parametrize("literal", ["True", "False"])
    def test_i_can_extract_literals_from_grammar(self, literal):
        """Test that literal values like True/False are extracted."""
        result = extract_completions_from_grammar(SIMPLE_GRAMMAR)
        assert literal in result["literals"]

    def test_i_can_extract_completions_returns_all_categories(self):
        """Test that all completion categories are present in result."""
        result = extract_completions_from_grammar(SIMPLE_GRAMMAR)
        for category in ("keywords", "operators", "functions", "types", "literals"):
            assert category in result

    def test_i_can_extract_completions_returns_sorted_lists(self):
        """Test that completion lists are sorted alphabetically."""
        result = extract_completions_from_grammar(SIMPLE_GRAMMAR)
        for items in result.values():
            assert items == sorted(items)
class TestLarkToLezerConversion:
    """Tests for lark_to_lezer_grammar function."""

    def test_i_can_convert_simple_grammar_to_lezer(self):
        """Test that a simple Lark grammar is converted to Lezer format."""
        output = lark_to_lezer_grammar(SIMPLE_GRAMMAR)
        # The generated grammar must contain the top rule, the token
        # block, and the skip directive.
        for marker in ("@top Start", "@tokens {", "@skip {"):
            assert marker in output

    def test_i_can_convert_rule_names_to_pascal_case(self):
        """Test that snake_case rule names become PascalCase."""
        source = r'''
my_rule: other_rule
other_rule: "test"
'''
        output = lark_to_lezer_grammar(source)
        assert "MyRule" in output
        assert "OtherRule" in output

    def test_i_cannot_include_internal_rules_in_lezer(self):
        """Test that rules starting with _ are not included."""
        source = r'''
start: rule _NL
rule: "test"
_NL: /\n/
'''
        output = lark_to_lezer_grammar(source)
        # Internal rules should not appear as Lezer rules
        assert "Nl {" not in output

    def test_i_can_convert_terminal_regex_to_lezer(self):
        """Test that terminal regex patterns are converted."""
        source = r'''
NAME: /[a-zA-Z_][a-zA-Z0-9_]*/
'''
        output = lark_to_lezer_grammar(source)
        assert "NAME" in output

    @pytest.mark.parametrize(
        "terminal,pattern",
        [
            ('BOOLEAN: "True" | "False"', "BOOLEAN"),
            ('KEYWORD: "if"', "KEYWORD"),
        ],
    )
    def test_i_can_convert_terminal_strings_to_lezer(self, terminal, pattern):
        """Test that terminal string literals are converted."""
        source = f"start: test\n{terminal}"
        output = lark_to_lezer_grammar(source)
        assert pattern in output

View File

@@ -34,13 +34,13 @@ class MockProvider:
Provides predefined data for columns, values, and presets.
"""
def get_tables(self) -> list[str]:
def list_tables(self) -> list[str]:
return ["app.orders"]
def get_columns(self, table: str) -> list[str]:
def list_columns(self, table: str) -> list[str]:
return ["id", "amount", "status"]
def get_column_values(self, column: str) -> list[Any]:
def list_column_values(self, table: str, column: str) -> list[Any]:
if column == "status":
return ["draft", "pending", "approved"]
if column == "amount":
@@ -50,10 +50,10 @@ class MockProvider:
def get_row_count(self, table: str) -> int:
return 150
def get_style_presets(self) -> list[str]:
def list_style_presets(self) -> list[str]:
return ["custom_highlight"]
def get_format_presets(self) -> list[str]:
def list_format_presets(self) -> list[str]:
return ["CHF"]

View File

@@ -84,14 +84,14 @@ class TestFormattingDSL:
assert completions1 is completions2
def test_i_can_get_lezer_grammar_is_cached(self):
"""Test that lezer_grammar property is cached (same object returned)."""
def test_i_can_get_simple_mode_config_is_cached(self):
"""Test that simple_mode_config property is cached (same object returned)."""
dsl = FormattingDSL()
lezer1 = dsl.lezer_grammar
lezer2 = dsl.lezer_grammar
config1 = dsl.simple_mode_config
config2 = dsl.simple_mode_config
assert lezer1 is lezer2
assert config1 is config2
def test_i_can_get_editor_config(self):
"""Test that get_editor_config() returns expected structure."""
@@ -100,6 +100,7 @@ class TestFormattingDSL:
config = dsl.get_editor_config()
assert "name" in config
assert "lezerGrammar" in config
assert "simpleModeConfig" in config
assert "completions" in config
assert config["name"] == "Formatting DSL"
assert "start" in config["simpleModeConfig"] # Simple Mode structure