Added syntax colorization
This commit is contained in:
@@ -9,9 +9,9 @@ from abc import ABC, abstractmethod
|
||||
from functools import cached_property
|
||||
from typing import List, Dict, Any
|
||||
|
||||
# TODO: Replace with lark_to_simple_mode when implemented
|
||||
from myfasthtml.core.dsl.lark_to_lezer import (
|
||||
lark_to_lezer_grammar,
|
||||
extract_completions_from_grammar,
|
||||
extract_completions_from_grammar, # Will be moved to utils.py
|
||||
)
|
||||
from myfasthtml.core.utils import make_safe_id
|
||||
|
||||
@@ -39,18 +39,6 @@ class DSLDefinition(ABC):
|
||||
"""
|
||||
pass
|
||||
|
||||
@cached_property
def lezer_grammar(self) -> str:
    """
    Lezer grammar derived from this definition's Lark grammar.

    Computed once from ``get_grammar()`` and memoized by ``cached_property``.

    Returns:
        The Lezer grammar as a string.
    """
    grammar_source = self.get_grammar()
    return lark_to_lezer_grammar(grammar_source)
|
||||
|
||||
@cached_property
|
||||
def completions(self) -> Dict[str, List[str]]:
|
||||
"""
|
||||
@@ -68,6 +56,26 @@ class DSLDefinition(ABC):
|
||||
"""
|
||||
return extract_completions_from_grammar(self.get_grammar())
|
||||
|
||||
@cached_property
def simple_mode_config(self) -> Dict[str, Any]:
    """
    CodeMirror 5 Simple Mode configuration for syntax highlighting.

    Computed once from the Lark grammar and memoized by ``cached_property``.

    Returns:
        Dictionary with Simple Mode rules of the form::

            {
                "start": [
                    {"regex": "...", "token": "keyword"},
                    {"regex": "...", "token": "string"},
                    ...
                ]
            }
    """
    # Imported locally: the converter module is only needed here.
    from myfasthtml.core.dsl.lark_to_simple_mode import lark_to_simple_mode

    return lark_to_simple_mode(self.get_grammar())
|
||||
|
||||
def get_editor_config(self) -> Dict[str, Any]:
|
||||
"""
|
||||
Return the configuration for the DslEditor JavaScript initialization.
|
||||
|
||||
@@ -1,256 +1,267 @@
|
||||
"""
|
||||
Utilities for converting Lark grammars to Lezer format and extracting completions.
|
||||
|
||||
This module provides functions to:
|
||||
1. Transform a Lark grammar to a Lezer grammar for CodeMirror
|
||||
2. Extract completion items (keywords, operators, etc.) from a Lark grammar
|
||||
"""
|
||||
|
||||
import re
|
||||
from typing import Dict, List, Set
|
||||
|
||||
|
||||
def lark_to_lezer_grammar(lark_grammar: str) -> str:
    """
    Translate a Lark grammar into a Lezer grammar string.

    Only common Lark constructs are handled; intricate grammars may still
    need manual touch-up after conversion.

    Args:
        lark_grammar: Source grammar in Lark syntax.

    Returns:
        The Lezer grammar text.
    """
    rule_def = re.compile(r"^([a-z_][a-z0-9_]*)\s*:\s*(.+)$")
    terminal_def = re.compile(r"^([A-Z_][A-Z0-9_]*)\s*:\s*(.+)$")

    converted_rules = []
    converted_tokens = []

    for raw_line in lark_grammar.strip().split("\n"):
        stripped = raw_line.strip()

        # Blank lines, comments and Lark directives (%...) carry no
        # grammar content to convert.
        if not stripped or stripped.startswith(("//", "#", "%")):
            continue

        # Rule definitions use lowercase names.
        matched = rule_def.match(stripped)
        if matched:
            rule_text = _convert_rule(*matched.groups())
            if rule_text:
                converted_rules.append(rule_text)
            continue

        # Terminal definitions use uppercase names.
        matched = terminal_def.match(stripped)
        if matched:
            token_text = _convert_terminal(*matched.groups())
            if token_text:
                converted_tokens.append(token_text)

    # Assemble the Lezer grammar: top rule, named rules, token block, skip set.
    output = ["@top Start { scope+ }", ""]
    output.extend(converted_rules)
    output.extend(["", "@tokens {"])
    output.extend(f"  {tok}" for tok in converted_tokens)
    output.extend([
        ' whitespace { $[ \\t]+ }',
        ' newline { $[\\n\\r] }',
        ' Comment { "#" ![$\\n]* }',
    ])
    output.extend(["}", "", "@skip { whitespace | Comment }"])

    return "\n".join(output)
|
||||
|
||||
|
||||
def _convert_rule(name: str, body: str) -> str:
    """Translate one Lark rule into a Lezer rule definition, or '' if skipped."""
    # Underscore-prefixed rules are internal/inlined in Lark; emit nothing.
    if name.startswith("_"):
        return ""

    converted_body = _convert_body(body)
    if not converted_body:
        return ""

    # Lezer rule names are conventionally PascalCase.
    return f"{_to_pascal_case(name)} {{ {converted_body} }}"
|
||||
|
||||
|
||||
def _convert_terminal(name: str, pattern: str) -> str:
|
||||
"""Convert a Lark terminal to Lezer token format."""
|
||||
pattern = pattern.strip()
|
||||
|
||||
# Handle regex patterns
|
||||
if pattern.startswith("/") and pattern.endswith("/"):
|
||||
regex = pattern[1:-1]
|
||||
# Convert to Lezer regex format
|
||||
return f'{name} {{ ${regex}$ }}'
|
||||
|
||||
# Handle string literals
|
||||
if pattern.startswith('"') or pattern.startswith("'"):
|
||||
return f'{name} {{ {pattern} }}'
|
||||
|
||||
# Handle alternatives (literal strings separated by |)
|
||||
if "|" in pattern:
|
||||
alternatives = [alt.strip() for alt in pattern.split("|")]
|
||||
if all(alt.startswith('"') or alt.startswith("'") for alt in alternatives):
|
||||
return f'{name} {{ {" | ".join(alternatives)} }}'
|
||||
|
||||
return ""
|
||||
|
||||
|
||||
def _convert_body(body: str) -> str:
    """Translate the right-hand side of a Lark rule to Lezer syntax."""
    # Lark alias annotations ("-> name") have no Lezer counterpart.
    body = re.sub(r"\s*->\s*\w+", "", body)

    # Convert each non-empty alternative, keeping only non-empty results.
    converted = [
        _convert_sequence(alternative.strip())
        for alternative in body.split("|")
        if alternative.strip()
    ]
    return " | ".join(part for part in converted if part)
|
||||
|
||||
|
||||
def _convert_sequence(seq: str) -> str:
    """Translate one alternative (a sequence of items) to Lezer syntax."""
    # Split into literals, regexes, groups, optional groups, names and
    # standalone quantifiers.
    pieces = re.findall(
        r'"[^"]*"|\'[^\']*\'|/[^/]+/|\([^)]+\)|\[[^\]]+\]|[a-zA-Z_][a-zA-Z0-9_]*|\?|\*|\+',
        seq
    )

    out = []
    for piece in pieces:
        if piece[:1] in ('"', "'"):
            # Quoted literal: emitted verbatim.
            out.append(piece)
        elif piece.startswith("("):
            # Parenthesised group: convert its contents recursively.
            out.append(f"({_convert_body(piece[1:-1])})")
        elif piece.startswith("["):
            # Lark's [...] is an optional group; Lezer spells that (...)?.
            out.append(f"({_convert_body(piece[1:-1])})?")
        elif piece in ("?", "*", "+"):
            # Quantifier: fold onto the preceding item, if any.
            if out:
                out[-1] += piece
        elif piece.isupper() or piece.startswith("_"):
            # Terminal reference: kept as-is.
            out.append(piece)
        elif piece.islower() or "_" in piece:
            # Rule reference: Lezer rule names are PascalCase.
            out.append(_to_pascal_case(piece))
        # Pieces matching none of the branches are dropped, as before.

    return " ".join(out)
|
||||
|
||||
|
||||
def _to_pascal_case(name: str) -> str:
|
||||
"""Convert snake_case to PascalCase."""
|
||||
return "".join(word.capitalize() for word in name.split("_"))
|
||||
|
||||
|
||||
def extract_completions_from_grammar(lark_grammar: str) -> Dict[str, List[str]]:
    """
    Extract completion items from a Lark grammar.

    Scans every double-quoted literal in the grammar and classifies it as a
    keyword, operator, function, type or literal based on a known DSL
    vocabulary, then supplements the result with function-call patterns and
    the ``format_type`` rule, if present.

    Args:
        lark_grammar: The Lark grammar string.

    Returns:
        Dictionary with sorted completion lists under the keys
        "keywords", "operators", "functions", "types" and "literals".
    """
    keywords: Set[str] = set()
    operators: Set[str] = set()
    functions: Set[str] = set()
    types: Set[str] = set()
    literals: Set[str] = set()

    # Every double-quoted string anywhere in the grammar is a candidate.
    # This already covers terminal definitions such as
    # BOOLEAN: "True" | "False", so no separate per-terminal scan is needed
    # (the previous one was redundant, and its repeated regex group only
    # captured the last alternative of each definition anyway).
    quoted_strings = re.findall(r'"([^"]+)"', lark_grammar)

    for s in quoted_strings:
        s_lower = s.lower()

        # Classify based on known DSL vocabulary.
        if s in ("==", "!=", "<=", "<", ">=", ">", "+", "-", "*", "/"):
            operators.add(s)
        elif s_lower in ("contains", "startswith", "endswith", "in", "between", "isempty", "isnotempty"):
            operators.add(s_lower)
        elif s_lower in ("if", "not", "and", "or"):
            keywords.add(s_lower)
        elif s_lower in ("true", "false"):
            # Boolean literals keep their original casing (e.g. "True").
            literals.add(s)
        elif s_lower in ("style", "format"):
            functions.add(s_lower)
        elif s_lower in ("column", "row", "cell", "value", "col"):
            keywords.add(s_lower)
        elif s_lower in ("number", "date", "boolean", "text", "enum"):
            types.add(s_lower)
        elif s_lower == "case":
            keywords.add(s_lower)

    # Function-like patterns: a quoted word immediately followed by "(".
    function_patterns = re.findall(r'"(\w+)"\s*"?\("', lark_grammar)
    for func in function_patterns:
        if func.lower() not in ("true", "false"):
            functions.add(func.lower())

    # Types declared in a dedicated format_type rule, if present.
    type_match = re.search(r'format_type\s*:\s*(.+?)(?:\n\n|\Z)', lark_grammar, re.DOTALL)
    if type_match:
        type_strings = re.findall(r'"(\w+)"', type_match.group(1))
        types.update(t.lower() for t in type_strings)

    return {
        "keywords": sorted(keywords),
        "operators": sorted(operators),
        "functions": sorted(functions),
        "types": sorted(types),
        "literals": sorted(literals),
    }
|
||||
# """
|
||||
# DEPRECATED: Utilities for converting Lark grammars to Lezer format.
|
||||
#
|
||||
# ⚠️ WARNING: This module is deprecated and will be removed in a future version.
|
||||
#
|
||||
# Original purpose:
|
||||
# - Transform a Lark grammar to a Lezer grammar for CodeMirror 6
|
||||
# - Extract completion items (keywords, operators, etc.) from a Lark grammar
|
||||
#
|
||||
# Deprecation reason:
|
||||
# - CodeMirror 6 requires a bundler (Webpack, Rollup, etc.)
|
||||
# - Incompatible with FastHTML's direct script inclusion approach
|
||||
# - Replaced by CodeMirror 5 Simple Mode (see lark_to_simple_mode.py)
|
||||
#
|
||||
# Migration path:
|
||||
# - Use lark_to_simple_mode.py for CodeMirror 5 syntax highlighting
|
||||
# - extract_completions_from_grammar() is still used and will be moved to utils.py
|
||||
# """
|
||||
#
|
||||
# import re
|
||||
# from typing import Dict, List, Set
|
||||
#
|
||||
#
|
||||
# def lark_to_lezer_grammar(lark_grammar: str) -> str:
|
||||
# """
|
||||
# Convert a Lark grammar to a Lezer grammar.
|
||||
#
|
||||
# This is a simplified converter that handles common Lark patterns.
|
||||
# Complex grammars may require manual adjustment.
|
||||
#
|
||||
# Args:
|
||||
# lark_grammar: The Lark grammar string.
|
||||
#
|
||||
# Returns:
|
||||
# The Lezer grammar string.
|
||||
# """
|
||||
# lines = lark_grammar.strip().split("\n")
|
||||
# lezer_rules = []
|
||||
# tokens = []
|
||||
#
|
||||
# for line in lines:
|
||||
# line = line.strip()
|
||||
#
|
||||
# # Skip empty lines and comments
|
||||
# if not line or line.startswith("//") or line.startswith("#"):
|
||||
# continue
|
||||
#
|
||||
# # Skip Lark-specific directives
|
||||
# if line.startswith("%"):
|
||||
# continue
|
||||
#
|
||||
# # Parse rule definitions (lowercase names only)
|
||||
# rule_match = re.match(r"^([a-z_][a-z0-9_]*)\s*:\s*(.+)$", line)
|
||||
# if rule_match:
|
||||
# name, body = rule_match.groups()
|
||||
# lezer_rule = _convert_rule(name, body)
|
||||
# if lezer_rule:
|
||||
# lezer_rules.append(lezer_rule)
|
||||
# continue
|
||||
#
|
||||
# # Parse terminal definitions (uppercase names)
|
||||
# terminal_match = re.match(r"^([A-Z_][A-Z0-9_]*)\s*:\s*(.+)$", line)
|
||||
# if terminal_match:
|
||||
# name, pattern = terminal_match.groups()
|
||||
# token = _convert_terminal(name, pattern)
|
||||
# if token:
|
||||
# tokens.append(token)
|
||||
#
|
||||
# # Build Lezer grammar
|
||||
# lezer_output = ["@top Start { scope+ }", ""]
|
||||
#
|
||||
# # Add rules
|
||||
# for rule in lezer_rules:
|
||||
# lezer_output.append(rule)
|
||||
#
|
||||
# lezer_output.append("")
|
||||
# lezer_output.append("@tokens {")
|
||||
#
|
||||
# # Add tokens
|
||||
# for token in tokens:
|
||||
# lezer_output.append(f" {token}")
|
||||
#
|
||||
# # Add common tokens
|
||||
# lezer_output.extend([
|
||||
# ' whitespace { $[ \\t]+ }',
|
||||
# ' newline { $[\\n\\r] }',
|
||||
# ' Comment { "#" ![$\\n]* }',
|
||||
# ])
|
||||
#
|
||||
# lezer_output.append("}")
|
||||
# lezer_output.append("")
|
||||
# lezer_output.append("@skip { whitespace | Comment }")
|
||||
#
|
||||
# return "\n".join(lezer_output)
|
||||
#
|
||||
#
|
||||
# def _convert_rule(name: str, body: str) -> str:
|
||||
# """Convert a single Lark rule to Lezer format."""
|
||||
# # Skip internal rules (starting with _)
|
||||
# if name.startswith("_"):
|
||||
# return ""
|
||||
#
|
||||
# # Convert rule name to PascalCase for Lezer
|
||||
# lezer_name = _to_pascal_case(name)
|
||||
#
|
||||
# # Convert body
|
||||
# lezer_body = _convert_body(body)
|
||||
#
|
||||
# if lezer_body:
|
||||
# return f"{lezer_name} {{ {lezer_body} }}"
|
||||
# return ""
|
||||
#
|
||||
#
|
||||
# def _convert_terminal(name: str, pattern: str) -> str:
|
||||
# """Convert a Lark terminal to Lezer token format."""
|
||||
# pattern = pattern.strip()
|
||||
#
|
||||
# # Handle regex patterns
|
||||
# if pattern.startswith("/") and pattern.endswith("/"):
|
||||
# regex = pattern[1:-1]
|
||||
# # Convert to Lezer regex format
|
||||
# return f'{name} {{ ${regex}$ }}'
|
||||
#
|
||||
# # Handle string literals
|
||||
# if pattern.startswith('"') or pattern.startswith("'"):
|
||||
# return f'{name} {{ {pattern} }}'
|
||||
#
|
||||
# # Handle alternatives (literal strings separated by |)
|
||||
# if "|" in pattern:
|
||||
# alternatives = [alt.strip() for alt in pattern.split("|")]
|
||||
# if all(alt.startswith('"') or alt.startswith("'") for alt in alternatives):
|
||||
# return f'{name} {{ {" | ".join(alternatives)} }}'
|
||||
#
|
||||
# return ""
|
||||
#
|
||||
#
|
||||
# def _convert_body(body: str) -> str:
|
||||
# """Convert the body of a Lark rule to Lezer format."""
|
||||
# # Remove inline transformations (-> name)
|
||||
# body = re.sub(r"\s*->\s*\w+", "", body)
|
||||
#
|
||||
# # Convert alternatives
|
||||
# parts = []
|
||||
# for alt in body.split("|"):
|
||||
# alt = alt.strip()
|
||||
# if alt:
|
||||
# converted = _convert_sequence(alt)
|
||||
# if converted:
|
||||
# parts.append(converted)
|
||||
#
|
||||
# return " | ".join(parts)
|
||||
#
|
||||
#
|
||||
# def _convert_sequence(seq: str) -> str:
|
||||
# """Convert a sequence of items in a rule."""
|
||||
# items = []
|
||||
#
|
||||
# # Tokenize the sequence
|
||||
# tokens = re.findall(
|
||||
# r'"[^"]*"|\'[^\']*\'|/[^/]+/|\([^)]+\)|\[[^\]]+\]|[a-zA-Z_][a-zA-Z0-9_]*|\?|\*|\+',
|
||||
# seq
|
||||
# )
|
||||
#
|
||||
# for token in tokens:
|
||||
# if token.startswith('"') or token.startswith("'"):
|
||||
# # String literal
|
||||
# items.append(token)
|
||||
# elif token.startswith("("):
|
||||
# # Group
|
||||
# inner = token[1:-1]
|
||||
# items.append(f"({_convert_body(inner)})")
|
||||
# elif token.startswith("["):
|
||||
# # Optional group in Lark
|
||||
# inner = token[1:-1]
|
||||
# items.append(f"({_convert_body(inner)})?")
|
||||
# elif token in ("?", "*", "+"):
|
||||
# # Quantifiers - attach to previous item
|
||||
# if items:
|
||||
# items[-1] = items[-1] + token
|
||||
# elif token.isupper() or token.startswith("_"):
|
||||
# # Terminal reference
|
||||
# items.append(token)
|
||||
# elif token.islower() or "_" in token:
|
||||
# # Rule reference - convert to PascalCase
|
||||
# items.append(_to_pascal_case(token))
|
||||
#
|
||||
# return " ".join(items)
|
||||
#
|
||||
#
|
||||
# def _to_pascal_case(name: str) -> str:
|
||||
# """Convert snake_case to PascalCase."""
|
||||
# return "".join(word.capitalize() for word in name.split("_"))
|
||||
#
|
||||
#
|
||||
# def extract_completions_from_grammar(lark_grammar: str) -> Dict[str, List[str]]:
|
||||
# """
|
||||
# Extract completion items from a Lark grammar.
|
||||
#
|
||||
# Parses the grammar to find:
|
||||
# - Keywords (reserved words like if, not, and)
|
||||
# - Operators (==, !=, contains, etc.)
|
||||
# - Functions (style, format, etc.)
|
||||
# - Types (number, date, boolean, etc.)
|
||||
# - Literals (True, False, etc.)
|
||||
#
|
||||
# Args:
|
||||
# lark_grammar: The Lark grammar string.
|
||||
#
|
||||
# Returns:
|
||||
# Dictionary with completion categories.
|
||||
# """
|
||||
# keywords: Set[str] = set()
|
||||
# operators: Set[str] = set()
|
||||
# functions: Set[str] = set()
|
||||
# types: Set[str] = set()
|
||||
# literals: Set[str] = set()
|
||||
#
|
||||
# # Find all quoted strings (potential keywords/operators)
|
||||
# quoted_strings = re.findall(r'"([^"]+)"', lark_grammar)
|
||||
#
|
||||
# # Also look for terminal definitions with string alternatives (e.g., BOOLEAN: "True" | "False")
|
||||
# terminal_literals = re.findall(r'[A-Z_]+:\s*"([^"]+)"(?:\s*\|\s*"([^"]+)")*', lark_grammar)
|
||||
# for match in terminal_literals:
|
||||
# for literal in match:
|
||||
# if literal:
|
||||
# quoted_strings.append(literal)
|
||||
#
|
||||
# for s in quoted_strings:
|
||||
# s_lower = s.lower()
|
||||
#
|
||||
# # Classify based on pattern
|
||||
# if s in ("==", "!=", "<=", "<", ">=", ">", "+", "-", "*", "/"):
|
||||
# operators.add(s)
|
||||
# elif s_lower in ("contains", "startswith", "endswith", "in", "between", "isempty", "isnotempty"):
|
||||
# operators.add(s_lower)
|
||||
# elif s_lower in ("if", "not", "and", "or"):
|
||||
# keywords.add(s_lower)
|
||||
# elif s_lower in ("true", "false"):
|
||||
# literals.add(s)
|
||||
# elif s_lower in ("style", "format"):
|
||||
# functions.add(s_lower)
|
||||
# elif s_lower in ("column", "row", "cell", "value", "col"):
|
||||
# keywords.add(s_lower)
|
||||
# elif s_lower in ("number", "date", "boolean", "text", "enum"):
|
||||
# types.add(s_lower)
|
||||
# elif s_lower == "case":
|
||||
# keywords.add(s_lower)
|
||||
#
|
||||
# # Find function-like patterns: word "("
|
||||
# function_patterns = re.findall(r'"(\w+)"\s*"?\("', lark_grammar)
|
||||
# for func in function_patterns:
|
||||
# if func.lower() not in ("true", "false"):
|
||||
# functions.add(func.lower())
|
||||
#
|
||||
# # Find type patterns from format_type rule
|
||||
# type_match = re.search(r'format_type\s*:\s*(.+?)(?:\n\n|\Z)', lark_grammar, re.DOTALL)
|
||||
# if type_match:
|
||||
# type_strings = re.findall(r'"(\w+)"', type_match.group(1))
|
||||
# types.update(t.lower() for t in type_strings)
|
||||
#
|
||||
# return {
|
||||
# "keywords": sorted(keywords),
|
||||
# "operators": sorted(operators),
|
||||
# "functions": sorted(functions),
|
||||
# "types": sorted(types),
|
||||
# "literals": sorted(literals),
|
||||
# }
|
||||
|
||||
240
src/myfasthtml/core/dsl/lark_to_simple_mode.py
Normal file
240
src/myfasthtml/core/dsl/lark_to_simple_mode.py
Normal file
@@ -0,0 +1,240 @@
|
||||
"""
|
||||
Utilities for converting Lark grammars to CodeMirror 5 Simple Mode format.
|
||||
|
||||
This module provides functions to extract regex patterns from Lark grammar
|
||||
terminals and generate a CodeMirror Simple Mode configuration for syntax highlighting.
|
||||
"""
|
||||
|
||||
import re
|
||||
from typing import Dict, List, Any
|
||||
|
||||
|
||||
def lark_to_simple_mode(lark_grammar: str) -> Dict[str, Any]:
    """
    Build a CodeMirror 5 Simple Mode configuration from a Lark grammar.

    Keywords are harvested from quoted literals in rules, and a fixed set of
    well-known terminals is mapped onto CodeMirror token classes.

    Args:
        lark_grammar: The Lark grammar string.

    Returns:
        A dictionary of the form ``{"start": [{"regex": ..., "token": ...}]}``
        suitable for CodeMirror's defineSimpleMode.
    """
    rules: List[Dict[str, Any]] = []

    # Comments come first so they win over every other pattern.
    rules.append({"regex": r"#.*", "token": "comment"})

    # One alternation rule covering every keyword literal found in the grammar.
    keywords = _extract_keywords(lark_grammar)
    if keywords:
        alternation = "|".join(re.escape(word) for word in keywords)
        rules.append({
            "regex": r"\b(?:" + alternation + r")\b",
            "token": "keyword",
        })

    # Well-known terminal names and the token class they highlight as.
    token_class_by_terminal = {
        "QUOTED_STRING": "string",
        "SIGNED_NUMBER": "number",
        "INTEGER": "number",
        "BOOLEAN": "atom",
        "CELL_ID": "variable-3",
        "NAME": "variable",
    }

    for terminal_name, lark_pattern in _extract_terminals(lark_grammar).items():
        token_class = token_class_by_terminal.get(terminal_name)
        if token_class is None:
            continue
        js_regex = _lark_regex_to_js(lark_pattern)
        if js_regex:
            rules.append({"regex": js_regex, "token": token_class})

    return {"start": rules}
|
||||
|
||||
|
||||
def _extract_keywords(grammar: str) -> List[str]:
|
||||
"""
|
||||
Extract keyword literals from grammar rules.
|
||||
|
||||
Looks for quoted string literals in rules (e.g., "column", "if", "style").
|
||||
|
||||
Args:
|
||||
grammar: The Lark grammar string.
|
||||
|
||||
Returns:
|
||||
List of keyword strings.
|
||||
"""
|
||||
keywords = set()
|
||||
|
||||
# Match quoted literals in rules (not in terminal definitions)
|
||||
# Pattern: "keyword" but not in lines like: TERMINAL: "pattern"
|
||||
lines = grammar.split("\n")
|
||||
for line in lines:
|
||||
# Skip terminal definitions (uppercase name followed by colon)
|
||||
if re.match(r'\s*[A-Z_]+\s*:', line):
|
||||
continue
|
||||
|
||||
# Skip comments
|
||||
if line.strip().startswith("//") or line.strip().startswith("#"):
|
||||
continue
|
||||
|
||||
# Find quoted strings in rules
|
||||
matches = re.findall(r'"([a-z_]+)"', line)
|
||||
for match in matches:
|
||||
# Filter out regex-like patterns, keep only identifiers
|
||||
if re.match(r'^[a-z_]+$', match):
|
||||
keywords.add(match)
|
||||
|
||||
return sorted(keywords)
|
||||
|
||||
|
||||
def _extract_terminals(grammar: str) -> Dict[str, str]:
|
||||
"""
|
||||
Extract terminal definitions from Lark grammar.
|
||||
|
||||
Args:
|
||||
grammar: The Lark grammar string.
|
||||
|
||||
Returns:
|
||||
Dictionary mapping terminal names to their regex patterns.
|
||||
"""
|
||||
terminals = {}
|
||||
lines = grammar.split("\n")
|
||||
|
||||
for line in lines:
|
||||
# Match terminal definitions: NAME: /regex/ or NAME: "literal"
|
||||
match = re.match(r'\s*([A-Z_]+)\s*:\s*/([^/]+)/', line)
|
||||
if match:
|
||||
name, pattern = match.groups()
|
||||
terminals[name] = pattern
|
||||
continue
|
||||
|
||||
# Match literal alternatives: BOOLEAN: "True" | "False"
|
||||
match = re.match(r'\s*([A-Z_]+)\s*:\s*(.+)', line)
|
||||
if match:
|
||||
name, alternatives = match.groups()
|
||||
# Extract quoted literals
|
||||
literals = re.findall(r'"([^"]+)"', alternatives)
|
||||
if literals:
|
||||
# Build regex alternation
|
||||
pattern = "|".join(re.escape(lit) for lit in literals)
|
||||
terminals[name] = pattern
|
||||
|
||||
return terminals
|
||||
|
||||
|
||||
def _lark_regex_to_js(lark_pattern: str) -> str:
|
||||
"""
|
||||
Convert a Lark regex pattern to JavaScript regex.
|
||||
|
||||
This is a simplified converter that handles common patterns.
|
||||
Complex patterns may need manual adjustment.
|
||||
|
||||
Args:
|
||||
lark_pattern: Lark regex pattern.
|
||||
|
||||
Returns:
|
||||
JavaScript regex pattern string, or empty string if conversion fails.
|
||||
"""
|
||||
# Remove Lark-specific flags
|
||||
pattern = lark_pattern.strip()
|
||||
|
||||
# Handle common patterns
|
||||
conversions = [
|
||||
# Escape sequences
|
||||
(r'\[', r'['),
|
||||
(r'\]', r']'),
|
||||
|
||||
# Character classes are mostly compatible
|
||||
# Numbers: [0-9]+ or \d+
|
||||
# Letters: [a-zA-Z]
|
||||
# Whitespace: [ \t]
|
||||
]
|
||||
|
||||
result = pattern
|
||||
for lark_pat, js_pat in conversions:
|
||||
result = result.replace(lark_pat, js_pat)
|
||||
|
||||
# Wrap in word boundaries for identifier-like patterns
|
||||
# Example: [a-zA-Z_][a-zA-Z0-9_]* → \b[a-zA-Z_][a-zA-Z0-9_]*\b
|
||||
if re.match(r'\[[a-zA-Z_]+\]', result):
|
||||
result = r'\b' + result + r'\b'
|
||||
|
||||
return result
|
||||
|
||||
|
||||
def generate_formatting_dsl_mode() -> Dict[str, Any]:
    """
    Return the hand-tuned Simple Mode configuration for the Formatting DSL.

    Unlike lark_to_simple_mode(), the rules here are curated by hand for
    better highlighting quality. Rule order matters: earlier rules win.

    Returns:
        Simple Mode configuration dictionary.
    """
    def rule(regex: str, token: str) -> Dict[str, str]:
        # Tiny helper keeping the rule table compact.
        return {"regex": regex, "token": token}

    return {
        "start": [
            # Comments take precedence over everything else.
            rule(r"#.*", "comment"),
            # Scope keywords.
            rule(r"\b(?:column|row|cell)\b", "keyword"),
            # Condition keywords.
            rule(r"\b(?:if|not|and|or|in|between|case)\b", "keyword"),
            # Built-in functions.
            rule(r"\b(?:style|format)\b", "builtin"),
            # Format types.
            rule(r"\b(?:number|date|boolean|text|enum)\b", "builtin"),
            # Word-like string operators.
            rule(r"\b(?:contains|startswith|endswith|isempty|isnotempty)\b", "operator"),
            # Symbolic comparison operators.
            rule(r"==|!=|<=|>=|<|>", "operator"),
            # Special references.
            rule(r"\b(?:value|col|row|cell)\b", "variable-2"),
            # Boolean literals.
            rule(r"\b(?:True|False|true|false)\b", "atom"),
            # Numbers: optional sign, integer or decimal.
            rule(r"[+-]?\b\d+(?:\.\d+)?\b", "number"),
            # Double- and single-quoted strings with escapes.
            rule(r'"(?:[^\\"]|\\.)*"', "string"),
            rule(r"'(?:[^\\']|\\.)*'", "string"),
            # Table-cell identifiers.
            rule(r"\btcell_[a-zA-Z0-9_-]+\b", "variable-3"),
            # Bare identifiers -- lowest priority.
            rule(r"\b[a-zA-Z_][a-zA-Z0-9_]*\b", "variable"),
        ]
    }
|
||||
Reference in New Issue
Block a user