first version of the lexer

2019-10-22 19:02:51 +02:00
parent 8f1c2ed818
commit 913cd3c0b3
8 changed files with 368 additions and 3 deletions
@@ -0,0 +1,249 @@
+from dataclasses import dataclass
+
+
+@dataclass(frozen=True)
+class Token:
+    """A single lexical token together with its position in the source text."""
+
+    # Token category; TokenIter fills this with one of the Tokens.* constants.
+    type: str
+    # Exact source text covered by the token (empty string for the EOF token).
+    value: str
+    # 0-based offset of the token's first character in the lexed text.
+    index: int
+    # 1-based line number on which the token starts.
+    line: int
+    # 1-based column at which the token starts.
+    column: int
+
+
+@dataclass(eq=True, unsafe_hash=True)
+class LexerError(Exception):
+    """
+    Error raised when the lexer meets text it cannot turn into a token.
+
+    The instance is deliberately NOT a frozen dataclass: a frozen dataclass
+    rejects every attribute assignment, including assignments to the standard
+    exception attributes (for example ``exc.__traceback__ = tb``), which then
+    fail with ``FrozenInstanceError`` and hide the real error.
+    ``unsafe_hash=True`` keeps the field-based equality and hashing that the
+    frozen variant provided.
+    """
+
+    # Human readable description of the problem.
+    message: str
+    # Complete text that was being lexed.
+    text: str
+    # 0-based offset of the offending character in ``text``.
+    index: int
+    # 1-based line of the offending character.
+    line: int
+    # 1-based column of the offending character.
+    column: int
+
+    def __str__(self):
+        """Return the message with its location, without dumping the whole source text."""
+        return f"{self.message} (line {self.line}, column {self.column})"
+
+
+class Tokens:
+    """
+    Names of the token types, used as the ``type`` of a ``Token``.
+
+    Every constant has a distinct value so that a token type identifies
+    exactly one kind of lexeme.
+    """
+
+    EOF = "eof"
+    WHITESPACE = "whitespace"
+    NEWLINE = "newline"
+    KEYWORD = "keyword"
+    IDENTIFIER = "identifier"
+    STRING = "string"
+    NUMBER = "number"
+    TRUE = "true"
+    FALSE = "false"
+    LPAR = "lpar"
+    RPAR = "rpar"
+    # Was "lbrace", which made "[" indistinguishable from "{" by type.
+    LBRACKET = "lbracket"
+    RBRACKET = "rbracket"
+    LBRACE = "lbrace"
+    RBRACE = "rbrace"
+    PLUS = "plus"
+    MINUS = "minus"
+    STAR = "star"
+    SLASH = "slash"
+    PERCENT = "percent"
+    COMMA = "comma"
+    SEMICOLON = "semicolon"
+    COLON = "colon"
+    DOT = "dot"
+    QMARK = "qmark"
+    VBAR = "vbar"
+    AMPER = "amper"
+
+
+class TokenIter:
+    """
+    Class that can iterate on the tokens
+
+    Iterating over an instance lexes ``text`` lazily and yields one ``Token``
+    per lexeme (whitespace and newlines included), followed by a final
+    ``Tokens.EOF`` token. A character that cannot start any token raises
+    ``LexerError``.
+    """
+
+    KEYWORDS = ("def", "concept", "as", "pre", "post")
+
+    def __init__(self, text):
+        """
+        :param text: the complete source text to tokenize
+        """
+        self.text = text
+        self.text_len = len(text)
+
+    def __iter__(self):
+        """
+        Yield the tokens of ``self.text`` in order, ending with an EOF token.
+
+        ``index`` is the 0-based offset of the token, ``line`` and ``column``
+        are 1-based.
+
+        :raises LexerError: when a character does not start any known token
+        """
+        # Characters that always form a one-character token on their own.
+        # Built here (not at class level) so the class can be defined before
+        # ``Tokens`` is resolved.
+        single_char_tokens = {
+            "+": Tokens.PLUS,
+            "/": Tokens.SLASH,
+            "*": Tokens.STAR,
+            "%": Tokens.PERCENT,
+            "{": Tokens.LBRACE,
+            "}": Tokens.RBRACE,
+            "(": Tokens.LPAR,
+            ")": Tokens.RPAR,
+            "[": Tokens.LBRACKET,
+            "]": Tokens.RBRACKET,
+            ",": Tokens.COMMA,
+            ".": Tokens.DOT,
+            ";": Tokens.SEMICOLON,
+            ":": Tokens.COLON,
+            "?": Tokens.QMARK,
+            "|": Tokens.VBAR,
+            "&": Tokens.AMPER,
+        }
+
+        i = 0
+        line = 1
+        column = 1
+        while i < self.text_len:
+            c = self.text[i]
+            newlines = 0  # number of line breaks contained in the token
+
+            if c in single_char_tokens:
+                token_type, value = single_char_tokens[c], c
+            elif c == "-":
+                # A minus directly followed by a digit is the sign of a number.
+                if i + 1 < self.text_len and self.text[i + 1].isdigit():
+                    token_type, value = Tokens.NUMBER, self.eat_number(i)
+                else:
+                    token_type, value = Tokens.MINUS, c
+            elif c == " " or c == "\t":
+                token_type, value = Tokens.WHITESPACE, self.eat_whitespace(i)
+            elif c == "\n" or c == "\r":
+                token_type, value = Tokens.NEWLINE, self.eat_newline(i)
+                newlines = 1
+            elif c.isalpha() or c == "_":
+                value = self.eat_identifier(i)
+                token_type = Tokens.KEYWORD if value in self.KEYWORDS else Tokens.IDENTIFIER
+            elif c.isdigit():
+                token_type, value = Tokens.NUMBER, self.eat_number(i)
+            elif c == "'" or c == '"':
+                value, newlines = self.eat_string(i)
+                token_type = Tokens.STRING
+            else:
+                raise LexerError(f"Unknown token '{c}'", self.text, i, line, column)
+
+            yield Token(token_type, value, i, line, column)
+
+            i += len(value)
+            if newlines:
+                line += newlines
+                # Restart the column count after the last line break of the
+                # token; a multi-line string may continue on its last line.
+                column = self._column_after_newline(value)
+            else:
+                column += len(value)
+
+        yield Token(Tokens.EOF, "", i, line, column)
+
+    @staticmethod
+    def _column_after_newline(value):
+        """Return the 1-based column that follows ``value``, which contains a line break."""
+        last_newline = max(value.rfind("\n"), value.rfind("\r"))
+        return len(value) - last_newline
+
+    @staticmethod
+    def _count_newlines(value):
+        """Count the line breaks in ``value``; a "\\r\\n" or "\\n\\r" pair counts once."""
+        count = 0
+        i = 0
+        length = len(value)
+        while i < length:
+            c = value[i]
+            if c == "\n" or c == "\r":
+                count += 1
+                if i + 1 < length and {c, value[i + 1]} == {"\r", "\n"}:
+                    i += 1  # second half of a two-character line break
+            i += 1
+
+        return count
+
+    def eat_whitespace(self, start):
+        """
+        Return the run of spaces and tabs starting at ``start``.
+
+        The character at ``start`` is always included; the caller guarantees
+        it is a space or a tab.
+        """
+        end = start + 1
+        while end < self.text_len and self.text[end] in (" ", "\t"):
+            end += 1
+
+        return self.text[start:end]
+
+    def eat_newline(self, start):
+        """
+        Return the line break starting at ``start``.
+
+        The two-character sequences "\\r\\n" and "\\n\\r" are returned as one
+        line break; otherwise only the character at ``start`` is returned.
+        """
+        current = self.text[start]
+        if start + 1 < self.text_len:
+            following = self.text[start + 1]
+            if {current, following} == {"\r", "\n"}:
+                return current + following
+
+        return current
+
+    def eat_identifier(self, start):
+        """
+        Return the identifier starting at ``start``.
+
+        After the first character, letters, digits, "_" and "-" are accepted.
+        """
+        end = start + 1
+        while end < self.text_len:
+            c = self.text[end]
+            if not (c.isalpha() or c == "_" or c == "-" or c.isdigit()):
+                break
+            end += 1
+
+        return self.text[start:end]
+
+    def eat_number(self, start):
+        """
+        Return the number starting at ``start``.
+
+        The character at ``start`` (a digit or a leading "-") is always
+        included, followed by every digit and "." that comes next.
+        """
+        # NOTE(review): any amount of "." is accepted, so "1.2.3" and "1."
+        # are returned as a single number - confirm this is intended.
+        end = start + 1
+        while end < self.text_len:
+            c = self.text[end]
+            if not (c.isdigit() or c == "."):
+                break
+            end += 1
+
+        return self.text[start:end]
+
+    def eat_string(self, start):
+        """
+        Return the quoted string starting at ``start`` and its line break count.
+
+        The string runs from the opening quote at ``start`` up to and
+        including the first matching quote that is not escaped by a
+        backslash.
+
+        :return: tuple ``(string, lines_count)`` where ``string`` includes
+            both quotes and ``lines_count`` is the number of line breaks
+            inside it
+        """
+        # NOTE(review): an unterminated string is not reported; it silently
+        # runs to the end of the text - confirm the parser handles that.
+        quote = self.text[start]
+        end = start + 1
+        escaped = False
+        while end < self.text_len:
+            c = self.text[end]
+            end += 1
+
+            if escaped:
+                # The escaped character is taken literally, whatever it is.
+                # This also covers an escaped backslash, so a quote right
+                # after a double backslash correctly closes the string.
+                escaped = False
+            elif c == "\\":
+                escaped = True
+            elif c == quote:
+                break
+
+        string = self.text[start:end]
+        return string, self._count_newlines(string)