fist version of the lexer

2019-10-22 19:02:51 +02:00
parent 8f1c2ed818
commit 913cd3c0b3
8 changed files with 368 additions and 3 deletions
@@ -12,8 +12,7 @@ class Concept:
        self.is_builtin = is_builtin
        self.pre = None  # list of pre conditions before calling the main function
        self.post = None  # list of post conditions after calling the main function
-        self.main = None   # main method
+        self.main = None   # main method, can also be the value of the concept
        self.value = None  # value of the concept
        self.id = Concept.concepts_id
        Concept.concepts_id = Concept.concepts_id + 1
@@ -0,0 +1,20 @@
 ```
 def concept one as 1  --> creates a new concept 1
 def concept two as 2  --> creates a new concept 2
 def concept add(a,b) as a + b   --> create concept that need parenthesis
 def concept a plus b as add(a,b)  --> create a concept that mimic human language
 one plus two --> recognizes the concept 'a plus b'
 one plus two ? --> makes the addition
 concept a plus b --> will work on this concept
 pre: a is a number 
 --> ERROR : 'a is a number' is not known
 def concept a is a number as :
    isinstanceof(a, number) 
 --> adds concept a is a number
 --> add the pre condition to the concept a plus b
 ```
@@ -15,7 +15,7 @@ def main():
    # launch the parsers
    # execute the concepts
-
+    print(event_as_string)
    return True
@@ -0,0 +1 @@
 print("hello")
@@ -0,0 +1,249 @@
 from dataclasses import dataclass
@dataclass(frozen=True)
 class Token:
    type: str
    value: str
    index: int
    line: int
    column: int
@dataclass(frozen=True)
 class LexerError(Exception):
    message: str
    text: str
    index: int
    line: int
    column: int
 class Tokens:
    EOF = "eof"
    WHITESPACE = "whitespace"
    NEWLINE = "newline"
    KEYWORD = "keyword"
    IDENTIFIER = "identifier"
    STRING = "string"
    NUMBER = "number"
    TRUE = "true"
    FALSE = "false"
    LPAR = "lpar"
    RPAR = "rpar"
    LBRACKET = "lbrace"
    RBRACKET = "rbracket"
    LBRACE = "lbrace"
    RBRACE = "rbrace"
    PLUS = "plus"
    MINUS = "minus"
    STAR = "star"
    SLASH = "slash"
    PERCENT = "percent"
    COMMA = "comma"
    SEMICOLON = "semicolon"
    COLON = "colon"
    DOT = "dot"
    QMARK = "qmark"
    VBAR = "vbar"
    AMPER = "amper"
 class TokenIter:
    KEYWORDS = ("def", "concept", "as", "pre", "post")
    """
    Class that can iterate on the tokens
    """
    def __init__(self, text):
        self.text = text
        self.text_len = len(text)
    def __iter__(self):
        i = 0
        line = 1
        column = 1
        while i < self.text_len:
            c = self.text[i]
            if c == "+":
                yield Token(Tokens.PLUS, "+", i, line, column)
                i += 1
                column += 1
            elif c == "-":
                if i + 1 < self.text_len and self.text[i + 1].isdigit():
                    number = self.eat_number(i)
                    yield Token(Tokens.NUMBER, number, i, line, column)
                    i += len(number)
                    column += len(number)
                else:
                    yield Token(Tokens.MINUS, "-", i, line, column)
                    i += 1
                    column += 1
            elif c == "/":
                yield Token(Tokens.SLASH, "/", i, line, column)
                i += 1
                column += 1
            elif c == "*":
                yield Token(Tokens.STAR, "*", i, line, column)
                i += 1
                column += 1
            elif c == "{":
                yield Token(Tokens.LBRACE, "{", i, line, column)
                i += 1
                column += 1
            elif c == "}":
                yield Token(Tokens.RBRACE, "}", i, line, column)
                i += 1
                column += 1
            elif c == "(":
                yield Token(Tokens.LPAR, "(", i, line, column)
                i += 1
                column += 1
            elif c == ")":
                yield Token(Tokens.RPAR, ")", i, line, column)
                i += 1
                column += 1
            elif c == "[":
                yield Token(Tokens.LBRACKET, "[", i, line, column)
                i += 1
                column += 1
            elif c == "]":
                yield Token(Tokens.RBRACKET, "]", i, line, column)
                i += 1
                column += 1
            elif c == " " or c == "\t":
                whitespace = self.eat_whitespace(i)
                yield Token(Tokens.WHITESPACE, whitespace, i, line, column)
                i += len(whitespace)
                column += len(whitespace)
            elif c == ",":
                yield Token(Tokens.COMMA, ",", i, line, column)
                i += 1
                column += 1
            elif c == ".":
                yield Token(Tokens.DOT, ".", i, line, column)
                i += 1
                column += 1
            elif c == ";":
                yield Token(Tokens.SEMICOLON, ";", i, line, column)
                i += 1
                column += 1
            elif c == ":":
                yield Token(Tokens.COLON, ":", i, line, column)
                i += 1
                column += 1
            elif c == "?":
                yield Token(Tokens.QMARK, "?", i, line, column)
                i += 1
                column += 1
            elif c == "\n" or c == "\r":
                newline = self.eat_newline(i)
                yield Token(Tokens.NEWLINE, newline, i, line, column)
                i += len(newline)
                column = 1
                line += 1
            elif c.isalpha() or c == "_":
                identifier = self.eat_identifier(i)
                type = Tokens.KEYWORD if identifier in self.KEYWORDS else Tokens.IDENTIFIER
                yield Token(type, identifier, i, line, column)
                i += len(identifier)
                column += len(identifier)
            elif c.isdigit():
                number = self.eat_number(i)
                yield Token(Tokens.NUMBER, number, i, line, column)
                i += len(number)
                column += len(number)
            elif c == "'" or c == '"':
                string, newlines = self.eat_string(i)
                yield Token(Tokens.STRING, string, i, line, column)
                i += len(string)
                column = 1 if newlines > 0 else column + len(string)
                line += newlines
            else:
                raise LexerError(f"Unknown token '{c}'", self.text, i, line, column)
        yield Token(Tokens.EOF, "", i, line, column)
    def eat_whitespace(self, start):
        result = self.text[start]
        i = start + 1
        while i < self.text_len:
            c = self.text[i]
            if c == " " or c == "\t":
                result += c
                i += 1
            else:
                break
        return result
    def eat_newline(self, start):
        if start + 1 == self.text_len:
            return self.text[start]
        current = self.text[start]
        next = self.text[start + 1]
        if current == "\n" and next == "\r" or current == "\r" and next == "\n":
            return current + next
        return current
    def eat_identifier(self, start):
        result = self.text[start]
        i = start + 1
        while i < self.text_len:
            c = self.text[i]
            if c.isalpha() or c == "_" or c == "-" or c.isdigit():
                result += c
                i += 1
            else:
                break
        return result
    def eat_number(self, start):
        result = self.text[start]
        i = start + 1
        while i < self.text_len:
            c = self.text[i]
            if c.isdigit() or c == ".":
                result += c
                i += 1
            else:
                break
        return result
    def eat_string(self, start):
        quote = self.text[start]
        result = self.text[start]
        lines_count = 0
        i = start + 1
        escape = False
        newline = None
        while i < self.text_len:
            c = self.text[i]
            result += c
            i += 1
            if newline:
                lines_count += 1
                newline = c if c == newline else None
            else:
                if c == "\r" or c == "\n":
                    newline = c
            if c == "\\":
                escape = True
            elif c == quote and not escape:
                break
            else:
                escape = False
        if newline:
            lines_count += 1
        return result, lines_count
@@ -0,0 +1,96 @@
 import pytest
 from parsers.defaultparser import TokenIter, Token, Tokens
 def test_i_can_tokenize():
    source = "+*-/{}[]()    ,;:.?\n\n\r\r\r\nidentifier_0\t  \t10.15 10 'string\n' \"another string\""
    tokens = list(TokenIter(source))
    assert tokens[0] == Token(Tokens.PLUS, "+", 0, 1, 1)
    assert tokens[1] == Token(Tokens.STAR, "*", 1, 1, 2)
    assert tokens[2] == Token(Tokens.MINUS, "-", 2, 1, 3)
    assert tokens[3] == Token(Tokens.SLASH, "/", 3, 1, 4)
    assert tokens[4] == Token(Tokens.LBRACE, "{", 4, 1, 5)
    assert tokens[5] == Token(Tokens.RBRACE, "}", 5, 1, 6)
    assert tokens[6] == Token(Tokens.LBRACKET, "[", 6, 1, 7)
    assert tokens[7] == Token(Tokens.RBRACKET, "]", 7, 1, 8)
    assert tokens[8] == Token(Tokens.LPAR, "(", 8, 1, 9)
    assert tokens[9] == Token(Tokens.RPAR, ")", 9, 1, 10)
    assert tokens[10] == Token(Tokens.WHITESPACE, "    ", 10, 1, 11)
    assert tokens[11] == Token(Tokens.COMMA, ",", 14, 1, 15)
    assert tokens[12] == Token(Tokens.SEMICOLON, ";", 15, 1, 16)
    assert tokens[13] == Token(Tokens.COLON, ":", 16, 1, 17)
    assert tokens[14] == Token(Tokens.DOT, ".", 17, 1, 18)
    assert tokens[15] == Token(Tokens.QMARK, "?", 18, 1, 19)
    assert tokens[16] == Token(Tokens.NEWLINE, "\n", 19, 1, 20)
    assert tokens[17] == Token(Tokens.NEWLINE, "\n\r", 20, 2, 1)
    assert tokens[18] == Token(Tokens.NEWLINE, "\r", 22, 3, 1)
    assert tokens[19] == Token(Tokens.NEWLINE, "\r\n", 23, 4, 1)
    assert tokens[20] == Token(Tokens.IDENTIFIER, "identifier_0", 25, 5, 1)
    assert tokens[21] == Token(Tokens.WHITESPACE, "\t  \t", 37, 5, 13)
    assert tokens[22] == Token(Tokens.NUMBER, "10.15", 41, 5, 17)
    assert tokens[23] == Token(Tokens.WHITESPACE, " ", 46, 5, 22)
    assert tokens[24] == Token(Tokens.NUMBER, "10", 47, 5, 23)
    assert tokens[25] == Token(Tokens.WHITESPACE, " ", 49, 5, 25)
    assert tokens[26] == Token(Tokens.STRING, "'string\n'", 50, 5, 26)
    assert tokens[27] == Token(Tokens.WHITESPACE, " ", 59, 6, 1)
    assert tokens[28] == Token(Tokens.STRING, '"another string"', 60, 6, 2)
@pytest.mark.parametrize("text, expected", [
    ("_ident", True),
    ("ident", True),
    ("ident123", True),
    ("ident_123", True),
    ("ident-like-this", True),
    ("àèùéû", True),
    ("011254", False),
    ("0abcd", False),
    ("-abcd", False)
 ])
 def test_i_can_tokenize_identifiers(text, expected):
    tokens = list(TokenIter(text))
    comparison = tokens[0].type == Tokens.IDENTIFIER
    assert comparison == expected
@pytest.mark.parametrize("text, expected_text, expected_newlines", [
    ("'foo'", "'foo'", 0),
    ('"foo"', '"foo"', 0),
    ("'foo\rbar'", "'foo\rbar'", 1),
    ("'foo\nbar'", "'foo\nbar'", 1),
    ("'foo\n\rbar'", "'foo\n\rbar'", 1),
    ("'foo\r\nbar'", "'foo\r\nbar'", 1),
    ("'foo\r\rbar'", "'foo\r\rbar'", 2),
    ("'foo\n\nbar'", "'foo\n\nbar'", 2),
    ("'foo\r\n\n\rbar'", "'foo\r\n\n\rbar'", 2),
    ("'\rfoo\rbar\r'", "'\rfoo\rbar\r'", 3),
    ("'\nfoo\nbar\n'", "'\nfoo\nbar\n'", 3),
    ("'\n\rfoo\r\n'", "'\n\rfoo\r\n'", 2),
    (r"'foo\'bar'", r"'foo\'bar'", 0),
    (r'"foo\"bar"', r'"foo\"bar"', 0),
    ('"foo"bar"', '"foo"', 0),
    ("'foo'bar'", "'foo'", 0),
 ])
 def test_i_can_parse_strings(text, expected_text, expected_newlines):
    lexer = TokenIter(text)
    text_found, nb_of_newlines = lexer.eat_string(0)
    assert nb_of_newlines == expected_newlines
    assert text_found == expected_text
@pytest.mark.parametrize("text", [
    "1", "3.1415", "0.5", "01", "-5", "-5.10"
 ])
 def test_i_can_parse_numbers(text):
    tokens = list(TokenIter(text))
    assert tokens[0].type == Tokens.NUMBER
    assert tokens[0].value == text
@pytest.mark.parametrize("text", [
    "def", "concept", "as", "pre", "post"
 ])
 def test_i_can_recognize_keywords(text):
    tokens = list(TokenIter(text))
    assert tokens[0].type == Tokens.KEYWORD