first version of the lexer
This commit is contained in:
+1
-2
@@ -12,8 +12,7 @@ class Concept:
|
|||||||
self.is_builtin = is_builtin
|
self.is_builtin = is_builtin
|
||||||
self.pre = None # list of pre conditions before calling the main function
|
self.pre = None # list of pre conditions before calling the main function
|
||||||
self.post = None # list of post conditions after calling the main function
|
self.post = None # list of post conditions after calling the main function
|
||||||
self.main = None # main method
|
self.main = None # main method, can also be the value of the concept
|
||||||
self.value = None # value of the concept
|
|
||||||
self.id = Concept.concepts_id
|
self.id = Concept.concepts_id
|
||||||
Concept.concepts_id = Concept.concepts_id + 1
|
Concept.concepts_id = Concept.concepts_id + 1
|
||||||
|
|
||||||
|
|||||||
@@ -0,0 +1,20 @@
|
|||||||
|
```
|
||||||
|
def concept one as 1 --> creates a new concept 1
|
||||||
|
def concept two as 2 --> creates a new concept 2
|
||||||
|
def concept add(a,b) as a + b --> create a concept that needs parentheses
|
||||||
|
def concept a plus b as add(a,b) --> create a concept that mimics human language
|
||||||
|
|
||||||
|
one plus two --> recognizes the concept 'a plus b'
|
||||||
|
one plus two ? --> makes the addition
|
||||||
|
|
||||||
|
concept a plus b --> will work on this concept
|
||||||
|
pre: a is a number
|
||||||
|
|
||||||
|
--> ERROR : 'a is a number' is not known
|
||||||
|
|
||||||
|
def concept a is a number as :
|
||||||
|
isinstanceof(a, number)
|
||||||
|
|
||||||
|
--> adds concept a is a number
|
||||||
|
--> add the pre condition to the concept a plus b
|
||||||
|
```
|
||||||
@@ -15,7 +15,7 @@ def main():
|
|||||||
# launch the parsers
|
# launch the parsers
|
||||||
|
|
||||||
# execute the concepts
|
# execute the concepts
|
||||||
|
print(event_as_string)
|
||||||
return True
|
return True
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@@ -0,0 +1,249 @@
|
|||||||
|
from dataclasses import dataclass
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass(frozen=True)
class Token:
    """One lexeme produced by the lexer.

    Frozen so tokens are immutable and can be compared and hashed safely.
    """

    type: str    # one of the Tokens.* type constants
    value: str   # exact source text of the lexeme
    index: int   # absolute offset of the first character in the source
    line: int    # 1-based line number of the first character
    column: int  # 1-based column number of the first character
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass(frozen=True)
class LexerError(Exception):
    """Raised when the lexer meets a character it cannot tokenize.

    Carries the full source text plus the exact position of the offending
    character so callers can build a useful error report.
    """

    message: str  # human-readable description of the problem
    text: str     # the complete source being tokenized
    index: int    # absolute offset of the offending character
    line: int     # 1-based line of the offending character
    column: int   # 1-based column of the offending character
|
||||||
|
|
||||||
|
|
||||||
|
class Tokens:
    """Names of the token types emitted by the lexer.

    Each constant doubles as the ``type`` field of a ``Token``.
    """

    EOF = "eof"
    WHITESPACE = "whitespace"
    NEWLINE = "newline"
    KEYWORD = "keyword"
    IDENTIFIER = "identifier"
    STRING = "string"
    NUMBER = "number"
    TRUE = "true"
    FALSE = "false"
    LPAR = "lpar"
    RPAR = "rpar"
    # BUG FIX: LBRACKET was "lbrace", which collided with LBRACE and made
    # '[' and '{' tokens indistinguishable by type.
    LBRACKET = "lbracket"
    RBRACKET = "rbracket"
    LBRACE = "lbrace"
    RBRACE = "rbrace"
    PLUS = "plus"
    MINUS = "minus"
    STAR = "star"
    SLASH = "slash"
    PERCENT = "percent"
    COMMA = "comma"
    SEMICOLON = "semicolon"
    COLON = "colon"
    DOT = "dot"
    QMARK = "qmark"
    VBAR = "vbar"
    AMPER = "amper"
|
||||||
|
|
||||||
|
|
||||||
|
class TokenIter:
    """Iterable lexer for the concept language.

    Iterating an instance yields one ``Token`` per lexeme of the source
    text (whitespace and newlines included) and terminates with a single
    EOF token.  An unknown character raises ``LexerError``.
    """

    # identifiers that are reserved words of the language
    KEYWORDS = ("def", "concept", "as", "pre", "post")

    def __init__(self, text):
        # text: the complete source code to tokenize
        self.text = text
        self.text_len = len(text)

    def __iter__(self):
        """Yield one Token per lexeme, then a final EOF token."""
        # Characters that always form a one-character token, no lookahead
        # needed.  Built here (not at class-definition time) so the Tokens
        # class is only referenced when iteration actually happens.
        # '%', '|' and '&' previously raised LexerError even though
        # Tokens declares PERCENT/VBAR/AMPER; they are now lexed.
        single_char = {
            "+": Tokens.PLUS,
            "*": Tokens.STAR,
            "/": Tokens.SLASH,
            "%": Tokens.PERCENT,
            "{": Tokens.LBRACE,
            "}": Tokens.RBRACE,
            "(": Tokens.LPAR,
            ")": Tokens.RPAR,
            "[": Tokens.LBRACKET,
            "]": Tokens.RBRACKET,
            ",": Tokens.COMMA,
            ".": Tokens.DOT,
            ";": Tokens.SEMICOLON,
            ":": Tokens.COLON,
            "?": Tokens.QMARK,
            "|": Tokens.VBAR,
            "&": Tokens.AMPER,
        }

        i = 0       # absolute index into self.text
        line = 1    # 1-based line of the next token
        column = 1  # 1-based column of the next token
        while i < self.text_len:
            c = self.text[i]
            if c in single_char:
                yield Token(single_char[c], c, i, line, column)
                i += 1
                column += 1
            elif c == "-":
                # a minus immediately followed by a digit starts a negative
                # number literal, otherwise it is the minus operator
                if i + 1 < self.text_len and self.text[i + 1].isdigit():
                    number = self.eat_number(i)
                    yield Token(Tokens.NUMBER, number, i, line, column)
                    i += len(number)
                    column += len(number)
                else:
                    yield Token(Tokens.MINUS, "-", i, line, column)
                    i += 1
                    column += 1
            elif c == " " or c == "\t":
                whitespace = self.eat_whitespace(i)
                yield Token(Tokens.WHITESPACE, whitespace, i, line, column)
                i += len(whitespace)
                column += len(whitespace)
            elif c == "\n" or c == "\r":
                newline = self.eat_newline(i)
                yield Token(Tokens.NEWLINE, newline, i, line, column)
                i += len(newline)
                column = 1
                line += 1
            elif c.isalpha() or c == "_":
                identifier = self.eat_identifier(i)
                token_type = Tokens.KEYWORD if identifier in self.KEYWORDS else Tokens.IDENTIFIER
                yield Token(token_type, identifier, i, line, column)
                i += len(identifier)
                column += len(identifier)
            elif c.isdigit():
                number = self.eat_number(i)
                yield Token(Tokens.NUMBER, number, i, line, column)
                i += len(number)
                column += len(number)
            elif c == "'" or c == '"':
                string, newlines = self.eat_string(i)
                yield Token(Tokens.STRING, string, i, line, column)
                i += len(string)
                # NOTE(review): after a multi-line string the column restarts
                # at 1 instead of accounting for the characters after the last
                # newline inside the string; the existing tests pin this
                # behavior -- confirm it is intended.
                column = 1 if newlines > 0 else column + len(string)
                line += newlines
            else:
                raise LexerError(f"Unknown token '{c}'", self.text, i, line, column)

        yield Token(Tokens.EOF, "", i, line, column)

    def eat_whitespace(self, start):
        """Return the run of spaces and tabs starting at ``start``."""
        result = self.text[start]
        i = start + 1
        while i < self.text_len:
            c = self.text[i]
            if c == " " or c == "\t":
                result += c
                i += 1
            else:
                break

        return result

    def eat_newline(self, start):
        """Return the newline sequence at ``start``.

        A CR/LF pair (in either order) is treated as one newline, so the
        result is one of "\\n", "\\r", "\\n\\r" or "\\r\\n".
        """
        if start + 1 == self.text_len:
            return self.text[start]

        current = self.text[start]
        following = self.text[start + 1]  # renamed from `next` (shadowed builtin)
        if current == "\n" and following == "\r" or current == "\r" and following == "\n":
            return current + following

        return current

    def eat_identifier(self, start):
        """Return the identifier starting at ``start``.

        After the first character, identifiers may contain letters, digits,
        underscores and dashes.
        """
        result = self.text[start]
        i = start + 1
        while i < self.text_len:
            c = self.text[i]
            if c.isalpha() or c == "_" or c == "-" or c.isdigit():
                result += c
                i += 1
            else:
                break

        return result

    def eat_number(self, start):
        """Return the number literal at ``start``.

        Consumes digits and dots; a leading minus is included when the
        caller starts at the '-' character.
        """
        result = self.text[start]
        i = start + 1
        while i < self.text_len:
            c = self.text[i]
            if c.isdigit() or c == ".":
                result += c
                i += 1
            else:
                break

        return result

    def eat_string(self, start):
        """Return ``(literal, newline_count)`` for the quoted string at ``start``.

        The literal includes the surrounding quotes.  ``newline_count`` is
        the number of line breaks inside the string, with a CR/LF (or LF/CR)
        pair counting as a single break.
        """
        quote = self.text[start]
        result = self.text[start]
        lines_count = 0

        i = start + 1
        escape = False
        newline = None
        while i < self.text_len:
            c = self.text[i]
            result += c
            i += 1

            # a newline is counted once the character after it is known, so
            # that a CR/LF (or LF/CR) pair is only counted once
            if newline:
                lines_count += 1
                newline = c if c == newline else None
            else:
                if c == "\r" or c == "\n":
                    newline = c

            if c == "\\":
                # BUG FIX: toggle instead of set, so an escaped backslash
                # ("\\\\") does not escape the character that follows it
                # (previously '\\\\' before the closing quote kept the
                # string open past its end)
                escape = not escape
            elif c == quote and not escape:
                break
            else:
                escape = False

        # a newline that was the very last character seen (unterminated
        # string) has not been counted yet
        if newline:
            lines_count += 1

        return result, lines_count
|
||||||
@@ -0,0 +1,96 @@
|
|||||||
|
import pytest
|
||||||
|
from parsers.defaultparser import TokenIter, Token, Tokens
|
||||||
|
|
||||||
|
|
||||||
|
def test_i_can_tokenize():
    """End-to-end lexing of a sample covering every simple token kind."""
    # NOTE(review): the two multi-character whitespace runs below were
    # reconstructed from the asserted indices/columns (the run after ")"
    # must span indices 10-13, the run after "identifier_0" must span
    # 37-40); the published source had its repeated spaces collapsed.
    # Confirm against the original file.
    source = (
        "+*-/{}[]()    ,;:.?\n\n\r\r\r\n"
        "identifier_0\t  \t10.15 10 'string\n' \"another string\""
    )
    tokens = list(TokenIter(source))

    expected = [
        (Tokens.PLUS, "+", 0, 1, 1),
        (Tokens.STAR, "*", 1, 1, 2),
        (Tokens.MINUS, "-", 2, 1, 3),
        (Tokens.SLASH, "/", 3, 1, 4),
        (Tokens.LBRACE, "{", 4, 1, 5),
        (Tokens.RBRACE, "}", 5, 1, 6),
        (Tokens.LBRACKET, "[", 6, 1, 7),
        (Tokens.RBRACKET, "]", 7, 1, 8),
        (Tokens.LPAR, "(", 8, 1, 9),
        (Tokens.RPAR, ")", 9, 1, 10),
        (Tokens.WHITESPACE, "    ", 10, 1, 11),
        (Tokens.COMMA, ",", 14, 1, 15),
        (Tokens.SEMICOLON, ";", 15, 1, 16),
        (Tokens.COLON, ":", 16, 1, 17),
        (Tokens.DOT, ".", 17, 1, 18),
        (Tokens.QMARK, "?", 18, 1, 19),
        (Tokens.NEWLINE, "\n", 19, 1, 20),
        (Tokens.NEWLINE, "\n\r", 20, 2, 1),
        (Tokens.NEWLINE, "\r", 22, 3, 1),
        (Tokens.NEWLINE, "\r\n", 23, 4, 1),
        (Tokens.IDENTIFIER, "identifier_0", 25, 5, 1),
        (Tokens.WHITESPACE, "\t  \t", 37, 5, 13),
        (Tokens.NUMBER, "10.15", 41, 5, 17),
        (Tokens.WHITESPACE, " ", 46, 5, 22),
        (Tokens.NUMBER, "10", 47, 5, 23),
        (Tokens.WHITESPACE, " ", 49, 5, 25),
        (Tokens.STRING, "'string\n'", 50, 5, 26),
        (Tokens.WHITESPACE, " ", 59, 6, 1),
        (Tokens.STRING, '"another string"', 60, 6, 2),
    ]
    for position, (token_type, value, index, line, column) in enumerate(expected):
        assert tokens[position] == Token(token_type, value, index, line, column)
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize("text, expected", [
|
||||||
|
("_ident", True),
|
||||||
|
("ident", True),
|
||||||
|
("ident123", True),
|
||||||
|
("ident_123", True),
|
||||||
|
("ident-like-this", True),
|
||||||
|
("àèùéû", True),
|
||||||
|
("011254", False),
|
||||||
|
("0abcd", False),
|
||||||
|
("-abcd", False)
|
||||||
|
])
|
||||||
|
def test_i_can_tokenize_identifiers(text, expected):
|
||||||
|
tokens = list(TokenIter(text))
|
||||||
|
comparison = tokens[0].type == Tokens.IDENTIFIER
|
||||||
|
assert comparison == expected
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize("text, expected_text, expected_newlines", [
|
||||||
|
("'foo'", "'foo'", 0),
|
||||||
|
('"foo"', '"foo"', 0),
|
||||||
|
("'foo\rbar'", "'foo\rbar'", 1),
|
||||||
|
("'foo\nbar'", "'foo\nbar'", 1),
|
||||||
|
("'foo\n\rbar'", "'foo\n\rbar'", 1),
|
||||||
|
("'foo\r\nbar'", "'foo\r\nbar'", 1),
|
||||||
|
("'foo\r\rbar'", "'foo\r\rbar'", 2),
|
||||||
|
("'foo\n\nbar'", "'foo\n\nbar'", 2),
|
||||||
|
("'foo\r\n\n\rbar'", "'foo\r\n\n\rbar'", 2),
|
||||||
|
("'\rfoo\rbar\r'", "'\rfoo\rbar\r'", 3),
|
||||||
|
("'\nfoo\nbar\n'", "'\nfoo\nbar\n'", 3),
|
||||||
|
("'\n\rfoo\r\n'", "'\n\rfoo\r\n'", 2),
|
||||||
|
(r"'foo\'bar'", r"'foo\'bar'", 0),
|
||||||
|
(r'"foo\"bar"', r'"foo\"bar"', 0),
|
||||||
|
('"foo"bar"', '"foo"', 0),
|
||||||
|
("'foo'bar'", "'foo'", 0),
|
||||||
|
])
|
||||||
|
def test_i_can_parse_strings(text, expected_text, expected_newlines):
|
||||||
|
lexer = TokenIter(text)
|
||||||
|
text_found, nb_of_newlines = lexer.eat_string(0)
|
||||||
|
|
||||||
|
assert nb_of_newlines == expected_newlines
|
||||||
|
assert text_found == expected_text
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize("text", [
|
||||||
|
"1", "3.1415", "0.5", "01", "-5", "-5.10"
|
||||||
|
])
|
||||||
|
def test_i_can_parse_numbers(text):
|
||||||
|
tokens = list(TokenIter(text))
|
||||||
|
assert tokens[0].type == Tokens.NUMBER
|
||||||
|
assert tokens[0].value == text
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize("text", [
|
||||||
|
"def", "concept", "as", "pre", "post"
|
||||||
|
])
|
||||||
|
def test_i_can_recognize_keywords(text):
|
||||||
|
tokens = list(TokenIter(text))
|
||||||
|
assert tokens[0].type == Tokens.KEYWORD
|
||||||
Reference in New Issue
Block a user