diff --git a/core/concept.py b/core/concept.py
index 3af285e..7f01891 100644
--- a/core/concept.py
+++ b/core/concept.py
@@ -12,8 +12,7 @@ class Concept:
         self.is_builtin = is_builtin
         self.pre = None # list of pre conditions before calling the main function
         self.post = None # list of post conditions after calling the main function
-        self.main = None # main method
-        self.value = None # value of the concept
+        self.main = None # main method, can also be the value of the concept
 
         self.id = Concept.concepts_id
         Concept.concepts_id = Concept.concepts_id + 1
diff --git a/docs/syntax.md b/docs/syntax_v1.md
similarity index 100%
rename from docs/syntax.md
rename to docs/syntax_v1.md
diff --git a/docs/syntax_v2.md b/docs/syntax_v2.md
new file mode 100644
index 0000000..9cfeec6
--- /dev/null
+++ b/docs/syntax_v2.md
@@ -0,0 +1,20 @@
+```
+def concept one as 1 --> creates a new concept 1
+def concept two as 2 --> creates a new concept 2
+def concept add(a,b) as a + b --> creates a concept that needs parentheses
+def concept a plus b as add(a,b) --> creates a concept that mimics human language
+
+one plus two --> recognizes the concept 'a plus b'
+one plus two ? --> performs the addition
+
+concept a plus b --> switches to working on this concept
+pre: a is a number
+
+--> ERROR: 'a is a number' is not known
+
+def concept a is a number as :
+    isinstanceof(a, number)
+
+--> adds the concept 'a is a number'
+--> adds the pre condition to the concept 'a plus b'
+```
\ No newline at end of file
diff --git a/main.py b/main.py
index 0cfd4e6..18322cf 100644
--- a/main.py
+++ b/main.py
@@ -15,7 +15,7 @@ def main():
 
     # launch the parsers
 
     # execute the concepts
-
+    print(event_as_string)
 
     return True
diff --git a/p.txt b/p.txt
new file mode 100644
index 0000000..11b15b1
--- /dev/null
+++ b/p.txt
@@ -0,0 +1 @@
+print("hello")
diff --git a/parsers/__init__.py b/parsers/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/parsers/defaultparser.py b/parsers/defaultparser.py
new file mode 100644
index 0000000..b0d4a2e
--- /dev/null
+++ b/parsers/defaultparser.py
@@ -0,0 +1,254 @@
+from dataclasses import dataclass
+
+
+@dataclass(frozen=True)
+class Token:
+    type: str
+    value: str
+    index: int
+    line: int
+    column: int
+
+
+@dataclass(frozen=True)
+class LexerError(Exception):
+    message: str
+    text: str
+    index: int
+    line: int
+    column: int
+
+
+class Tokens:
+    EOF = "eof"
+    WHITESPACE = "whitespace"
+    NEWLINE = "newline"
+    KEYWORD = "keyword"
+    IDENTIFIER = "identifier"
+    STRING = "string"
+    NUMBER = "number"
+    TRUE = "true"
+    FALSE = "false"
+    LPAR = "lpar"
+    RPAR = "rpar"
+    LBRACKET = "lbracket"
+    RBRACKET = "rbracket"
+    LBRACE = "lbrace"
+    RBRACE = "rbrace"
+    PLUS = "plus"
+    MINUS = "minus"
+    STAR = "star"
+    SLASH = "slash"
+    PERCENT = "percent"
+    COMMA = "comma"
+    SEMICOLON = "semicolon"
+    COLON = "colon"
+    DOT = "dot"
+    QMARK = "qmark"
+    VBAR = "vbar"
+    AMPER = "amper"
+
+
+class TokenIter:
+    """
+    Iterates over the tokens of a source text.
+    """
+
+    KEYWORDS = ("def", "concept", "as", "pre", "post")
+
+    def __init__(self, text):
+        self.text = text
+        self.text_len = len(text)
+
+    def __iter__(self):
+
+        i = 0
+        line = 1
+        column = 1
+        while i < self.text_len:
+            c = self.text[i]
+            if c == "+":
+                yield Token(Tokens.PLUS, "+", i, line, column)
+                i += 1
+                column += 1
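+            # One character of lookahead for '-': directly followed by a digit
+            # it is folded into a single negative NUMBER token, otherwise it is
+            # emitted as the MINUS operator.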
+            elif c == "-":
+                if i + 1 < self.text_len and self.text[i + 1].isdigit():
+                    number = self.eat_number(i)
+                    yield Token(Tokens.NUMBER, number, i, line, column)
+                    i += len(number)
+                    column += len(number)
+                else:
+                    yield Token(Tokens.MINUS, "-", i, line, column)
+                    i += 1
+                    column += 1
+            elif c == "/":
+                yield Token(Tokens.SLASH, "/", i, line, column)
+                i += 1
+                column += 1
+            elif c == "*":
+                yield Token(Tokens.STAR, "*", i, line, column)
+                i += 1
+                column += 1
+            elif c == "{":
+                yield Token(Tokens.LBRACE, "{", i, line, column)
+                i += 1
+                column += 1
+            elif c == "}":
+                yield Token(Tokens.RBRACE, "}", i, line, column)
+                i += 1
+                column += 1
+            elif c == "(":
+                yield Token(Tokens.LPAR, "(", i, line, column)
+                i += 1
+                column += 1
+            elif c == ")":
+                yield Token(Tokens.RPAR, ")", i, line, column)
+                i += 1
+                column += 1
+            elif c == "[":
+                yield Token(Tokens.LBRACKET, "[", i, line, column)
+                i += 1
+                column += 1
+            elif c == "]":
+                yield Token(Tokens.RBRACKET, "]", i, line, column)
+                i += 1
+                column += 1
+            elif c == " " or c == "\t":
+                whitespace = self.eat_whitespace(i)
+                yield Token(Tokens.WHITESPACE, whitespace, i, line, column)
+                i += len(whitespace)
+                column += len(whitespace)
+            elif c == ",":
+                yield Token(Tokens.COMMA, ",", i, line, column)
+                i += 1
+                column += 1
+            elif c == ".":
+                yield Token(Tokens.DOT, ".", i, line, column)
+                i += 1
+                column += 1
+            elif c == ";":
+                yield Token(Tokens.SEMICOLON, ";", i, line, column)
+                i += 1
+                column += 1
+            elif c == ":":
+                yield Token(Tokens.COLON, ":", i, line, column)
+                i += 1
+                column += 1
+            elif c == "?":
+                yield Token(Tokens.QMARK, "?", i, line, column)
+                i += 1
+                column += 1
+            elif c == "\n" or c == "\r":
+                newline = self.eat_newline(i)
+                yield Token(Tokens.NEWLINE, newline, i, line, column)
+                i += len(newline)
+                column = 1
+                line += 1
+            elif c.isalpha() or c == "_":
+                identifier = self.eat_identifier(i)
+                token_type = Tokens.KEYWORD if identifier in self.KEYWORDS else Tokens.IDENTIFIER
+                yield Token(token_type, identifier, i, line, column)
+                i += len(identifier)
+                column += len(identifier)
+            elif c.isdigit():
+                number = self.eat_number(i)
+                yield Token(Tokens.NUMBER, number, i, line, column)
+                i += len(number)
+                column += len(number)
+            elif c == "'" or c == '"':
+                string, newlines = self.eat_string(i)
+                yield Token(Tokens.STRING, string, i, line, column)
+                i += len(string)
+                column = 1 if newlines > 0 else column + len(string)
+                line += newlines
+            else:
+                raise LexerError(f"Unknown token '{c}'", self.text, i, line, column)
+
+        yield Token(Tokens.EOF, "", i, line, column)
+
+    def eat_whitespace(self, start):
+        result = self.text[start]
+        i = start + 1
+        while i < self.text_len:
+            c = self.text[i]
+            if c == " " or c == "\t":
+                result += c
+                i += 1
+            else:
+                break
+
+        return result
+
+    def eat_newline(self, start):
+        if start + 1 == self.text_len:
+            return self.text[start]
+
+        current = self.text[start]
+        next_char = self.text[start + 1]
+        if (current == "\n" and next_char == "\r") or (current == "\r" and next_char == "\n"):
+            return current + next_char
+
+        return current
+
+    def eat_identifier(self, start):
+        result = self.text[start]
+        i = start + 1
+        while i < self.text_len:
+            c = self.text[i]
+            if c.isalpha() or c == "_" or c == "-" or c.isdigit():
+                result += c
+                i += 1
+            else:
+                break
+
+        return result
+
+    def eat_number(self, start):
+        result = self.text[start]
+        i = start + 1
+        while i < self.text_len:
+            c = self.text[i]
+            if c.isdigit() or c == ".":
+                result += c
+                i += 1
+            else:
+                break
+
+        return result
+
+    def eat_string(self, start):
+        quote = self.text[start]
+        result = self.text[start]
+        lines_count = 0
+
+        i = start + 1
+        escape = False
+        newline = None
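+        # Scan until an unescaped closing quote, counting newlines on the way;
+        # an \r\n or \n\r pair inside the string counts as a single newline.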
+        while i < self.text_len:
+            c = self.text[i]
+            result += c
+            i += 1
+
+            if newline:
+                lines_count += 1
+                newline = c if c == newline else None
+            else:
+                if c == "\r" or c == "\n":
+                    newline = c
+
+            if escape:
+                escape = False
+            elif c == "\\":
+                escape = True
+            elif c == quote:
+                break
+
+        if newline:
+            lines_count += 1
+
+        return result, lines_count
diff --git a/tests/test_defaultparser.py b/tests/test_defaultparser.py
new file mode 100644
index 0000000..ee43553
--- /dev/null
+++ b/tests/test_defaultparser.py
@@ -0,0 +1,112 @@
+import pytest
+from parsers.defaultparser import TokenIter, Token, Tokens
+
+
+def test_i_can_tokenize():
+    source = "+*-/{}[]()    ,;:.?\n\n\r\r\r\nidentifier_0\t  \t10.15 10 'string\n' \"another string\""
+    tokens = list(TokenIter(source))
+    assert tokens[0] == Token(Tokens.PLUS, "+", 0, 1, 1)
+    assert tokens[1] == Token(Tokens.STAR, "*", 1, 1, 2)
+    assert tokens[2] == Token(Tokens.MINUS, "-", 2, 1, 3)
+    assert tokens[3] == Token(Tokens.SLASH, "/", 3, 1, 4)
+    assert tokens[4] == Token(Tokens.LBRACE, "{", 4, 1, 5)
+    assert tokens[5] == Token(Tokens.RBRACE, "}", 5, 1, 6)
+    assert tokens[6] == Token(Tokens.LBRACKET, "[", 6, 1, 7)
+    assert tokens[7] == Token(Tokens.RBRACKET, "]", 7, 1, 8)
+    assert tokens[8] == Token(Tokens.LPAR, "(", 8, 1, 9)
+    assert tokens[9] == Token(Tokens.RPAR, ")", 9, 1, 10)
+    assert tokens[10] == Token(Tokens.WHITESPACE, "    ", 10, 1, 11)
+    assert tokens[11] == Token(Tokens.COMMA, ",", 14, 1, 15)
+    assert tokens[12] == Token(Tokens.SEMICOLON, ";", 15, 1, 16)
+    assert tokens[13] == Token(Tokens.COLON, ":", 16, 1, 17)
+    assert tokens[14] == Token(Tokens.DOT, ".", 17, 1, 18)
+    assert tokens[15] == Token(Tokens.QMARK, "?", 18, 1, 19)
+    assert tokens[16] == Token(Tokens.NEWLINE, "\n", 19, 1, 20)
+    assert tokens[17] == Token(Tokens.NEWLINE, "\n\r", 20, 2, 1)
+    assert tokens[18] == Token(Tokens.NEWLINE, "\r", 22, 3, 1)
+    assert tokens[19] == Token(Tokens.NEWLINE, "\r\n", 23, 4, 1)
+    assert tokens[20] == Token(Tokens.IDENTIFIER, "identifier_0", 25, 5, 1)
+    assert tokens[21] == Token(Tokens.WHITESPACE, "\t  \t", 37, 5, 13)
+    assert tokens[22] == Token(Tokens.NUMBER, "10.15", 41, 5, 17)
+    assert tokens[23] == Token(Tokens.WHITESPACE, " ", 46, 5, 22)
+    assert tokens[24] == Token(Tokens.NUMBER, "10", 47, 5, 23)
+    assert tokens[25] == Token(Tokens.WHITESPACE, " ", 49, 5, 25)
+    assert tokens[26] == Token(Tokens.STRING, "'string\n'", 50, 5, 26)
+    assert tokens[27] == Token(Tokens.WHITESPACE, " ", 59, 6, 1)
+    assert tokens[28] == Token(Tokens.STRING, '"another string"', 60, 6, 2)
+
+
+@pytest.mark.parametrize("text, expected", [
+    ("_ident", True),
+    ("ident", True),
+    ("ident123", True),
+    ("ident_123", True),
+    ("ident-like-this", True),
+    ("àèùéû", True),
+    ("011254", False),
+    ("0abcd", False),
+    ("-abcd", False)
+])
+def test_i_can_tokenize_identifiers(text, expected):
+    tokens = list(TokenIter(text))
+    comparison = tokens[0].type == Tokens.IDENTIFIER
+    assert comparison == expected
+
+
+@pytest.mark.parametrize("text, expected_text, expected_newlines", [
+    ("'foo'", "'foo'", 0),
+    ('"foo"', '"foo"', 0),
+    ("'foo\rbar'", "'foo\rbar'", 1),
+    ("'foo\nbar'", "'foo\nbar'", 1),
+    ("'foo\n\rbar'", "'foo\n\rbar'", 1),
+    ("'foo\r\nbar'", "'foo\r\nbar'", 1),
+    ("'foo\r\rbar'", "'foo\r\rbar'", 2),
+    ("'foo\n\nbar'", "'foo\n\nbar'", 2),
+    ("'foo\r\n\n\rbar'", "'foo\r\n\n\rbar'", 2),
+    ("'\rfoo\rbar\r'", "'\rfoo\rbar\r'", 3),
+    ("'\nfoo\nbar\n'", "'\nfoo\nbar\n'", 3),
+    ("'\n\rfoo\r\n'", "'\n\rfoo\r\n'", 2),
+    (r"'foo\'bar'", r"'foo\'bar'", 0),
+    (r'"foo\"bar"', r'"foo\"bar"', 0),
+    ('"foo"bar"', '"foo"', 0),
+    ("'foo'bar'", "'foo'", 0),
+])
+def test_i_can_parse_strings(text, expected_text, expected_newlines):
+    lexer = TokenIter(text)
+    text_found, nb_of_newlines = lexer.eat_string(0)
+
+    assert nb_of_newlines == expected_newlines
+    assert text_found == expected_text
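+
+
+# Hypothetical extra case (not part of the parametrized list above): a doubled
+# backslash must not itself escape the closing quote.
+def test_backslash_does_not_escape_closing_quote():
+    text_found, _ = TokenIter(r"'foo\\'bar'").eat_string(0)
+    assert text_found == r"'foo\\'"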
+
+
+@pytest.mark.parametrize("text", [
+    "1", "3.1415", "0.5", "01", "-5", "-5.10"
+])
+def test_i_can_parse_numbers(text):
+    tokens = list(TokenIter(text))
+    assert tokens[0].type == Tokens.NUMBER
+    assert tokens[0].value == text
+
+
+@pytest.mark.parametrize("text", [
+    "def", "concept", "as", "pre", "post"
+])
+def test_i_can_recognize_keywords(text):
+    tokens = list(TokenIter(text))
+    assert tokens[0].type == Tokens.KEYWORD
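+
+
+# Usage sketch: the token stream for a concept definition written in the
+# docs/syntax_v2.md style (whitespace tokens filtered out for readability).
+def test_tokenize_concept_definition():
+    source = "def concept one as 1"
+    types = [t.type for t in TokenIter(source) if t.type != Tokens.WHITESPACE]
+    assert types == [Tokens.KEYWORD, Tokens.KEYWORD, Tokens.IDENTIFIER,
+                     Tokens.KEYWORD, Tokens.NUMBER, Tokens.EOF]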