# Sheerka-Old/tests/core/test_tokenizer.py
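"""Tests for core.tokenizer.

Covers the full token stream (operators, punctuation, whitespace, newlines,
strings, numbers, identifiers, concept/rule/regex tokens), word mode,
variable definitions, and lexer error reporting.
"""
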
import pytest
from core.tokenizer import Tokenizer, Token, TokenKind, LexerError


def test_i_can_tokenize():
    source = "+*-/{}[]() ,;:.?\n\n\r\r\r\nidentifier_0\t \t10.15 10 'string\n' \"another string\"=|&<>c:name:"
    source += "$£€!_identifier°~_^\\`==#__var__10r/regex\nregex/r:xxx|1:**//%"
    tokens = list(Tokenizer(source))
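    # Token fields, as exercised throughout this file: (kind, value, index,
    # line, column), where index is a 0-based offset into the source and
    # line/column are 1-based.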
    assert tokens[0] == Token(TokenKind.PLUS, "+", 0, 1, 1)
    assert tokens[1] == Token(TokenKind.STAR, "*", 1, 1, 2)
    assert tokens[2] == Token(TokenKind.MINUS, "-", 2, 1, 3)
    assert tokens[3] == Token(TokenKind.SLASH, "/", 3, 1, 4)
    assert tokens[4] == Token(TokenKind.LBRACE, "{", 4, 1, 5)
    assert tokens[5] == Token(TokenKind.RBRACE, "}", 5, 1, 6)
    assert tokens[6] == Token(TokenKind.LBRACKET, "[", 6, 1, 7)
    assert tokens[7] == Token(TokenKind.RBRACKET, "]", 7, 1, 8)
    assert tokens[8] == Token(TokenKind.LPAR, "(", 8, 1, 9)
    assert tokens[9] == Token(TokenKind.RPAR, ")", 9, 1, 10)
    assert tokens[10] == Token(TokenKind.WHITESPACE, " ", 10, 1, 11)
    assert tokens[11] == Token(TokenKind.COMMA, ",", 14, 1, 15)
    assert tokens[12] == Token(TokenKind.SEMICOLON, ";", 15, 1, 16)
    assert tokens[13] == Token(TokenKind.COLON, ":", 16, 1, 17)
    assert tokens[14] == Token(TokenKind.DOT, ".", 17, 1, 18)
    assert tokens[15] == Token(TokenKind.QMARK, "?", 18, 1, 19)
    assert tokens[16] == Token(TokenKind.NEWLINE, "\n", 19, 1, 20)
    assert tokens[17] == Token(TokenKind.NEWLINE, "\n\r", 20, 2, 1)
    assert tokens[18] == Token(TokenKind.NEWLINE, "\r", 22, 3, 1)
    assert tokens[19] == Token(TokenKind.NEWLINE, "\r\n", 23, 4, 1)
    assert tokens[20] == Token(TokenKind.IDENTIFIER, "identifier_0", 25, 5, 1)
    assert tokens[21] == Token(TokenKind.WHITESPACE, "\t \t", 37, 5, 13)
    assert tokens[22] == Token(TokenKind.NUMBER, "10.15", 41, 5, 17)
    assert tokens[23] == Token(TokenKind.WHITESPACE, " ", 46, 5, 22)
    assert tokens[24] == Token(TokenKind.NUMBER, "10", 47, 5, 23)
    assert tokens[25] == Token(TokenKind.WHITESPACE, " ", 49, 5, 25)
    assert tokens[26] == Token(TokenKind.STRING, "'string\n'", 50, 5, 26)
    assert tokens[27] == Token(TokenKind.WHITESPACE, " ", 59, 6, 2)
    assert tokens[28] == Token(TokenKind.STRING, '"another string"', 60, 6, 3)
    assert tokens[29] == Token(TokenKind.EQUALS, '=', 76, 6, 19)
    assert tokens[30] == Token(TokenKind.VBAR, '|', 77, 6, 20)
    assert tokens[31] == Token(TokenKind.AMPER, '&', 78, 6, 21)
    assert tokens[32] == Token(TokenKind.LESS, '<', 79, 6, 22)
    assert tokens[33] == Token(TokenKind.GREATER, '>', 80, 6, 23)
    assert tokens[34] == Token(TokenKind.CONCEPT, ('name', None), 81, 6, 24)
    assert tokens[35] == Token(TokenKind.DOLLAR, '$', 88, 6, 31)
    assert tokens[36] == Token(TokenKind.STERLING, '£', 89, 6, 32)
    assert tokens[37] == Token(TokenKind.EURO, '€', 90, 6, 33)
    assert tokens[38] == Token(TokenKind.EMARK, '!', 91, 6, 34)
    assert tokens[39] == Token(TokenKind.IDENTIFIER, '_identifier', 92, 6, 35)
    assert tokens[40] == Token(TokenKind.DEGREE, '°', 103, 6, 46)
    assert tokens[41] == Token(TokenKind.TILDE, '~', 104, 6, 47)
    assert tokens[42] == Token(TokenKind.UNDERSCORE, '_', 105, 6, 48)
    assert tokens[43] == Token(TokenKind.CARAT, '^', 106, 6, 49)
    assert tokens[44] == Token(TokenKind.BACK_SLASH, '\\', 107, 6, 50)
    assert tokens[45] == Token(TokenKind.BACK_QUOTE, '`', 108, 6, 51)
    assert tokens[46] == Token(TokenKind.EQUALSEQUALS, '==', 109, 6, 52)
    assert tokens[47] == Token(TokenKind.HASH, '#', 111, 6, 54)
    assert tokens[48] == Token(TokenKind.VAR_DEF, '__var__10', 112, 6, 55)
    assert tokens[49] == Token(TokenKind.REGEX, '/regex\nregex/', 121, 6, 64)
    assert tokens[50] == Token(TokenKind.RULE, ("xxx", "1"), 135, 7, 7)
    assert tokens[51] == Token(TokenKind.STARSTAR, "**", 143, 7, 15)
    assert tokens[52] == Token(TokenKind.SLASHSLASH, "//", 145, 7, 17)
    assert tokens[53] == Token(TokenKind.PERCENT, "%", 147, 7, 19)
    assert tokens[54] == Token(TokenKind.EOF, '', 148, 7, 20)


@pytest.mark.parametrize("text, expected", [
    ("_ident", True),
    ("__ident", True),
    ("___ident", True),
    ("ident", True),
    ("ident123", True),
    ("ident_123", True),
    ("ident-like-this", True),
    ("àèùéû", True),
    ("011254", False),
    ("0abcd", False),
    ("-abcd", False),
])
def test_i_can_tokenize_identifiers(text, expected):
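    """Identifiers may contain underscores, digits, hyphens, and accented
    letters, but must not start with a digit or a hyphen."""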
    tokens = list(Tokenizer(text))
    comparison = tokens[0].type == TokenKind.IDENTIFIER
    assert comparison == expected


@pytest.mark.parametrize("text", [
    "123abc",
    "123",
    "abc",
    "abc123",
])
def test_i_can_parse_word(text):
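    """In parse_word mode the whole chunk is consumed as a single WORD token,
    even when it mixes digits and letters."""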
    tokens = list(Tokenizer(text, parse_word=True))
    assert tokens[0].type == TokenKind.WORD
    assert tokens[0].value == text
    assert tokens[1].index == len(text)


@pytest.mark.parametrize("text", [
    "__var__0",
    "__var__1",
    "__var__10",
    "__var__999",
])
def test_i_can_parse_var_def(text):
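    """A variable definition is '__var__' followed by a decimal suffix."""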
    tokens = list(Tokenizer(text))
    assert len(tokens) == 2
    assert tokens[0].type == TokenKind.VAR_DEF
    assert tokens[0].value == text


@pytest.mark.parametrize("text, message, error_text, index, line, column", [
    ("'string", "Missing Trailing quote", "'string", 7, 1, 8),
    ('"string', "Missing Trailing quote", '"string', 7, 1, 8),
    ('"a" + "string', "Missing Trailing quote", '"string', 13, 1, 14),
    ('"a"\n\n"string', "Missing Trailing quote", '"string', 12, 3, 8),
    ("c::", "Concept identifiers not found", "", 2, 1, 3),
    ("c:foo\nbar:", "New line in concept name", "foo", 5, 1, 6),
    ("c:foo", "Missing ending colon", "foo", 5, 1, 6),
])
def test_i_can_detect_tokenizer_errors(text, message, error_text, index, line, column):
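    """LexerError carries a message plus the offending text and its
    index/line/column position."""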
    with pytest.raises(LexerError) as e:
        list(Tokenizer(text))
    assert e.value.message == message
    assert e.value.text == error_text
    assert e.value.index == index
    assert e.value.line == line
    assert e.value.column == column


@pytest.mark.parametrize("text, expected_text, expected_newlines, expected_column", [
    ("'foo'", "'foo'", 0, 6),
    ('"foo"', '"foo"', 0, 6),
    ("'foo\nbar'", "'foo\nbar'", 1, 5),
    ("'foo\rbar'", "'foo\rbar'", 0, 10),
    ("'foo\n\rbar'", "'foo\n\rbar'", 1, 6),
    ("'foo\r\nbar'", "'foo\r\nbar'", 1, 5),
    ("'foo\n\nbar'", "'foo\n\nbar'", 2, 5),
    ("'foo\r\n\n\rbar'", "'foo\r\n\n\rbar'", 2, 6),
    ("'\nfoo\nbar\n'", "'\nfoo\nbar\n'", 3, 2),
    ("'\n\rfoo\r\n'", "'\n\rfoo\r\n'", 2, 2),
    (r"'foo\'bar'", r"'foo\'bar'", 0, 11),
    (r'"foo\"bar"', r'"foo\"bar"', 0, 11),
    ('"foo"bar"', '"foo"', 0, 6),
    ("'foo'bar'", "'foo'", 0, 6),
])
def test_i_can_parse_strings(text, expected_text, expected_newlines, expected_column):
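    """eat_string(index, line, column) returns the consumed string literal,
    the number of newlines crossed, and the column after the closing quote.
    Per the cases above, "\\n\\r" and "\\r\\n" each count as one newline while
    a lone "\\r" counts as none."""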
    lexer = Tokenizer(text)
    text_found, nb_of_newlines, column_index = lexer.eat_string(0, 1, 1)
    assert text_found == expected_text
    assert nb_of_newlines == expected_newlines
    assert column_index == expected_column


@pytest.mark.parametrize("text", [
    "1", "3.1415", "0.5", "01", "-5", "-5.10",
])
def test_i_can_parse_numbers(text):
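    """Numbers include integers (leading zeros allowed), decimals, and
    values with a leading minus sign, all lexed as a single NUMBER token."""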
    tokens = list(Tokenizer(text))
    assert tokens[0].type == TokenKind.NUMBER
    assert tokens[0].value == text


@pytest.mark.parametrize("text, expected", [
    ("c:key:", ("key", None)),
    ("c:key|id:", ("key", "id")),
    ("c:key|:", ("key", None)),
    ("c:|id:", (None, "id")),
    ("c:125:", ("125", None)),
])
def test_i_can_parse_concept_token(text, expected):
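    """A concept token 'c:key|id:' yields a (key, id) pair; an omitted or
    empty part becomes None."""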
    tokens = list(Tokenizer(text))
    assert tokens[0].type == TokenKind.CONCEPT
    assert tokens[0].value == expected


@pytest.mark.parametrize("text, expected", [
    ("r:key:", ("key", None)),
    ("r:key|id:", ("key", "id")),
    ("r:key|:", ("key", None)),
    ("r:|id:", (None, "id")),
    ("r:125:", ("125", None)),
])
def test_i_can_parse_rule_token(text, expected):
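    """A rule token 'r:key|id:' follows the same (key, id) convention as
    concept tokens."""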
    tokens = list(Tokenizer(text))
    assert tokens[0].type == TokenKind.RULE
    assert tokens[0].value == expected


@pytest.mark.parametrize("text, expected", [
    ("r|regex|", "|regex|"),
    ("r/regex/", "/regex/"),
    ("r'regex'", "'regex'"),
    ('r"regex"', '"regex"'),
])
def test_i_can_parse_regex_token(text, expected):
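    """A regex token is 'r' followed by a delimited pattern; |, /, ', and "
    all serve as delimiters. The token value excludes the leading 'r' and
    strip_quote excludes the delimiters as well."""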
    tokens = list(Tokenizer(text))
    assert tokens[0].type == TokenKind.REGEX
    assert tokens[0].value == expected
    assert tokens[0].str_value == "r" + expected
    assert tokens[0].repr_value == "r" + expected
    assert tokens[0].strip_quote == expected[1:-1]
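

# A minimal usage sketch (not part of the suite), assuming only what the
# assertions above establish: a Tokenizer is iterable and yields Token
# objects with .type, .value, and .index attributes, ending with an EOF token.
if __name__ == "__main__":
    for token in Tokenizer("x = 10 + y"):
        print(token.type, repr(token.value), token.index)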