import pytest

from core.tokenizer import Tokenizer, Token, TokenKind, LexerError, Keywords


def test_i_can_tokenize():
    # Tokens are compared field-wise as (kind, value, index, line, column),
    # where `index` is the absolute character offset into the source and
    # `line`/`column` are 1-based. The index/column assertions below require
    # the multi-character whitespace runs in the source (4 spaces after ")",
    # tab/2-spaces/tab after "identifier_0").
    source = "+*-/{}[]()    ,;:.?\n\n\r\r\r\nidentifier_0\t  \t10.15 10 'string\n' \"another string\"=|&<>c:name:"
    source += "$£€!_identifier°~_^\\`==#__var__10"
    tokens = list(Tokenizer(source))
    assert tokens[0] == Token(TokenKind.PLUS, "+", 0, 1, 1)
    assert tokens[1] == Token(TokenKind.STAR, "*", 1, 1, 2)
    assert tokens[2] == Token(TokenKind.MINUS, "-", 2, 1, 3)
    assert tokens[3] == Token(TokenKind.SLASH, "/", 3, 1, 4)
    assert tokens[4] == Token(TokenKind.LBRACE, "{", 4, 1, 5)
    assert tokens[5] == Token(TokenKind.RBRACE, "}", 5, 1, 6)
    assert tokens[6] == Token(TokenKind.LBRACKET, "[", 6, 1, 7)
    assert tokens[7] == Token(TokenKind.RBRACKET, "]", 7, 1, 8)
    assert tokens[8] == Token(TokenKind.LPAR, "(", 8, 1, 9)
    assert tokens[9] == Token(TokenKind.RPAR, ")", 9, 1, 10)
    assert tokens[10] == Token(TokenKind.WHITESPACE, "    ", 10, 1, 11)
    assert tokens[11] == Token(TokenKind.COMMA, ",", 14, 1, 15)
    assert tokens[12] == Token(TokenKind.SEMICOLON, ";", 15, 1, 16)
    assert tokens[13] == Token(TokenKind.COLON, ":", 16, 1, 17)
    assert tokens[14] == Token(TokenKind.DOT, ".", 17, 1, 18)
    assert tokens[15] == Token(TokenKind.QMARK, "?", 18, 1, 19)
    assert tokens[16] == Token(TokenKind.NEWLINE, "\n", 19, 1, 20)
    assert tokens[17] == Token(TokenKind.NEWLINE, "\n\r", 20, 2, 1)
    assert tokens[18] == Token(TokenKind.NEWLINE, "\r", 22, 3, 1)
    assert tokens[19] == Token(TokenKind.NEWLINE, "\r\n", 23, 4, 1)
    assert tokens[20] == Token(TokenKind.IDENTIFIER, "identifier_0", 25, 5, 1)
    assert tokens[21] == Token(TokenKind.WHITESPACE, "\t  \t", 37, 5, 13)
    assert tokens[22] == Token(TokenKind.NUMBER, "10.15", 41, 5, 17)
    assert tokens[23] == Token(TokenKind.WHITESPACE, " ", 46, 5, 22)
    assert tokens[24] == Token(TokenKind.NUMBER, "10", 47, 5, 23)
    assert tokens[25] == Token(TokenKind.WHITESPACE, " ", 49, 5, 25)
    assert tokens[26] == Token(TokenKind.STRING, "'string\n'", 50, 5, 26)
    assert tokens[27] == Token(TokenKind.WHITESPACE, " ", 59, 6, 1)
    assert tokens[28] == Token(TokenKind.STRING, '"another string"', 60, 6, 2)
    assert tokens[29] == Token(TokenKind.EQUALS, '=', 76, 6, 18)
    assert tokens[30] == Token(TokenKind.VBAR, '|', 77, 6, 19)
    assert tokens[31] == Token(TokenKind.AMPER, '&', 78, 6, 20)
    assert tokens[32] == Token(TokenKind.LESS, '<', 79, 6, 21)
    assert tokens[33] == Token(TokenKind.GREATER, '>', 80, 6, 22)
    assert tokens[34] == Token(TokenKind.CONCEPT, ('name', None), 81, 6, 23)
    assert tokens[35] == Token(TokenKind.DOLLAR, '$', 88, 6, 30)
    assert tokens[36] == Token(TokenKind.STERLING, '£', 89, 6, 31)
    assert tokens[37] == Token(TokenKind.EURO, '€', 90, 6, 32)
    assert tokens[38] == Token(TokenKind.EMARK, '!', 91, 6, 33)
    assert tokens[39] == Token(TokenKind.IDENTIFIER, '_identifier', 92, 6, 34)
    assert tokens[40] == Token(TokenKind.DEGREE, '°', 103, 6, 45)
    assert tokens[41] == Token(TokenKind.TILDE, '~', 104, 6, 46)
    assert tokens[42] == Token(TokenKind.UNDERSCORE, '_', 105, 6, 47)
    assert tokens[43] == Token(TokenKind.CARAT, '^', 106, 6, 48)
    assert tokens[44] == Token(TokenKind.BACK_SLASH, '\\', 107, 6, 49)
    assert tokens[45] == Token(TokenKind.BACK_QUOTE, '`', 108, 6, 50)
    assert tokens[46] == Token(TokenKind.EQUALSEQUALS, '==', 109, 6, 51)
    assert tokens[47] == Token(TokenKind.HASH, '#', 111, 6, 53)
    assert tokens[48] == Token(TokenKind.VAR_DEF, '__var__10', 112, 6, 54)
    assert tokens[49] == Token(TokenKind.EOF, '', 121, 6, 63)
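

# Illustrative sketch, not part of the original suite: Tokenizer is iterable
# (see list(Tokenizer(source)) above), so layout tokens can be filtered lazily.
# `significant_tokens` is a hypothetical helper name, not a core.tokenizer API.
def significant_tokens(source):
    """Yield tokens with WHITESPACE/NEWLINE layout tokens stripped."""
    layout = (TokenKind.WHITESPACE, TokenKind.NEWLINE)
    return (tok for tok in Tokenizer(source) if tok.type not in layout)
    # e.g. [t.type for t in significant_tokens("10.15 10")]
    #   -> [TokenKind.NUMBER, TokenKind.NUMBER, TokenKind.EOF]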
("ident_123", True), ("ident-like-this", True), ("àèùéû", True), ("011254", False), ("0abcd", False), ("-abcd", False) ]) def test_i_can_tokenize_identifiers(text, expected): tokens = list(Tokenizer(text)) comparison = tokens[0].type == TokenKind.IDENTIFIER assert comparison == expected @pytest.mark.parametrize("text", [ "123abc", "123", "abc", "abc123" ]) def test_i_can_parse_word(text): tokens = list(Tokenizer(text, parse_word=True)) assert tokens[0].type == TokenKind.WORD assert tokens[0].value == text assert tokens[1].index == len(text) @pytest.mark.parametrize("text", [ "__var__0", "__var__1", "__var__10", "__var__999", ]) def test_i_can_parse_var_def(text): tokens = list(Tokenizer(text)) assert len(tokens) == 2 assert tokens[0].type == TokenKind.VAR_DEF assert tokens[0].value == text @pytest.mark.parametrize("text, message, error_text, index, line, column", [ ("'string", "Missing Trailing quote", "'string", 7, 1, 8), ('"string', "Missing Trailing quote", '"string', 7, 1, 8), ('"a" + "string', "Missing Trailing quote", '"string', 13, 1, 14), ('"a"\n\n"string', "Missing Trailing quote", '"string', 12, 3, 8), ("c::", "Concept identifiers not found", "", 2, 1, 3), ("c:foo\nbar:", "New line in concept name", "foo", 5, 1, 6), ("c:foo", "Missing ending colon", "foo", 5, 1, 6) ]) def test_i_can_detect_tokenizer_errors(text, message, error_text, index, line, column): with pytest.raises(LexerError) as e: list(Tokenizer(text)) assert e.value.message == message assert e.value.text == error_text assert e.value.index == index assert e.value.line == line assert e.value.column == column @pytest.mark.parametrize("text, expected_text, expected_newlines", [ ("'foo'", "'foo'", 0), ('"foo"', '"foo"', 0), ("'foo\rbar'", "'foo\rbar'", 1), ("'foo\nbar'", "'foo\nbar'", 1), ("'foo\n\rbar'", "'foo\n\rbar'", 1), ("'foo\r\nbar'", "'foo\r\nbar'", 1), ("'foo\r\rbar'", "'foo\r\rbar'", 2), ("'foo\n\nbar'", "'foo\n\nbar'", 2), ("'foo\r\n\n\rbar'", "'foo\r\n\n\rbar'", 2), ("'\rfoo\rbar\r'", "'\rfoo\rbar\r'", 3), ("'\nfoo\nbar\n'", "'\nfoo\nbar\n'", 3), ("'\n\rfoo\r\n'", "'\n\rfoo\r\n'", 2), (r"'foo\'bar'", r"'foo\'bar'", 0), (r'"foo\"bar"', r'"foo\"bar"', 0), ('"foo"bar"', '"foo"', 0), ("'foo'bar'", "'foo'", 0), ]) def test_i_can_parse_strings(text, expected_text, expected_newlines): lexer = Tokenizer(text) text_found, nb_of_newlines = lexer.eat_string(0, 1, 1) assert nb_of_newlines == expected_newlines assert text_found == expected_text @pytest.mark.parametrize("text", [ "1", "3.1415", "0.5", "01", "-5", "-5.10" ]) def test_i_can_parse_numbers(text): tokens = list(Tokenizer(text)) assert tokens[0].type == TokenKind.NUMBER assert tokens[0].value == text @pytest.mark.parametrize("text, expected", [ ("def", Keywords.DEF), ("concept", Keywords.CONCEPT), ("as", Keywords.AS), ("pre", Keywords.PRE), ("post", Keywords.POST) ]) def test_i_can_recognize_keywords(text, expected): tokens = list(Tokenizer(text)) assert tokens[0].type == TokenKind.KEYWORD assert tokens[0].value == expected @pytest.mark.parametrize("text, expected", [ ("c:key:", ("key", None)), ("c:key|id:", ("key", "id")), ("c:key|:", ("key", None)), ("c:|id:", (None, "id")), ("c:125:", ("125", None)), ]) def test_i_can_parse_concept_token(text, expected): tokens = list(Tokenizer(text)) assert tokens[0].type == TokenKind.CONCEPT assert tokens[0].value == expected