diff --git a/CMakeLists.txt b/CMakeLists.txt index 0ec3986..7cbfb71 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -64,6 +64,7 @@ set(ARCHEUS_STD_SOURCES src/std/io.c src/std/lexer.c src/std/parser.c + src/std/parser/parserlang.c src/std/queue.c src/std/stack.c src/std/string.c @@ -130,6 +131,7 @@ if(ARCHEUS_STD_TESTS) #tests/std/vector.c tests/std/lexer.c tests/std/parser.c + tests/std/temp_parserlang.c ${ARCHEUS_STD_SOURCES} ) diff --git a/include/arc/std/parser/parserlang.h b/include/arc/std/parser/parserlang.h index 5f75287..6b1fc6d 100644 --- a/include/arc/std/parser/parserlang.h +++ b/include/arc/std/parser/parserlang.h @@ -5,9 +5,35 @@ extern "C" { #endif -#include -#include -#include +#include "arc/std/parser.h" + +/* + <line> -> <body> NEWLINE <line> | <body> | NEWLINE <line> | LAMBDA + <body> -> <tag> WHITESPACE ARROW WHITESPACE <arguments> + + <arguments> -> <argument> WHITESPACE OR WHITESPACE <arguments> | <argument> + <argument> -> <tagOrConstant> WHITESPACE <argument> | <tagOrConstant> + <tagOrConstant> -> <tag> | <constant> + + <constant> -> ALPHA_UPPER_CHAR <constantBody> + <constantBody> -> <constantChar> <constantBody> | LAMBDA + <constantChar> -> ALPHA_UPPER_CHAR | UNDERSCORE + + <tag> -> LESS_THAN <variable> GREATER_THAN + <variable> -> <alphaChar> <variableBody> | UNDERSCORE <variableBody> + <variableBody> -> <variableChar> <variableBody> | LAMBDA + <variableChar> -> <alphaChar> | NUMBER | UNDERSCORE + <alphaChar> -> ALPHA_LOWER_CHAR | ALPHA_UPPER_CHAR +*/ + +/* + * @brief creates a parser set up with the Parser Lang grammar written out in the comment above + * + * @note the lexer token rules and the grammar rules of the Parser Lang are both handed to ARC_Parser_Create by this call + * + * @param[out] parser the parser to create, NOTE(review): presumably released with ARC_Parser_Destroy, confirm in parser.h +*/ +void ARC_Parser_CreateAsParserLang(ARC_Parser **parser); #define ARC_PARSERLANG_TOKEN_NULL 0 #define ARC_PARSERLANG_TOKEN_NUMBER 1 @@ -44,91 +70,6 @@ extern "C" { #define ARC_PARSERLANG_VARIABLE_CHAR 22 #define ARC_PARSERLANG_ALPHA_CHAR 23 -void ARC_Language_InitLexerRulesFn(ARC_Lexer *lexer){ - //null - ARC_Lexer_RegisterTokenRule(lexer, ARC_LexerTokenRule_CreateAndReturnMatchCharRule(ARC_PARSERLANG_TOKEN_NULL, 0)); - - //number - ARC_Lexer_RegisterTokenRule(lexer, ARC_LexerTokenRule_CreateAndReturnMatchCharOrBetween(ARC_PARSERLANG_TOKEN_NUMBER, '0', '9')); - - //alpha char - ARC_Lexer_RegisterTokenRule(lexer, ARC_LexerTokenRule_CreateAndReturnMatchCharOrBetween(ARC_PARSERLANG_TOKEN_ALPHA_LOWER_CHAR, 'a', 'z')); - ARC_Lexer_RegisterTokenRule(lexer, 
ARC_LexerTokenRule_CreateAndReturnMatchCharOrBetween(ARC_PARSERLANG_TOKEN_ALPHA_UPPER_CHAR, 'A', 'Z')); - - //whitespace - ARC_String *whitespaceString; - ARC_String_CreateWithStrlen(&whitespaceString, " \t"); - ARC_Lexer_RegisterTokenRule(lexer, ARC_LexerTokenRule_CreateAndReturnMatchCharInStringRule(ARC_LEXER_TOKEN_WHITESPACE, whitespaceString)); - ARC_String_Destroy(whitespaceString); - - //single char tokens - ARC_Lexer_RegisterTokenRule(lexer, ARC_LexerTokenRule_CreateAndReturnMatchCharRule(ARC_PARSERLANG_TOKEN_NEWLINE_ID , ARC_PARSERLANG_TOKEN_NEWLINE_CHAR )); - ARC_Lexer_RegisterTokenRule(lexer, ARC_LexerTokenRule_CreateAndReturnMatchCharRule(ARC_PARSERLANG_TOKEN_LESS_THAN_ID , ARC_PARSERLANG_TOKEN_LESS_THAN_CHAR )); - ARC_Lexer_RegisterTokenRule(lexer, ARC_LexerTokenRule_CreateAndReturnMatchCharRule(ARC_PARSERLANG_TOKEN_GREATER_THAN_ID, ARC_PARSERLANG_TOKEN_GREATER_THAN_CHAR)); - ARC_Lexer_RegisterTokenRule(lexer, ARC_LexerTokenRule_CreateAndReturnMatchCharRule(ARC_PARSERLANG_TOKEN_OR_ID , ARC_PARSERLANG_TOKEN_OR_CHAR )); - ARC_Lexer_RegisterTokenRule(lexer, ARC_LexerTokenRule_CreateAndReturnMatchCharRule(ARC_PARSERLANG_TOKEN_UNDERSCORE_ID , ARC_PARSERLANG_TOKEN_UNDERSCORE_CHAR )); - - //arrow - ARC_String *arrowString; - ARC_String_CreateWithStrlen(&arrowString, ARC_PARSERLANG_TOKEN_ARROW_CSTRING); - ARC_Lexer_RegisterTokenRule(lexer, ARC_LexerTokenRule_CreateAndReturnMatchStringRule(ARC_PARSERLANG_TOKEN_ARROW_ID, arrowString)); - ARC_String_Destroy(arrowString); -} - -/* - -> NEWLINE | | NEWLINE | LAMBDA - -> WHITESPACE ARROW - - -> WHITESPACE OR WHITESPACE | - -> WHITESPACE | - -> | - - -> ALPHA_UPPER_CHAR - -> | LAMBDA - -> ALPHA_UPPER_CHAR | UNDERSCORE - - -> LESS_THAN GREATER_THAN - -> | UNDERSCORE - -> | LAMBDA - -> | NUMBER | UNDERSCORE - -> ALPHA_LOWER_CHAR | ALPHA_UPPER_CHAR -*/ - -// -> | LAMBDA -uint32_t *variableBody[] = { (uint32_t[]){ 2, ARC_PARSERLANG_VARIABLE_CHAR, ARC_PARSERLANG_VARIABLE_BODY }, (uint32_t[]){ 1, ARC_PARSERLANG_LAMBDA }}; - 
-// -> | NUMBER | UNDERSCORE -uint32_t *variableChar[] = { (uint32_t[]){ 1, ARC_PARSERLANG_ALPHA_CHAR }, (uint32_t[]){ 1, ARC_PARSERLANG_TOKEN_NUMBER }, (uint32_t[]){ 1, ARC_PARSERLANG_TOKEN_NUMBER }}; - -// -> ALPHA_LOWER_CHAR | ALPHA_UPPER_CHAR -uint32_t *alphaChar[] = { (uint32_t[]){ 1, ARC_PARSERLANG_TOKEN_ALPHA_LOWER_CHAR }, (uint32_t[]){ 1, ARC_PARSERLANG_TOKEN_ALPHA_UPPER_CHAR }}; - -/* -ARC_ParserLanguageTag testTags[3] = { - { - VARIABLE, //tagId - variableTokensOrTags, //tokensOrTags - 1 //tokenOrTagsSize - }, - { - VARIABLE_NAME, //tagId - variableNameTags, //tokensOrTags - 2 //tokenOrTagsSize - }, - { - CHAR_OR_NUM, //tagId - charOrNumTokens, //tokensOrTags - 2 //tokenOrTagsSize - } -}; - -ARC_Array languageArray = { - 3, //size - testTags //data -}; -*/ - - #ifdef __cplusplus } #endif diff --git a/src/std/lexer.c b/src/std/lexer.c index e2f48d8..8a2cce6 100644 --- a/src/std/lexer.c +++ b/src/std/lexer.c @@ -176,11 +176,6 @@ void ARC_Lexer_LexString(ARC_Lexer *lexer, ARC_String **data){ //check if the token rule is found ARC_LexerTokenRule *tokenRule = ARC_Vector_Get(lexer->tokenRules, index); - //set the last token length if the last token had a length - if(tokenLength > 0){ - lastTokenLength = tokenLength; - } - //tokenData should only exist if tokenLength is ARC_True as stated in the header ARC_String *tokenData; tokenLength = tokenRule->automataFn(&tokenData, *data, tokenRule->automataData); @@ -201,6 +196,9 @@ void ARC_Lexer_LexString(ARC_Lexer *lexer, ARC_String **data){ token = (ARC_LexerToken *)malloc(sizeof(ARC_LexerToken)); token->rule = tokenRule->id; token->data = tokenData; + + //update the last found tokenLength to the max length + lastTokenLength = tokenLength; } } @@ -415,6 +413,7 @@ ARC_LexerTokenRule ARC_LexerTokenRule_CreateAndReturnMatchCharOrBetween(uint32_t //return the created tokenRule return tokenRule; } + //private function to free automataData stored as an ARC_String void ARC_LexerTokenRule_DestroyStringAutomataDataFn(void 
*automataData){
     ARC_String_Destroy((ARC_String *)automataData);
diff --git a/src/std/parser/parserlang.c b/src/std/parser/parserlang.c
new file mode 100644
index 0000000..235b57e
--- /dev/null
+++ b/src/std/parser/parserlang.c
@@ -0,0 +1,118 @@
+#include "arc/std/parser/parserlang.h"
+#include "arc/std/lexer.h"
+#include "arc/std/parser.h"
+#include "arc/std/string.h"
+
+/*
+ * @brief registers the token rules used by the Parser Lang grammar to a lexer
+ *
+ * @note the temporary strings passed to the rule creators are destroyed right after registering,
+ *       NOTE(review): this presumes the rule creators copy the string, confirm in lexer.c
+ *
+ * @param[in] lexer the lexer to register the token rules to
+*/
+void ARC_ParserLang_InitLexerRulesFn(ARC_Lexer *lexer){
+    //null
+    ARC_Lexer_RegisterTokenRule(lexer, ARC_LexerTokenRule_CreateAndReturnMatchCharRule(ARC_PARSERLANG_TOKEN_NULL, 0));
+
+    //number
+    ARC_Lexer_RegisterTokenRule(lexer, ARC_LexerTokenRule_CreateAndReturnMatchCharOrBetween(ARC_PARSERLANG_TOKEN_NUMBER, '0', '9'));
+
+    //alpha char
+    ARC_Lexer_RegisterTokenRule(lexer, ARC_LexerTokenRule_CreateAndReturnMatchCharOrBetween(ARC_PARSERLANG_TOKEN_ALPHA_LOWER_CHAR, 'a', 'z'));
+    ARC_Lexer_RegisterTokenRule(lexer, ARC_LexerTokenRule_CreateAndReturnMatchCharOrBetween(ARC_PARSERLANG_TOKEN_ALPHA_UPPER_CHAR, 'A', 'Z'));
+
+    //whitespace, registered under the same id the grammar rules reference (ARC_PARSERLANG_TOKEN_WHITESPACE)
+    ARC_String *whitespaceString;
+    ARC_String_CreateWithStrlen(&whitespaceString, " \t");
+    ARC_Lexer_RegisterTokenRule(lexer, ARC_LexerTokenRule_CreateAndReturnMatchCharInStringRule(ARC_PARSERLANG_TOKEN_WHITESPACE, whitespaceString));
+    ARC_String_Destroy(whitespaceString);
+
+    //single char tokens
+    ARC_Lexer_RegisterTokenRule(lexer, ARC_LexerTokenRule_CreateAndReturnMatchCharRule(ARC_PARSERLANG_TOKEN_NEWLINE_ID     , ARC_PARSERLANG_TOKEN_NEWLINE_CHAR     ));
+    ARC_Lexer_RegisterTokenRule(lexer, ARC_LexerTokenRule_CreateAndReturnMatchCharRule(ARC_PARSERLANG_TOKEN_LESS_THAN_ID   , ARC_PARSERLANG_TOKEN_LESS_THAN_CHAR   ));
+    ARC_Lexer_RegisterTokenRule(lexer, ARC_LexerTokenRule_CreateAndReturnMatchCharRule(ARC_PARSERLANG_TOKEN_GREATER_THAN_ID, ARC_PARSERLANG_TOKEN_GREATER_THAN_CHAR));
+    ARC_Lexer_RegisterTokenRule(lexer, ARC_LexerTokenRule_CreateAndReturnMatchCharRule(ARC_PARSERLANG_TOKEN_OR_ID          , ARC_PARSERLANG_TOKEN_OR_CHAR          ));
+    ARC_Lexer_RegisterTokenRule(lexer, ARC_LexerTokenRule_CreateAndReturnMatchCharRule(ARC_PARSERLANG_TOKEN_UNDERSCORE_ID  , ARC_PARSERLANG_TOKEN_UNDERSCORE_CHAR  ));
+
+    //arrow
+    ARC_String *arrowString;
+    ARC_String_CreateWithStrlen(&arrowString, ARC_PARSERLANG_TOKEN_ARROW_CSTRING);
+    ARC_Lexer_RegisterTokenRule(lexer, ARC_LexerTokenRule_CreateAndReturnMatchStringRule(ARC_PARSERLANG_TOKEN_ARROW_ID, arrowString));
+    ARC_String_Destroy(arrowString);
+}
+
+/*
+ * @brief creates a parser whose grammar is the Parser Lang itself
+ *
+ * @note each rule below is a list of alternatives, and each alternative is an array that starts with
+ *       its symbol count followed by that many token or tag ids
+ * @note the alternative count stored in parserLangTags has to match the number of entries in the rule
+ * @note NOTE(review): the rule tables live on this function's stack, so this presumes ARC_Parser_Create
+ *       copies the language before returning, confirm in parser.c
+ *
+ * @param[out] parser the parser to create
+*/
+void ARC_Parser_CreateAsParserLang(ARC_Parser **parser){
+    // <line> -> <body> NEWLINE <line> | <body> | NEWLINE <line> | LAMBDA
+    uint32_t *line[] = { (uint32_t[]){ 3, ARC_PARSERLANG_BODY, ARC_PARSERLANG_TOKEN_NEWLINE_ID, ARC_PARSERLANG_LINE }, (uint32_t[]){ 1, ARC_PARSERLANG_BODY }, (uint32_t[]){ 2, ARC_PARSERLANG_TOKEN_NEWLINE_ID, ARC_PARSERLANG_LINE }, (uint32_t[]){ 1, ARC_PARSERLANG_LAMBDA } };
+
+    // <body> -> <tag> WHITESPACE ARROW WHITESPACE <arguments>
+    uint32_t *body[] = { (uint32_t[]){ 5, ARC_PARSERLANG_TAG, ARC_PARSERLANG_TOKEN_WHITESPACE, ARC_PARSERLANG_TOKEN_ARROW_ID, ARC_PARSERLANG_TOKEN_WHITESPACE, ARC_PARSERLANG_ARGUMENTS } };
+
+    // <arguments> -> <argument> WHITESPACE OR WHITESPACE <arguments> | <argument>
+    uint32_t *arguments[] = { (uint32_t[]){ 5, ARC_PARSERLANG_ARGUMENT, ARC_PARSERLANG_TOKEN_WHITESPACE, ARC_PARSERLANG_TOKEN_OR_ID, ARC_PARSERLANG_TOKEN_WHITESPACE, ARC_PARSERLANG_ARGUMENTS }, (uint32_t[]){ 1, ARC_PARSERLANG_ARGUMENT } };
+
+    // <argument> -> <tagOrConstant> WHITESPACE <argument> | <tagOrConstant>
+    uint32_t *argument[] = { (uint32_t[]){ 3, ARC_PARSERLANG_TAG_OR_CONSTANT, ARC_PARSERLANG_TOKEN_WHITESPACE, ARC_PARSERLANG_ARGUMENT }, (uint32_t[]){ 1, ARC_PARSERLANG_TAG_OR_CONSTANT } };
+
+    // <tagOrConstant> -> <tag> | <constant>
+    uint32_t *tagOrConstant[] = { (uint32_t[]){ 1, ARC_PARSERLANG_TAG }, (uint32_t[]){ 1, ARC_PARSERLANG_CONSTANT } };
+
+    // <constant> -> ALPHA_UPPER_CHAR <constantBody>
+    uint32_t *constant[] = { (uint32_t[]){ 2, ARC_PARSERLANG_TOKEN_ALPHA_UPPER_CHAR, ARC_PARSERLANG_CONSTANT_BODY } };
+
+    // <constantBody> -> <constantChar> <constantBody> | LAMBDA
+    uint32_t *constantBody[] = { (uint32_t[]){ 2, ARC_PARSERLANG_CONSTANT_CHAR, ARC_PARSERLANG_CONSTANT_BODY }, (uint32_t[]){ 1, ARC_PARSERLANG_LAMBDA } };
+
+    // <constantChar> -> ALPHA_UPPER_CHAR | UNDERSCORE
+    uint32_t *constantChar[] = { (uint32_t[]){ 1, ARC_PARSERLANG_TOKEN_ALPHA_UPPER_CHAR }, (uint32_t[]){ 1, ARC_PARSERLANG_TOKEN_UNDERSCORE_ID } };
+
+    // <tag> -> LESS_THAN <variable> GREATER_THAN
+    uint32_t *tag[] = { (uint32_t[]){ 3, ARC_PARSERLANG_TOKEN_LESS_THAN_ID, ARC_PARSERLANG_VARIABLE, ARC_PARSERLANG_TOKEN_GREATER_THAN_ID } };
+
+    // <variable> -> <alphaChar> <variableBody> | UNDERSCORE <variableBody>
+    uint32_t *variable[] = { (uint32_t[]){ 2, ARC_PARSERLANG_ALPHA_CHAR, ARC_PARSERLANG_VARIABLE_BODY }, (uint32_t[]){ 2, ARC_PARSERLANG_TOKEN_UNDERSCORE_ID, ARC_PARSERLANG_VARIABLE_BODY } };
+
+    // <variableBody> -> <variableChar> <variableBody> | LAMBDA
+    uint32_t *variableBody[] = { (uint32_t[]){ 2, ARC_PARSERLANG_VARIABLE_CHAR, ARC_PARSERLANG_VARIABLE_BODY }, (uint32_t[]){ 1, ARC_PARSERLANG_LAMBDA } };
+
+    // <variableChar> -> <alphaChar> | NUMBER | UNDERSCORE
+    uint32_t *variableChar[] = { (uint32_t[]){ 1, ARC_PARSERLANG_ALPHA_CHAR }, (uint32_t[]){ 1, ARC_PARSERLANG_TOKEN_NUMBER }, (uint32_t[]){ 1, ARC_PARSERLANG_TOKEN_UNDERSCORE_ID } };
+
+    // <alphaChar> -> ALPHA_LOWER_CHAR | ALPHA_UPPER_CHAR
+    uint32_t *alphaChar[] = { (uint32_t[]){ 1, ARC_PARSERLANG_TOKEN_ALPHA_LOWER_CHAR }, (uint32_t[]){ 1, ARC_PARSERLANG_TOKEN_ALPHA_UPPER_CHAR }};
+
+    ARC_ParserLanguageTag parserLangTags[13] = {
+        { ARC_PARSERLANG_LINE           , line         , 4 },
+        { ARC_PARSERLANG_BODY           , body         , 1 },
+        { ARC_PARSERLANG_ARGUMENTS      , arguments    , 2 },
+        { ARC_PARSERLANG_ARGUMENT       , argument     , 2 },
+        { ARC_PARSERLANG_TAG_OR_CONSTANT, tagOrConstant, 2 },
+        { ARC_PARSERLANG_CONSTANT       , constant     , 1 },
+        { ARC_PARSERLANG_CONSTANT_BODY  , constantBody , 2 },
+        { ARC_PARSERLANG_CONSTANT_CHAR  , constantChar , 2 },
+        { ARC_PARSERLANG_TAG            , tag          , 1 },
+        { ARC_PARSERLANG_VARIABLE       , variable     , 2 },
+        { ARC_PARSERLANG_VARIABLE_BODY  , variableBody , 2 },
+        { ARC_PARSERLANG_VARIABLE_CHAR  , variableChar , 3 },
+        { ARC_PARSERLANG_ALPHA_CHAR     , alphaChar    , 2 }
+    };
+
+    ARC_Array parserLanguageArray = {
+        13,            //size
+        parserLangTags //data
+    };
+
+    ARC_Parser_Create(parser, &parserLanguageArray, ARC_ParserLang_InitLexerRulesFn);
+}
diff --git a/tests/std/parser.c
b/tests/std/parser.c index c1edbfe..d7f27a2 100644 --- a/tests/std/parser.c +++ b/tests/std/parser.c @@ -1,8 +1,8 @@ #include "../test.h" #include "arc/std/errno.h" #include "arc/std/parser.h" +#include "arc/std/parser/parserlang.h" -//TODO: fix lambda #define LAMBDA ARC_PARSER_TAG_LAMBDA #define CHAR ARC_LEXER_TOKEN_ALPHALOWERCHAR #define NUM ARC_LEXER_TOKEN_NUMBER @@ -150,4 +150,23 @@ ARC_TEST(Parser_Basic_ParseError){ ARC_Parser_Parse(parser, &tempString); ARC_CHECK(arc_errno == ARC_ERRNO_DATA); + + //reset for next test + arc_errno = 0; } + +//ARC_TEST(Parser_ParserLang_BasicTest){ +// ARC_Parser *parser; +// ARC_Parser_CreateAsParserLang(&parser); +// +// ARC_String *tempString; +// ARC_String_CreateWithStrlen(&tempString, " -> \n"); +// +// //this destroys string, so no need for cleanup +// ARC_Parser_Parse(parser, &tempString); +// +// ARC_Parser_Destroy(parser); +// +// ARC_CHECK(arc_errno == 0); +//} +// \ No newline at end of file