diff --git a/CMakeLists.txt b/CMakeLists.txt index 55ec589..0ec3986 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -128,7 +128,7 @@ if(ARCHEUS_STD_TESTS) tests/test.c #tests/std/vector.c - #tests/std/lexer.c + tests/std/lexer.c tests/std/parser.c ${ARCHEUS_STD_SOURCES} diff --git a/include/arc/std/lexer.h b/include/arc/std/lexer.h index 248a710..2544ef8 100644 --- a/include/arc/std/lexer.h +++ b/include/arc/std/lexer.h @@ -4,7 +4,6 @@ #ifdef __cplusplus extern "C" { #endif - #include "arc/std/string.h" #include @@ -115,6 +114,16 @@ ARC_LexerToken ARC_Lexer_GetToken(ARC_Lexer *lexer, uint32_t index); */ uint32_t ARC_Lexer_GetTokensSize(ARC_Lexer *lexer); +/** + * @brief returns a boolean based on if a given id is a stored token rule id + * + * @param[in] lexer the lexer to check stored token rule ids + * @param[in] id the id to check against the token rules + * + * @return ARC_True if the id is a rule id, ARC_False otherwise +*/ +ARC_Bool ARC_Lexer_IsTokenId(ARC_Lexer *lexer, uint32_t id); + /** * @brief checks if the first character of string matches the automataData cast as a char * diff --git a/include/arc/std/parser.h b/include/arc/std/parser.h index 0768d54..a78dd58 100644 --- a/include/arc/std/parser.h +++ b/include/arc/std/parser.h @@ -77,10 +77,10 @@ void ARC_Parser_Destroy(ARC_Parser *parser); /** * @brief * - * @param[in] parser - * @param[in] language + * @param[in] parser + * @param[in,out] data the string to parse, will be freed and set to NULL by the end of this function */ -void ARC_Parser_Parse(ARC_Parser *parser, ARC_String *data); +void ARC_Parser_Parse(ARC_Parser *parser, ARC_String **data); /** * @brief diff --git a/src/std/lexer.c b/src/std/lexer.c index 84396a9..ac39e2f 100644 --- a/src/std/lexer.c +++ b/src/std/lexer.c @@ -10,6 +10,10 @@ struct ARC_Lexer { ARC_Vector *tokenRules; ARC_Vector *tokens; + + //these are used for checking if a uint32_t is a token rule id, if token rules are continuous we can just check the max token value + ARC_Bool 
tokenRulesAreContinuous; + uint32_t tokenRulesMaxVal; }; //private function for checking if two lexer token rules are the same in a vector (based on id) @@ -50,6 +54,10 @@ void ARC_Lexer_Create(ARC_Lexer **lexer){ //setup tokens vector with delete funtion, we don't want a deleteDataFn because their index will be used as the id ARC_Vector_DestroyDataFn tokenVectorDestroyDataFn = ARC_LexerToken_VectorDestroyDataFn; ARC_Vector_Create(&(*lexer)->tokens, NULL, &tokenVectorDestroyDataFn); + + //set token rules to continuous and initialize the token rules max value + (*lexer)->tokenRulesAreContinuous = ARC_True; + (*lexer)->tokenRulesMaxVal = 0; } void ARC_Lexer_Destroy(ARC_Lexer *lexer){ @@ -74,6 +82,73 @@ void ARC_Lexer_RegisterTokenRule(ARC_Lexer *lexer, ARC_LexerTokenRule tokenRule) ARC_DEBUG_LOG_ERROR("ARC_Lexer_RegisterTokenRule(lexer, tokenRule), errored when running ARC_Vector_Add(lexer->tokenRules, storedTokenRule);. check logs for more info"); free(storedTokenRule); } + + //check if the value still is continuous + if(lexer->tokenRulesAreContinuous == ARC_True){ + //if it is already continuous we just check if it is one value above the tokens already in the vector + for(uint32_t tokenRuleIndex = ARC_Vector_GetSize(lexer->tokenRules) - 1; tokenRuleIndex > 0; tokenRuleIndex--){ + //get the current token rule + ARC_LexerTokenRule *currentTokenRule = (ARC_LexerTokenRule *)ARC_Vector_Get(lexer->tokenRules, tokenRuleIndex - 1); + + //check if the token rule is continuous (then next max value by one) + if(tokenRule.id - currentTokenRule->id == 1){ + //the token rule is already continuous so we can update the max value and return + lexer->tokenRulesMaxVal = tokenRule.id; + return; + } + } + + //the token is no longer continuous, NOTE(review): when the vector holds only the rule just added the loop above runs zero times, so presumably the first registered rule also clears the flag here, confirm this is intended + lexer->tokenRulesAreContinuous = ARC_False; + return; + } + + //check to see if this value makes the token rule continuous again + //TODO: might want to optimize this + uint32_t minValue = ~(uint32_t)0; + for(uint32_t tokenRuleIndex = 0; 
tokenRuleIndex < ARC_Vector_GetSize(lexer->tokenRules); tokenRuleIndex++){ + //get the current token rule + ARC_LexerTokenRule *currentTokenRule = (ARC_LexerTokenRule *)ARC_Vector_Get(lexer->tokenRules, tokenRuleIndex); + + //check each token to find the minimum one + if(currentTokenRule->id < minValue){ + minValue = currentTokenRule->id; + } + } + + //loop through until either all the values are checked and in order or the token rule is not continuous + //TODO: might want to optimize this + for(uint32_t foundSize = 0; foundSize != ARC_Vector_GetSize(lexer->tokenRules); foundSize++){ + //check all current rules + ARC_Bool currentAreContinuous = ARC_False; + for(uint32_t tokenRuleIndex = 0; tokenRuleIndex < ARC_Vector_GetSize(lexer->tokenRules); tokenRuleIndex++){ + //get the current token rule + ARC_LexerTokenRule *currentTokenRule = (ARC_LexerTokenRule *)ARC_Vector_Get(lexer->tokenRules, tokenRuleIndex); + + //check if the value is smaller than or equal to the minimum value and if it is we can skip it + if(currentTokenRule->id <= minValue){ + continue; + } + + //check if the value is continuous + if(currentTokenRule->id - minValue == 1){ + //set the token rule max val to the next most continuous value + lexer->tokenRulesMaxVal = currentTokenRule->id; + + //set the next smallest value to check to the next most continuous value + minValue = currentTokenRule->id; + currentAreContinuous = ARC_True; + break; + } + } + + //the current values are not continuous so we can return as token rules are continuous is already set to false + if(currentAreContinuous == ARC_False){ + return; + } + + //a continuous value was found so loop to next value, NOTE(review): nothing visible in this hunk sets tokenRulesAreContinuous back to ARC_True once the loop finishes, confirm + } } void ARC_Lexer_LexString(ARC_Lexer *lexer, ARC_String **data){ @@ -198,6 +273,23 @@ uint32_t ARC_Lexer_GetTokensSize(ARC_Lexer *lexer){ return ARC_Vector_GetSize(lexer->tokens); } +ARC_Bool ARC_Lexer_IsTokenId(ARC_Lexer *lexer, uint32_t id){ + //if the rules are continuous we can just check if it is less than or equal to the max rules 
value + if(lexer->tokenRulesAreContinuous == ARC_True){ + return id <= lexer->tokenRulesMaxVal; + } + + //the rules are not continuous so we need to check each individually + for(uint32_t index = 0; index < ARC_Vector_GetSize(lexer->tokenRules); index++){ + ARC_LexerTokenRule *currentTokenRule = (ARC_LexerTokenRule *)ARC_Vector_Get(lexer->tokenRules, index); + if(currentTokenRule->id == id){ + return ARC_True; + } + } + + return ARC_False; +} + uint32_t ARC_Lexer_AutomataMatchCharFn(ARC_String **tokenData, ARC_String *string, void *automataData){ //if there is a match the token will be the same as automataData, so we don't need to store it again *tokenData = NULL; @@ -321,6 +413,7 @@ ARC_LexerTokenRule ARC_LexerTokenRule_CreateAndReturnMatchCharInStringRule(uint3 } void ARC_Lexer_InitBasicTokenRules(ARC_Lexer *lexer){ + ARC_Lexer_RegisterTokenRule(lexer, ARC_LexerTokenRule_CreateAndReturnMatchCharRule(ARC_LEXER_TOKEN_NULL , 0 )); ARC_Lexer_RegisterTokenRule(lexer, ARC_LexerTokenRule_CreateAndReturnMatchCharRule(ARC_LEXER_TOKEN_COLON_ID , ARC_LEXER_TOKEN_COLON_CHAR )); ARC_Lexer_RegisterTokenRule(lexer, ARC_LexerTokenRule_CreateAndReturnMatchCharRule(ARC_LEXER_TOKEN_SEMICOLON_ID , ARC_LEXER_TOKEN_SEMICOLON_CHAR )); ARC_Lexer_RegisterTokenRule(lexer, ARC_LexerTokenRule_CreateAndReturnMatchCharRule(ARC_LEXER_TOKEN_COMMA_ID , ARC_LEXER_TOKEN_COMMA_CHAR )); diff --git a/src/std/parser.c b/src/std/parser.c index 93ff500..75ae49c 100644 --- a/src/std/parser.c +++ b/src/std/parser.c @@ -1,7 +1,8 @@ #include "arc/std/parser.h" +#include "arc/std/bool.h" #include "arc/std/errno.h" #include "arc/std/lexer.h" -#include "arc/std/vector.h" +//#include "arc/std/vector.h" #include #include #include @@ -46,7 +47,7 @@ void ARC_Parser_Destroy(ARC_Parser *parser){ } //private recusive function to parse a tag -void ARC_Parser_ParseTag(ARC_Parser *parser, uint32_t lexerIndex, uint32_t tagId){ +void ARC_Parser_ParseTag(ARC_Parser *parser, uint32_t *lexerIndex, uint32_t tagId){ //get the 
current tag ARC_ParserLanguageTag *tag = NULL; for(uint32_t index = 0; index < parser->language.size; index++){ @@ -67,16 +68,54 @@ void ARC_Parser_ParseTag(ARC_Parser *parser, uint32_t lexerIndex, uint32_t tagId //loop through each or section of the tags and tokens for(uint32_t orIndex = 0; orIndex < tag->tokensOrTagsSize; orIndex++){ //loop through each token or tag to check if the lexed data matches - uint32_t lexerCheckIndex = lexerIndex; + uint32_t lexerCheckIndex = *lexerIndex; + ARC_Bool foundRule = ARC_True; for(uint32_t tokenOrTagIndex = 1; tokenOrTagIndex < tag->tokensOrTags[orIndex][0] + 1; tokenOrTagIndex++){ - + //if the value isn't a token it is a tag, so recurse if it isn't a token, NOTE(review): the return right after the recursive call means the remaining entries of this or section are not checked, confirm this is intended + ARC_Bool isToken = ARC_Lexer_IsTokenId(parser->lexer, tag->tokensOrTags[orIndex][tokenOrTagIndex]); + if(isToken == ARC_False){ + ARC_Parser_ParseTag(parser, lexerIndex, tag->tokensOrTags[orIndex][tokenOrTagIndex]); + return; + } + + //get the next token in the lexer and increment the lexers index + ARC_LexerToken token = ARC_Lexer_GetToken(parser->lexer, lexerCheckIndex); + lexerCheckIndex++; + + //if the token rule does not match the current token in the current or statement the token rule could not be found for the current or index so break + if(token.rule != tag->tokensOrTags[orIndex][tokenOrTagIndex]){ + foundRule = ARC_False; + break; + } + } + + //if the rule is found we don't need to check anymore so we can return out + if(foundRule == ARC_True){ + *lexerIndex = lexerCheckIndex; + return; } - // } + + //no rule was found, so set an error and log + arc_errno = ARC_ERRNO_DATA; + ARC_DEBUG_LOG_ERROR_WITH_VARIABLES("ARC_Parser_ParseTag(parser, lexerIndex, tagId), tag id: %u could not find a matching rule at token index %u", tagId, *lexerIndex); } -void ARC_Parser_Parse(ARC_Parser *parser, ARC_String *data){ - +void ARC_Parser_Parse(ARC_Parser *parser, ARC_String **data){ + //lex the subdata + ARC_Lexer_LexString(parser->lexer, data); + if(arc_errno){ + 
ARC_DEBUG_LOG_ERROR("ARC_Parser_Parse(parser, data), could not lex the given data"); + } + + uint32_t lexerIndex = 0; + ARC_ParserLanguageTag startTag = ((ARC_ParserLanguageTag *)parser->language.data)[0]; + + //recursively parse from the initial start tag, NOTE(review): this still runs when the lexing above set arc_errno because that error is only logged, confirm this is intended + ARC_Parser_ParseTag(parser, &lexerIndex, startTag.tagId); + if(arc_errno){ + ARC_DEBUG_LOG_ERROR("ARC_Parser_Parse(parser, data), could not parse the given data"); + } } void ARC_Parser_ParseFile(ARC_Parser *parser, ARC_String *path){ diff --git a/tests/std/lexer.c b/tests/std/lexer.c index e78a7b8..cc46b3e 100644 --- a/tests/std/lexer.c +++ b/tests/std/lexer.c @@ -37,3 +37,60 @@ ARC_TEST(Lexer_Char_Match){ ARC_Lexer_Destroy(lexer); } + +ARC_TEST(Lexer_Check_Id_Basic){ + ARC_Lexer *lexer; + ARC_Lexer_Create(&lexer); + + ARC_Lexer_RegisterTokenRule(lexer, ARC_LexerTokenRule_CreateAndReturnMatchCharRule(0, 0 )); + ARC_Lexer_RegisterTokenRule(lexer, ARC_LexerTokenRule_CreateAndReturnMatchCharRule(1, ':')); + ARC_Lexer_RegisterTokenRule(lexer, ARC_LexerTokenRule_CreateAndReturnMatchCharRule(2, ':')); + ARC_Lexer_RegisterTokenRule(lexer, ARC_LexerTokenRule_CreateAndReturnMatchCharRule(3, ':')); + ARC_Lexer_RegisterTokenRule(lexer, ARC_LexerTokenRule_CreateAndReturnMatchCharRule(4, ':')); + + ARC_CHECK(ARC_Lexer_IsTokenId(lexer, 0) == ARC_True ); + ARC_CHECK(ARC_Lexer_IsTokenId(lexer, 7) == ARC_False); + ARC_CHECK(ARC_Lexer_IsTokenId(lexer, 2) == ARC_True ); + ARC_CHECK(ARC_Lexer_IsTokenId(lexer, 4) == ARC_True ); + ARC_CHECK(ARC_Lexer_IsTokenId(lexer, 5) == ARC_False); + + ARC_Lexer_Destroy(lexer); +} + +ARC_TEST(Lexer_Check_Id_Unordered_But_Continious){ + ARC_Lexer *lexer; + ARC_Lexer_Create(&lexer); + + ARC_Lexer_RegisterTokenRule(lexer, ARC_LexerTokenRule_CreateAndReturnMatchCharRule(2, ':')); + ARC_Lexer_RegisterTokenRule(lexer, ARC_LexerTokenRule_CreateAndReturnMatchCharRule(0, 0 )); + ARC_Lexer_RegisterTokenRule(lexer, ARC_LexerTokenRule_CreateAndReturnMatchCharRule(3, ':')); + ARC_Lexer_RegisterTokenRule(lexer, 
ARC_LexerTokenRule_CreateAndReturnMatchCharRule(1, ':')); + ARC_Lexer_RegisterTokenRule(lexer, ARC_LexerTokenRule_CreateAndReturnMatchCharRule(4, ':')); + + ARC_CHECK(ARC_Lexer_IsTokenId(lexer, 0) == ARC_True ); + ARC_CHECK(ARC_Lexer_IsTokenId(lexer, 7) == ARC_False); + ARC_CHECK(ARC_Lexer_IsTokenId(lexer, 2) == ARC_True ); + ARC_CHECK(ARC_Lexer_IsTokenId(lexer, 4) == ARC_True ); + ARC_CHECK(ARC_Lexer_IsTokenId(lexer, 5) == ARC_False); + + ARC_Lexer_Destroy(lexer); +} + +ARC_TEST(Lexer_Check_Id_Unordered_Not_Continious){ + ARC_Lexer *lexer; + ARC_Lexer_Create(&lexer); + + ARC_Lexer_RegisterTokenRule(lexer, ARC_LexerTokenRule_CreateAndReturnMatchCharRule(2, ':')); + ARC_Lexer_RegisterTokenRule(lexer, ARC_LexerTokenRule_CreateAndReturnMatchCharRule(8, 0 )); + ARC_Lexer_RegisterTokenRule(lexer, ARC_LexerTokenRule_CreateAndReturnMatchCharRule(3, ':')); + ARC_Lexer_RegisterTokenRule(lexer, ARC_LexerTokenRule_CreateAndReturnMatchCharRule(1, ':')); + ARC_Lexer_RegisterTokenRule(lexer, ARC_LexerTokenRule_CreateAndReturnMatchCharRule(4, ':')); + + ARC_CHECK(ARC_Lexer_IsTokenId(lexer, 8) == ARC_True ); + ARC_CHECK(ARC_Lexer_IsTokenId(lexer, 7) == ARC_False); + ARC_CHECK(ARC_Lexer_IsTokenId(lexer, 2) == ARC_True ); + ARC_CHECK(ARC_Lexer_IsTokenId(lexer, 4) == ARC_True ); + ARC_CHECK(ARC_Lexer_IsTokenId(lexer, 5) == ARC_False); + + ARC_Lexer_Destroy(lexer); +} diff --git a/tests/std/parser.c b/tests/std/parser.c index cf1d773..f33656d 100644 --- a/tests/std/parser.c +++ b/tests/std/parser.c @@ -12,7 +12,7 @@ void TEST_Parser_InitLexerRulesFn(ARC_Lexer *lexer){ ARC_Lexer_InitBasicTokenRules(lexer); } -ARC_TEST(Lexer_Char_Match){ +ARC_TEST(Parser_Init){ ARC_Parser *parser; uint32_t *charOrNumTokens[] = { (uint32_t[]){ 1, CHAR }, (uint32_t[]){ 1, NUM } };