From 5b2b8ef21c690b1e8dfb56c02eb0e344a8645b33 Mon Sep 17 00:00:00 2001 From: herbglitch Date: Tue, 7 Jan 2025 16:29:15 -0700 Subject: [PATCH] fixed lexer, continous should now work fairly efficiently --- src/std/lexer.c | 113 +++++++++++++++++++++++++++------------------ tests/std/lexer.c | 36 +++------------ tests/std/parser.c | 2 +- 3 files changed, 74 insertions(+), 77 deletions(-) diff --git a/src/std/lexer.c b/src/std/lexer.c index 6e35bdd..c2ee606 100644 --- a/src/std/lexer.c +++ b/src/std/lexer.c @@ -14,6 +14,7 @@ struct ARC_Lexer { //these are used for checking if an uint32_t is a value, if token rules are continuous we can just check the max token value ARC_Bool tokenRulesAreContinuous; uint32_t tokenRulesMaxVal; + uint32_t tokenRulesMinVal; }; //private function for checking if two lexer token rules are the same in a vector (based on id) @@ -63,6 +64,7 @@ void ARC_Lexer_Create(ARC_Lexer **lexer){ //set token rules to continuous and initialize the token rules max value (*lexer)->tokenRulesAreContinuous = ARC_True; (*lexer)->tokenRulesMaxVal = 0; + (*lexer)->tokenRulesMinVal = 0; } void ARC_Lexer_Destroy(ARC_Lexer *lexer){ @@ -90,19 +92,42 @@ void ARC_Lexer_RegisterTokenRule(ARC_Lexer *lexer, ARC_LexerTokenRule tokenRule) free(storedTokenRule); } + //check if first or index to init the min + if(ARC_Vector_GetSize(lexer->tokenRules) == 1){ + lexer->tokenRulesMinVal = tokenRule.id; + lexer->tokenRulesMaxVal = tokenRule.id; + + lexer->tokenRulesAreContinuous = ARC_True; + return; + } + //check if the value still is continuous if(lexer->tokenRulesAreContinuous == ARC_True){ - //if it is already continuous we just check if it is one value above the tokens already in the vector - for(uint32_t tokenRuleIndex = ARC_Vector_GetSize(lexer->tokenRules) - 1; tokenRuleIndex > 0; tokenRuleIndex--){ - //get the current token rule - ARC_LexerTokenRule *currentTokenRule = (ARC_LexerTokenRule *)ARC_Vector_Get(lexer->tokenRules, tokenRuleIndex - 1); - - //check if the token 
rule is continuous (then next max value by one) - if(tokenRule.id - currentTokenRule->id == 1){ - //the token rule is already continuous so we can update the max value and return - lexer->tokenRulesMaxVal = tokenRule.id; - return; + //check if there is a new min if the minimum is bigger than zero (min is a uint so the zero check is to prevent underflow) + if(lexer->tokenRulesMinVal != 0 && tokenRule.id < lexer->tokenRulesMinVal){ + //if the value is smaller than the min val minus one it is no longer continuous + if(lexer->tokenRulesMinVal - 1 != tokenRule.id){ + lexer->tokenRulesAreContinuous = ARC_False; } + + lexer->tokenRulesMinVal = tokenRule.id; + return; + } + + //check if the value is between the continuous range + if(tokenRule.id <= lexer->tokenRulesMaxVal && tokenRule.id >= lexer->tokenRulesMinVal){ + return; + } + + //check if there is a new max making sure not to overflow + if(lexer->tokenRulesMaxVal != ~(uint32_t)0 && tokenRule.id > lexer->tokenRulesMaxVal){ + //if the value is bigger than the max val plus one it is no longer continuous + if(lexer->tokenRulesMaxVal + 1 != tokenRule.id){ + lexer->tokenRulesAreContinuous = ARC_False; + } + + lexer->tokenRulesMaxVal = tokenRule.id; + return; } //the token is no longer continous @@ -110,52 +135,48 @@ void ARC_Lexer_RegisterTokenRule(ARC_Lexer *lexer, ARC_LexerTokenRule tokenRule){ return; } - //check to see if this value makes the token rule continuous again - //TODO: might want to optomize this - uint32_t minValue = ~(uint32_t)0; - for(uint32_t tokenRuleIndex = 0; tokenRuleIndex < ARC_Vector_GetSize(lexer->tokenRules); tokenRuleIndex++){ - //get the current token rule - ARC_LexerTokenRule *currentTokenRule = (ARC_LexerTokenRule *)ARC_Vector_Get(lexer->tokenRules, tokenRuleIndex); - - //check each token to find the minimum one - if(currentTokenRule->id < minValue){ - minValue = currentTokenRule->id; - } + //if the token is not within min max bounds, then there is no chance it will be continuous + 
if(lexer->tokenRulesMinVal != 0 && tokenRule.id < lexer->tokenRulesMinVal){ + lexer->tokenRulesMinVal = tokenRule.id; + return; } - //loop through untill either all the values are checked and in order or the token rule is not continuous - //TODO: might want to optomize this - for(uint32_t foundSize = 0; foundSize != ARC_Vector_GetSize(lexer->tokenRules); foundSize++){ - //check all current rules - ARC_Bool currentAreContinuous = ARC_False; - for(uint32_t tokenRuleIndex = 0; tokenRuleIndex < ARC_Vector_GetSize(lexer->tokenRules); tokenRuleIndex++){ + //if the token is not within min max bounds, then there is no chance it will be continuous + if(lexer->tokenRulesMaxVal != ~(uint32_t)0 && tokenRule.id > lexer->tokenRulesMaxVal){ + lexer->tokenRulesMaxVal = tokenRule.id; + return; + } + + //check to see if this value makes the token rule continuous again + uint32_t currentVal = lexer->tokenRulesMinVal; + + //minimum exists, so add one to the current value + currentVal++; + + //check to see if every index between min and max exists + for(; currentVal < lexer->tokenRulesMaxVal; currentVal++){ + ARC_Bool currentContinous = ARC_False; + + //TODO: probably want to optimize this + for(uint32_t index = 0; index < ARC_Vector_GetSize(lexer->tokenRules); index++){ //get the current token rule - ARC_LexerTokenRule *currentTokenRule = (ARC_LexerTokenRule *)ARC_Vector_Get(lexer->tokenRules, tokenRuleIndex); + ARC_LexerTokenRule *currentTokenRule = (ARC_LexerTokenRule *)ARC_Vector_Get(lexer->tokenRules, index); - //check if the value is smaller than or equal to the minimum value and if it is we can skip it - if(currentTokenRule->id <= minValue){ - continue; - } - - //check if the value is continous - if(currentTokenRule->id - minValue == 1){ - //set the token rule max val to the next most continuous value - lexer->tokenRulesMaxVal = currentTokenRule->id; - - //set the next smallest value to check to the the next most continuous value - minValue = currentTokenRule->id; - 
currentAreContinuous = ARC_True; + //check if a token matches the current needed value + if(currentTokenRule->id == currentVal){ + currentContinous = ARC_True; break; } } - //the current values are not continuous so we can return as token rules are continuous is already set to false - if(currentAreContinuous == ARC_False){ + //if it is still not continuous return + if(currentContinous == ARC_False){ return; } - - //a continuous value was found so loop to next value } + + //the tokens were all continuous + lexer->tokenRulesAreContinuous = ARC_True; } void ARC_Lexer_Clear(ARC_Lexer *lexer){ @@ -294,7 +315,7 @@ ARC_Bool ARC_Lexer_IsContinious(ARC_Lexer *lexer){ ARC_Bool ARC_Lexer_IsTokenId(ARC_Lexer *lexer, uint32_t id){ //if the rules are continuous we can just check if it is less than the max rules value if(lexer->tokenRulesAreContinuous == ARC_True){ - return id <= lexer->tokenRulesMaxVal; + return (ARC_Bool)(id >= lexer->tokenRulesMinVal && id <= lexer->tokenRulesMaxVal); } //the rules are not continuous so we need to check each individually diff --git a/tests/std/lexer.c b/tests/std/lexer.c index 8468abb..30342f1 100644 --- a/tests/std/lexer.c +++ b/tests/std/lexer.c @@ -48,6 +48,8 @@ ARC_TEST(Lexer_Check_Id_Basic){ ARC_Lexer_RegisterTokenRule(lexer, ARC_LexerTokenRule_CreateAndReturnMatchCharRule(3, ':')); ARC_Lexer_RegisterTokenRule(lexer, ARC_LexerTokenRule_CreateAndReturnMatchCharRule(4, ':')); + ARC_CHECK(ARC_Lexer_IsContinious(lexer) == ARC_True); + ARC_CHECK(ARC_Lexer_IsTokenId(lexer, 0) == ARC_True ); ARC_CHECK(ARC_Lexer_IsTokenId(lexer, 7) == ARC_False); ARC_CHECK(ARC_Lexer_IsTokenId(lexer, 2) == ARC_True ); @@ -67,6 +69,8 @@ ARC_TEST(Lexer_Check_Id_Unordered_But_Continious){ ARC_Lexer_RegisterTokenRule(lexer, ARC_LexerTokenRule_CreateAndReturnMatchCharRule(1, ':')); ARC_Lexer_RegisterTokenRule(lexer, ARC_LexerTokenRule_CreateAndReturnMatchCharRule(4, ':')); + ARC_CHECK(ARC_Lexer_IsContinious(lexer) == ARC_True); + ARC_CHECK(ARC_Lexer_IsTokenId(lexer, 0) 
== ARC_True ); ARC_CHECK(ARC_Lexer_IsTokenId(lexer, 7) == ARC_False); ARC_CHECK(ARC_Lexer_IsTokenId(lexer, 2) == ARC_True ); @@ -86,6 +90,8 @@ ARC_TEST(Lexer_Check_Id_Unordered_Not_Continious){ ARC_Lexer_RegisterTokenRule(lexer, ARC_LexerTokenRule_CreateAndReturnMatchCharRule(1, ':')); ARC_Lexer_RegisterTokenRule(lexer, ARC_LexerTokenRule_CreateAndReturnMatchCharRule(4, ':')); + ARC_CHECK(ARC_Lexer_IsContinious(lexer) == ARC_False); + ARC_CHECK(ARC_Lexer_IsTokenId(lexer, 8) == ARC_True ); ARC_CHECK(ARC_Lexer_IsTokenId(lexer, 7) == ARC_False); ARC_CHECK(ARC_Lexer_IsTokenId(lexer, 2) == ARC_True ); @@ -94,33 +100,3 @@ ARC_TEST(Lexer_Check_Id_Unordered_Not_Continious){ ARC_Lexer_Destroy(lexer); } - -ARC_TEST(Lexer_Check_Continious){ - ARC_Lexer *lexer; - ARC_Lexer_Create(&lexer); - - ARC_Lexer_RegisterTokenRule(lexer, ARC_LexerTokenRule_CreateAndReturnMatchCharRule(2, ':')); - ARC_Lexer_RegisterTokenRule(lexer, ARC_LexerTokenRule_CreateAndReturnMatchCharRule(0, 0 )); - ARC_Lexer_RegisterTokenRule(lexer, ARC_LexerTokenRule_CreateAndReturnMatchCharRule(3, ':')); - ARC_Lexer_RegisterTokenRule(lexer, ARC_LexerTokenRule_CreateAndReturnMatchCharRule(1, ':')); - ARC_Lexer_RegisterTokenRule(lexer, ARC_LexerTokenRule_CreateAndReturnMatchCharRule(4, ':')); - - ARC_CHECK(ARC_Lexer_IsContinious(lexer) == ARC_True); - - ARC_Lexer_Destroy(lexer); -} - -ARC_TEST(Lexer_Check_Not_Continious){ - ARC_Lexer *lexer; - ARC_Lexer_Create(&lexer); - - ARC_Lexer_RegisterTokenRule(lexer, ARC_LexerTokenRule_CreateAndReturnMatchCharRule(2, ':')); - ARC_Lexer_RegisterTokenRule(lexer, ARC_LexerTokenRule_CreateAndReturnMatchCharRule(8, 0 )); - ARC_Lexer_RegisterTokenRule(lexer, ARC_LexerTokenRule_CreateAndReturnMatchCharRule(3, ':')); - ARC_Lexer_RegisterTokenRule(lexer, ARC_LexerTokenRule_CreateAndReturnMatchCharRule(1, ':')); - ARC_Lexer_RegisterTokenRule(lexer, ARC_LexerTokenRule_CreateAndReturnMatchCharRule(4, ':')); - - ARC_CHECK(ARC_Lexer_IsContinious(lexer) == ARC_False); - - 
ARC_Lexer_Destroy(lexer); -} diff --git a/tests/std/parser.c b/tests/std/parser.c index 96c7034..60cf72a 100644 --- a/tests/std/parser.c +++ b/tests/std/parser.c @@ -90,10 +90,10 @@ ARC_TEST(Parser_Basic_Parse){ /* ~ second test ~ */ ARC_String_CreateWithStrlen(&tempString, "z1xwvq"); + //this destroys string, so no need for cleanup ARC_Parser_Parse(parser, &tempString); - ARC_CHECK(arc_errno == 0);