older lexer stuff

This commit is contained in:
herbglitch 2024-10-16 17:35:38 -06:00
parent 7bd7cc4aa5
commit 380e74a0e6
3 changed files with 211 additions and 72 deletions

View file

@ -86,53 +86,74 @@ void ARC_Lexer_LexString(ARC_Lexer *lexer, ARC_String **data){
//NOTE(review): lines from before and after this change appear interleaved below (for example both the tokenFound and the tokenLength assignment from automataFn are present), so read this loop with care
//this will run until every token is stripped or there is an error
while(*data != NULL){
ARC_Bool tokenFound = ARC_False;
uint32_t tokenLength = 0;
uint32_t lastTokenLength = 0;
ARC_LexerToken *token = NULL;
for(uint32_t index = 0; index < ARC_Vector_GetSize(lexer->tokenRules); index++){
//check if the token rule is found
ARC_LexerTokenRule *tokenRule = ARC_Vector_Get(lexer->tokenRules, index);
//tokenData should only exist if tokenFound is ARC_True as stated in the header
//set the last token length if the last token had a length
//NOTE(review): lastTokenLength is only refreshed here, at the top of the next rule's iteration, so a match made by the final rule does not appear to reach lastTokenLength before it is used to strip the string below, and a later shorter match overwrites a longer one - TODO confirm
if(tokenLength > 0){
lastTokenLength = tokenLength;
}
//tokenData should only exist if tokenLength is greater than 0 (tokenLength is a length, not an ARC_Bool) as stated in the header
ARC_String *tokenData;
tokenFound = tokenRule->automataFn(data, &tokenData, tokenRule->automataData);
tokenLength = tokenRule->automataFn(&tokenData, *data, tokenRule->automataData);
//check if a token was found, if it wasn't continue. I'm doing this to try to cut down on the amount of indentation
if(tokenFound != ARC_True){
if(tokenLength == 0){
continue;
}
//create the token to add
ARC_LexerToken *token = (ARC_LexerToken *)malloc(sizeof(ARC_LexerToken));
token->rule = tokenRule->id;
token->data = tokenData;
//check to see if we found a better match (a longer token than the last recorded length)
if(tokenLength > lastTokenLength){
//free the current token if it exists
if(token != NULL){
ARC_LexerTokenRule_VectorDestroyDataFn((void *)token);
}
//add to the vector and check for error (I'd be surprised if the error ever happened because that would most likely mean overflow)
ARC_Vector_Add(lexer->tokens, (void *)token);
if(arc_errno){
ARC_DEBUG_LOG_ERROR("ARC_Lexer_LexString(lexer, data), errored when running ARC_Vector_Add(lexer->tokens, token);. check logs for more info");
free(token);
//clean up errored string
ARC_String_Destroy(*data);
*data = NULL;
return;
//create the token to add
//NOTE(review): the malloc result is dereferenced without a NULL check
token = (ARC_LexerToken *)malloc(sizeof(ARC_LexerToken));
token->rule = tokenRule->id;
token->data = tokenData;
}
//the token was added, so break to start checking tokens again
break;
}
//if no token was found, throw an error
if(tokenFound == ARC_False){
if(token == NULL){
arc_errno = ARC_ERRNO_DATA;
ARC_DEBUG_LOG_ERROR_WITH_VARIABLES("ARC_Lexer_LexString(lexer, data), no tokens found with current string: \"%s\"", (*data)->data);
//clean up errored string
ARC_String_Destroy(*data);
*data = NULL;
//TODO: might want to do something with already tokened data
return;
}
//token exists (something must have gone very wrong if it doesn't), so add it and check for overflow (which I'd be surprised if that happens)
ARC_Vector_Add(lexer->tokens, (void *)token);
if(arc_errno){
ARC_DEBUG_LOG_ERROR("ARC_Lexer_LexString(lexer, data), errored when running ARC_Vector_Add(lexer->tokens, token);. check logs for more info");
free(token);
//clean up errored string
ARC_String_Destroy(*data);
*data = NULL;
return;
}
//if the last token consumed the whole remaining string, destroy the string and return
if(lastTokenLength == (*data)->length){
ARC_String_Destroy(*data);
*data = NULL;
return;
}
//strip the string, dropping the first lastTokenLength characters and keeping the rest
ARC_String_ReplaceWithSubstring(data, lastTokenLength, (*data)->length - lastTokenLength);
}
}
@ -177,49 +198,50 @@ uint32_t ARC_Lexer_GetTokensSize(ARC_Lexer *lexer){
return ARC_Vector_GetSize(lexer->tokens);
}
//automata function that matches the first character of string against the char pointed to by automataData
//NOTE(review): both the ARC_Bool signature (which strips the match from string itself) and the uint32_t signature (which only reports the matched length) appear below
//the uint32_t form sets *tokenData to NULL and returns 1 on a match, or 0 when there is no match
ARC_Bool ARC_Lexer_AutomataMatchCharFn(ARC_String **string, ARC_String **tokenData, void *automataData){
uint32_t ARC_Lexer_AutomataMatchCharFn(ARC_String **tokenData, ARC_String *string, void *automataData){
//if there is a match the token will be the same as automataData, so we don't need to store it again
*tokenData = NULL;
//check to see if there is a match with automataData as a char
if((*string)->data[0] == *(char *)automataData){
//to keep from erroring instead of stripping from a 1 character string we can just delete it
if((*string)->length == 1){
ARC_String_Destroy(*string);
*string = NULL;
return ARC_True;
}
//strip the character from the front of the string and return that a match was found
ARC_String_ReplaceWithSubstring(string, 1, (*string)->length - 1);
return ARC_True;
if(string->data[0] == *(char *)automataData){
//return the token was found of length 1
return 1;
}
//no match was found
return ARC_False;
return 0;
}
//automata function that matches the front of string against automataData interpreted as an ARC_String
//NOTE(review): both the ARC_Bool signature (which strips the match from string itself) and the uint32_t signature (which only reports the matched length) appear below
//the uint32_t form sets *tokenData to NULL and returns automataDataString->length on a match, or 0 when there is no match
ARC_Bool ARC_Lexer_AutomataMatchStringFn(ARC_String **string, ARC_String **tokenData, void *automataData){
uint32_t ARC_Lexer_AutomataMatchStringFn(ARC_String **tokenData, ARC_String *string, void *automataData){
//if there is a match the token will be the same as automataData, so we don't need to store it again
*tokenData = NULL;
//check to see if there is a match with automataData as a string
ARC_String *automataDataString = (ARC_String *)automataData;
//to keep from erroring instead of stripping from a same length string we can just delete it
if(ARC_String_Equals(*string, automataDataString)){
if((*string)->length == automataDataString->length){
ARC_String_Destroy(*string);
*string = NULL;
}
//strip the token string from the front of the string and return that a match was found
ARC_String_ReplaceWithSubstring(string, automataDataString->length, (*string)->length - automataDataString->length);
return ARC_True;
//presumably compares string starting at offset 0 against automataDataString - verify against ARC_String_SubstringEquals
if(ARC_String_SubstringEquals(string, 0, automataDataString)){
//return the token was found of the string length
return automataDataString->length;
}
//no match was found
return ARC_False;
return 0;
}
//automata function that matches the first character of string against any character held in automataData
//
//tokenData    is always set to NULL, this rule never stores token data
//string       the string being checked, only its first character is read
//automataData an ARC_String whose characters are the accepted set
//
//returns 1 (the matched length) if the first character of string is in automataData, otherwise 0
uint32_t ARC_Lexer_AutomataMatchCharInStringFn(ARC_String **tokenData, ARC_String *string, void *automataData){
    //a match is a single character, so there is nothing worth storing as token data
    *tokenData = NULL;

    //a missing or empty string has no first character to read, so it can never match
    if(string == NULL || string->length == 0){
        return 0;
    }

    //check to see if the first character matches any char in automataData as a string
    ARC_String *automataDataString = (ARC_String *)automataData;
    char firstChar = string->data[0];
    for(uint64_t index = 0; index < automataDataString->length; index++){
        if(firstChar == automataDataString->data[index]){
            //return the token was found in the string of length 1
            return 1;
        }
    }

    //no match was found
    return 0;
}
//private function to free automataData stored as a char
@ -266,8 +288,30 @@ ARC_LexerTokenRule ARC_LexerTokenRule_CreateAndReturnMatchStringRule(uint32_t id
ARC_String_Copy(&automataData, string);
tokenRule.automataData = (void *)automataData;
//we can use the ARC_Lexer_AutomataMatchCharFn for this
tokenRule.automataFn = ARC_Lexer_AutomataMatchCharFn;
//we can use the ARC_Lexer_AutomataMatchStringFn for this
tokenRule.automataFn = ARC_Lexer_AutomataMatchStringFn;
//add the private destroy function
tokenRule.destroyAutomataDataFn = ARC_LexerTokenRule_DestroyStringAutomataDataFn;
//return the created tokenRule
return tokenRule;
}
ARC_LexerTokenRule ARC_LexerTokenRule_CreateAndReturnMatchCharInStringRule(uint32_t id, ARC_String *string){
//create the token rule
ARC_LexerTokenRule tokenRule;
//set the id
tokenRule.id = id;
//copy and store the automataData (which is just an ARC_String)
ARC_String *automataData;
ARC_String_Copy(&automataData, string);
tokenRule.automataData = (void *)automataData;
//we can use the ARC_Lexer_AutomataMatchCharInStringFn for this
tokenRule.automataFn = ARC_Lexer_AutomataMatchCharInStringFn;
//add the private destroy function
tokenRule.destroyAutomataDataFn = ARC_LexerTokenRule_DestroyStringAutomataDataFn;