//archeus/src/std/lexer.c

#include "arc/std/lexer.h"
#include "arc/std/bool.h"
#include "arc/std/errno.h"
#include "arc/std/string.h"
#include "arc/std/vector.h"
#include "arc/std/io.h"
#include <stdlib.h>
struct ARC_Lexer {
    //every registered rule, stored as heap allocated ARC_LexerTokenRule pointers
    ARC_Vector *tokenRules;

    //every token produced by lexing, stored as heap allocated ARC_LexerToken pointers
    ARC_Vector *tokens;

    //bookkeeping for checking if a uint32_t is a token rule id: while the rules are flagged as continuous
    //the id only has to be compared against the max token rule value
    ARC_Bool tokenRulesAreContinuous;
    uint32_t tokenRulesMaxVal;
};
//private vector compare callback, two lexer token rules are treated as the same when their ids are equal
ARC_Bool ARC_LexerTokenRule_VectorCompareDataFn(void *dataA, void *dataB){
    const ARC_LexerTokenRule *lhs = (const ARC_LexerTokenRule *)dataA;
    const ARC_LexerTokenRule *rhs = (const ARC_LexerTokenRule *)dataB;

    //only the id takes part in the comparison, the automata members are not looked at
    return (lhs->id != rhs->id) ? ARC_False : ARC_True;
}
//private vector destroy callback, releases a stored lexer token rule together with its automata data
void ARC_LexerTokenRule_VectorDestroyDataFn(void *data){
    ARC_LexerTokenRule *storedRule = (ARC_LexerTokenRule *)data;

    //the rule carries its own cleanup callback for automataData, run that before the rule itself is freed
    void *ruleAutomataData = storedRule->automataData;
    storedRule->destroyAutomataDataFn(ruleAutomataData);

    free(storedRule);
}
//private function for destroying a lexer token from a vector
void ARC_LexerToken_VectorDestroyDataFn(void *data){
ARC_LexerToken *token = (ARC_LexerToken *)data;
free(token);
}
//allocates a lexer with an empty token rules vector and an empty tokens vector, the result is stored in *lexer
void ARC_Lexer_Create(ARC_Lexer **lexer){
    //allocate the lexer, the caller receives the same allocation through *lexer
    ARC_Lexer *newLexer = (ARC_Lexer *)malloc(sizeof(ARC_Lexer));
    *lexer = newLexer;

    //the token rules vector gets a compare callback (rules match by id) and a destroy callback
    ARC_Vector_CompareDataFn ruleCompareFn = ARC_LexerTokenRule_VectorCompareDataFn;
    ARC_Vector_DestroyDataFn ruleDestroyFn = ARC_LexerTokenRule_VectorDestroyDataFn;
    ARC_Vector_Create(&newLexer->tokenRules, &ruleCompareFn, &ruleDestroyFn);

    //the tokens vector only gets a destroy callback, NULL is passed for the compare callback
    //(the original note gives the reason that a token's index is used as its id)
    ARC_Vector_DestroyDataFn tokenDestroyFn = ARC_LexerToken_VectorDestroyDataFn;
    ARC_Vector_Create(&newLexer->tokens, NULL, &tokenDestroyFn);

    //a new lexer starts flagged as continuous with a max token rule value of zero
    newLexer->tokenRulesAreContinuous = ARC_True;
    newLexer->tokenRulesMaxVal = 0;
}
//tears down a lexer: the tokens vector, then the token rules vector, then the lexer allocation itself
void ARC_Lexer_Destroy(ARC_Lexer *lexer){
    ARC_Vector *lexedTokens = lexer->tokens;
    ARC_Vector *registeredRules = lexer->tokenRules;

    //both vectors were created with a destroy callback, so destroying them is expected to free their elements too
    ARC_Vector_Destroy(lexedTokens);
    ARC_Vector_Destroy(registeredRules);

    //nothing else is held by the lexer
    free(lexer);
}
void ARC_Lexer_RegisterTokenRule(ARC_Lexer *lexer, ARC_LexerTokenRule tokenRule){
//copy the token to a new pointer
ARC_LexerTokenRule *storedTokenRule = (ARC_LexerTokenRule *)malloc(sizeof(ARC_LexerTokenRule));
*storedTokenRule = tokenRule;
2024-11-04 19:58:09 -07:00
//TODO: add warning here for if arc_errno is already set
//add to the vector and check for error (I'd be surprised if the error ever happened because that would most likely mean overflow)
ARC_Vector_Add(lexer->tokenRules, storedTokenRule);
if(arc_errno){
ARC_DEBUG_LOG_ERROR("ARC_Lexer_RegisterTokenRule(lexer, tokenRule), errored when running ARC_Vector_Add(lexer->tokenRules, storedTokenRule);. check logs for more info");
free(storedTokenRule);
}
//check if the value still is continuous
if(lexer->tokenRulesAreContinuous == ARC_True){
//if it is already continuous we just check if it is one value above the tokens already in the vector
for(uint32_t tokenRuleIndex = ARC_Vector_GetSize(lexer->tokenRules) - 1; tokenRuleIndex > 0; tokenRuleIndex--){
//get the current token rule
ARC_LexerTokenRule *currentTokenRule = (ARC_LexerTokenRule *)ARC_Vector_Get(lexer->tokenRules, tokenRuleIndex - 1);
//check if the token rule is continuous (then next max value by one)
if(tokenRule.id - currentTokenRule->id == 1){
//the token rule is already continuous so we can update the max value and return
lexer->tokenRulesMaxVal = tokenRule.id;
return;
}
}
//the token is no longer continous
lexer->tokenRulesAreContinuous = ARC_False;
return;
}
//check to see if this value makes the token rule continuous again
//TODO: might want to optomize this
uint32_t minValue = ~(uint32_t)0;
for(uint32_t tokenRuleIndex = 0; tokenRuleIndex < ARC_Vector_GetSize(lexer->tokenRules); tokenRuleIndex++){
//get the current token rule
ARC_LexerTokenRule *currentTokenRule = (ARC_LexerTokenRule *)ARC_Vector_Get(lexer->tokenRules, tokenRuleIndex);
//check each token to find the minimum one
if(currentTokenRule->id < minValue){
minValue = currentTokenRule->id;
}
}
//loop through untill either all the values are checked and in order or the token rule is not continuous
//TODO: might want to optomize this
for(uint32_t foundSize = 0; foundSize != ARC_Vector_GetSize(lexer->tokenRules); foundSize++){
//check all current rules
ARC_Bool currentAreContinuous = ARC_False;
for(uint32_t tokenRuleIndex = 0; tokenRuleIndex < ARC_Vector_GetSize(lexer->tokenRules); tokenRuleIndex++){
//get the current token rule
ARC_LexerTokenRule *currentTokenRule = (ARC_LexerTokenRule *)ARC_Vector_Get(lexer->tokenRules, tokenRuleIndex);
//check if the value is smaller than or equal to the minimum value and if it is we can skip it
if(currentTokenRule->id <= minValue){
continue;
}
//check if the value is continous
if(currentTokenRule->id - minValue == 1){
//set the token rule max val to the next most continuous value
lexer->tokenRulesMaxVal = currentTokenRule->id;
//set the next smallest value to check to the the next most continuous value
minValue = currentTokenRule->id;
currentAreContinuous = ARC_True;
break;
}
}
//the current values are not continuous so we can return as token rules are continuous is already set to false
if(currentAreContinuous == ARC_False){
return;
}
//a continuous value was found so loop to next value
}
}
2024-10-30 18:41:01 -06:00
//drops the lexed tokens, the registered token rules are not touched
void ARC_Lexer_Clear(ARC_Lexer *lexer){
    ARC_Vector *lexedTokens = lexer->tokens;
    ARC_Vector_Clear(lexedTokens);
}
//lexes *data into tokens that are appended to the lexer's tokens vector
//
//every token rule is tried against the front of the string and the longest match wins (on a tie the rule that
//was tried first is kept), the matched part is then stripped and the process repeats until the string is used up
//
//the string is consumed: once lexing has started *data is destroyed and set to NULL on success and on error
//the one exception is when no token rules are registered, then arc_errno is set to ARC_ERRNO_DATA and *data is left as is
//when no rule matches arc_errno is also set to ARC_ERRNO_DATA
void ARC_Lexer_LexString(ARC_Lexer *lexer, ARC_String **data){
    //check if there are any token rules to use
    if(ARC_Vector_GetSize(lexer->tokenRules) == 0){
        arc_errno = ARC_ERRNO_DATA;
        ARC_DEBUG_LOG_ERROR("ARC_Lexer_LexString(lexer, data), no tokens registered to lexer to use");
        return;
    }

    //this will run until every token is stripped or there is an error
    while(*data != NULL){
        uint32_t lastTokenLength = 0;
        ARC_LexerToken *token = NULL;

        for(uint32_t index = 0; index < ARC_Vector_GetSize(lexer->tokenRules); index++){
            ARC_LexerTokenRule *tokenRule = ARC_Vector_Get(lexer->tokenRules, index);

            //tokenData should only exist if a token was found, start from NULL so it is never read uninitialized
            ARC_String *tokenData = NULL;
            uint32_t tokenLength = tokenRule->automataFn(&tokenData, *data, tokenRule->automataData);

            //no token was found by this rule
            if(tokenLength == 0){
                continue;
            }

            //this match is not better than the one already held, so its data (if any) is not needed
            if(tokenLength <= lastTokenLength){
                if(tokenData != NULL){
                    ARC_String_Destroy(tokenData);
                }
                continue;
            }

            //a better match was found, reuse the token allocation and drop the data of the match it replaces
            if(token == NULL){
                token = (ARC_LexerToken *)malloc(sizeof(ARC_LexerToken));
            }
            else if(token->data != NULL){
                ARC_String_Destroy(token->data);
            }

            token->rule = tokenRule->id;
            token->data = tokenData;

            //update the last found tokenLength to the max length
            lastTokenLength = tokenLength;
        }

        //if no token was found, throw an error
        if(token == NULL){
            arc_errno = ARC_ERRNO_DATA;
            ARC_DEBUG_LOG_ERROR_WITH_VARIABLES("ARC_Lexer_LexString(lexer, data), no tokens found with current string: \"%s\"", (*data)->data);

            //clean up errored string
            ARC_String_Destroy(*data);
            *data = NULL;
            return;
        }

        //add the token and check for overflow (which I'd be surprised if that happens)
        ARC_Vector_Add(lexer->tokens, (void *)token);
        if(arc_errno){
            ARC_DEBUG_LOG_ERROR("ARC_Lexer_LexString(lexer, data), errored when running ARC_Vector_Add(lexer->tokens, token);. check logs for more info");

            //the token was not stored, so release it along with its data
            if(token->data != NULL){
                ARC_String_Destroy(token->data);
            }
            free(token);

            //clean up errored string
            ARC_String_Destroy(*data);
            *data = NULL;
            return;
        }

        //if the token used up the rest of the string, destroy the string and return
        //(>= instead of == so a match reported longer than the string cannot underflow the substring length below)
        if(lastTokenLength >= (*data)->length){
            ARC_String_Destroy(*data);
            *data = NULL;
            return;
        }

        //strip the matched token off the front of the string
        ARC_String_ReplaceWithSubstring(data, lastTokenLength, (*data)->length - lastTokenLength);
    }
}
//reads the file at path into a string and lexes it with ARC_Lexer_LexString
//errors from reading or lexing are logged, arc_errno is left as the failing call set it
void ARC_Lexer_LexFile(ARC_Lexer *lexer, ARC_String *path){
    //start from NULL so the cleanup check below never reads an uninitialized pointer
    ARC_String *data = NULL;

    //read file and clean up if it errors
    ARC_IO_FileToStr(path, &data);
    if(arc_errno){
        ARC_DEBUG_LOG_ERROR("ARC_Lexer_LexFile(lexer, path), errored when running ARC_IO_FileToStr(path, &data);. check logs for more info");
        if(data != NULL){
            ARC_String_Destroy(data);
        }
        return;
    }

    //lex the string and log if there is an error
    ARC_Lexer_LexString(lexer, &data);
    if(arc_errno){
        ARC_DEBUG_LOG_ERROR("ARC_Lexer_LexFile(lexer, path), errored when running ARC_Lexer_LexString(lexer, data);. check logs for more info");
    }

    //ARC_Lexer_LexString sets data to NULL once it has consumed the string, but it returns without touching it
    //when no token rules are registered, so release whatever is left
    if(data != NULL){
        ARC_String_Destroy(data);
    }
}
//prints one line per registered token rule to stdout: the rule id and the address of its automata function
void ARC_Lexer_PrintTokenRules(ARC_Lexer *lexer){
    uint32_t tokenRulesSize = ARC_Vector_GetSize(lexer->tokenRules);

    for(uint32_t index = 0; index < tokenRulesSize; index++){
        ARC_LexerTokenRule *tokenRule = ARC_Vector_Get(lexer->tokenRules, index);

        //the id is unsigned so it is printed with %u, and %p expects a void pointer so the function pointer is cast
        //NOTE(review): function pointer to void * is a common extension (POSIX requires it), not strict ISO C
        printf("Rule: %02u\tFunction: %p\n", (unsigned int)tokenRule->id, (void *)tokenRule->automataFn);
    }
}
//returns a copy of the token stored at index
//if arc_errno is set after the lookup, the error is logged and a token with the max rule value and NULL data is returned
ARC_LexerToken ARC_Lexer_GetToken(ARC_Lexer *lexer, uint32_t index){
    ARC_LexerToken *storedToken = ARC_Vector_Get(lexer->tokens, index);

    //the lookup went fine, hand back a copy of the stored token
    if(!arc_errno){
        return *storedToken;
    }

    ARC_DEBUG_LOG_ERROR("ARC_Lexer_GetToken(lexer, index), errored when running ARC_Vector_Get(lexer->tokens, index);. check logs for more info");

    //max rule value and a NULL string signify the error
    ARC_LexerToken errorToken = {
        ~(uint32_t)0,
        NULL
    };
    return errorToken;
}
//returns how many tokens the lexer currently holds, which is the size of its tokens vector
uint32_t ARC_Lexer_GetTokensSize(ARC_Lexer *lexer){
    uint32_t tokensSize = ARC_Vector_GetSize(lexer->tokens);
    return tokensSize;
}
//returns ARC_True when id belongs to a registered token rule, otherwise ARC_False
ARC_Bool ARC_Lexer_IsTokenId(ARC_Lexer *lexer, uint32_t id){
    //a lexer without rules has no valid ids, without this check a new lexer (flagged continuous with a max of 0)
    //would report id 0 as valid
    uint32_t tokenRulesSize = ARC_Vector_GetSize(lexer->tokenRules);
    if(tokenRulesSize == 0){
        return ARC_False;
    }

    //if the rules are continuous we can just check against the max rules value
    //this shortcut treats every id from 0 up to the max as registered
    if(lexer->tokenRulesAreContinuous == ARC_True){
        return (id <= lexer->tokenRulesMaxVal) ? ARC_True : ARC_False;
    }

    //the rules are not continuous so we need to check each individually
    for(uint32_t index = 0; index < tokenRulesSize; index++){
        ARC_LexerTokenRule *currentTokenRule = (ARC_LexerTokenRule *)ARC_Vector_Get(lexer->tokenRules, index);
        if(currentTokenRule->id == id){
            return ARC_True;
        }
    }

    return ARC_False;
}
//automata function that matches the first char of string against a single char
//automataData is read as a pointer to the char to match
//returns 1 (the token length) on a match and 0 when there is no match, *tokenData is always set to NULL
uint32_t ARC_Lexer_AutomataMatchCharFn(ARC_String **tokenData, ARC_String *string, void *automataData){
    //if there is a match the token will be the same as automataData, so we don't need to store it again
    *tokenData = NULL;

    //an empty string has no first char to compare, so nothing can match
    if(string->length == 0){
        return 0;
    }

    //check to see if there is a match with automataData as a char
    if(string->data[0] == *(char *)automataData){
        //the token was found and has a length of 1
        return 1;
    }

    //no match was found
    return 0;
}
//automata function that matches the first char of string against an inclusive range of chars
//automataData is read as two chars: index 0 is the lower bound and index 1 is the upper bound
//returns 1 (the token length) on a match and 0 when there is no match, *tokenData is always set to NULL
uint32_t ARC_Lexer_AutomataMatchCharOrBetweenFn(ARC_String **tokenData, ARC_String *string, void *automataData){
    //no token data is handed back by this function yet (see the TODO below)
    *tokenData = NULL;

    //an empty string has no first char to compare, so nothing can match
    if(string->length == 0){
        return 0;
    }

    //the first char has to sit inside the range (bounds included) to match
    const char *range = (const char *)automataData;
    if(string->data[0] < range[0] || string->data[0] > range[1]){
        //no match was found
        return 0;
    }

    //TODO: fix this, the matched char is meant to be returned as token data
    //ARC_String_Create(tokenData, string->data, 1);
    return 1;
}
//automata function that matches the front of string against a whole string
//automataData is read as the ARC_String to match, compared with ARC_String_SubstringEquals at offset 0
//returns the length of the matched string on a match and 0 when there is no match, *tokenData is always set to NULL
uint32_t ARC_Lexer_AutomataMatchStringFn(ARC_String **tokenData, ARC_String *string, void *automataData){
    //if there is a match the token will be the same as automataData, so we don't need to store it again
    *tokenData = NULL;

    //check to see if there is a match with automataData as a string
    ARC_String *automataDataString = (ARC_String *)automataData;
    if(ARC_String_SubstringEquals(string, 0, automataDataString)){
        //the token was found and is as long as the matched string
        return automataDataString->length;
    }

    //no match was found
    return 0;
}
//automata function that matches the first char of string against any char of a set
//automataData is read as an ARC_String whose chars are the set
//returns 1 (the token length) on a match and 0 when there is no match, *tokenData is always set to NULL
uint32_t ARC_Lexer_AutomataMatchCharInStringFn(ARC_String **tokenData, ARC_String *string, void *automataData){
    //no token data is handed back, the caller only gets the match length
    *tokenData = NULL;

    //an empty string has no first char to compare, so nothing can match
    if(string->length == 0){
        return 0;
    }

    //check to see if the first char matches any char in automataData as a string
    ARC_String *automataDataString = (ARC_String *)automataData;
    for(uint64_t index = 0; index < automataDataString->length; index++){
        if(string->data[0] == automataDataString->data[index]){
            //the token was found and has a length of 1
            return 1;
        }
    }

    //no match was found
    return 0;
}
//private destroy callback for automataData that was allocated as one or more chars
void ARC_LexerTokenRule_DestroyCharAutomataDataFn(void *automataData){
    char *charData = (char *)automataData;
    free(charData);
}
//builds a token rule with the given id that matches a single char
//the rule holds a heap allocated copy of the char and the callback that frees it
ARC_LexerTokenRule ARC_LexerTokenRule_CreateAndReturnMatchCharRule(uint32_t id, char character){
    //the automata data is just the char to match, stored on the heap
    char *matchChar = (char *)malloc(sizeof(char));
    *matchChar = character;

    //fill in the rule: id, data, the single char match function, and the private char destroy function
    ARC_LexerTokenRule rule;
    rule.id = id;
    rule.automataData = (void *)matchChar;
    rule.automataFn = ARC_Lexer_AutomataMatchCharFn;
    rule.destroyAutomataDataFn = ARC_LexerTokenRule_DestroyCharAutomataDataFn;

    return rule;
}
//builds a token rule with the given id that matches one char inside the range start to end
//the rule holds a heap allocated pair of chars (start, end) and the callback that frees it
ARC_LexerTokenRule ARC_LexerTokenRule_CreateAndReturnMatchCharOrBetween(uint32_t id, char start, char end){
    //the automata data is two chars: the range minimum at index 0 and the range maximum at index 1
    char *rangeChars = (char *)malloc(sizeof(char) * 2);
    rangeChars[0] = start;
    rangeChars[1] = end;

    //fill in the rule, ARC_Lexer_AutomataMatchCharOrBetweenFn does the matching
    //the char destroy function is reused as it frees a char pointer of any size
    ARC_LexerTokenRule rule;
    rule.id = id;
    rule.automataData = (void *)rangeChars;
    rule.automataFn = ARC_Lexer_AutomataMatchCharOrBetweenFn;
    rule.destroyAutomataDataFn = ARC_LexerTokenRule_DestroyCharAutomataDataFn;

    return rule;
}
//private function to free automataData stored as an ARC_String
void ARC_LexerTokenRule_DestroyStringAutomataDataFn(void *automataData){
ARC_String_Destroy((ARC_String *)automataData);
}
//builds a token rule with the given id that matches a whole string
//the rule holds its own copy of string (made with ARC_String_Copy) and the callback that destroys the copy
ARC_LexerTokenRule ARC_LexerTokenRule_CreateAndReturnMatchStringRule(uint32_t id, ARC_String *string){
    //create the token rule
    ARC_LexerTokenRule tokenRule;

    //set the id
    tokenRule.id = id;

    //copy and store the automataData (which is just an ARC_String)
    //start from NULL so the rule never carries an uninitialized pointer if the copy does not produce a string
    ARC_String *automataData = NULL;
    ARC_String_Copy(&automataData, string);
    tokenRule.automataData = (void *)automataData;

    //we can use the ARC_Lexer_AutomataMatchStringFn for this
    tokenRule.automataFn = ARC_Lexer_AutomataMatchStringFn;

    //add the private destroy function
    tokenRule.destroyAutomataDataFn = ARC_LexerTokenRule_DestroyStringAutomataDataFn;

    //return the created tokenRule
    return tokenRule;
}
//builds a token rule with the given id that matches one char out of the chars in string
//the rule holds its own copy of string (made with ARC_String_Copy) and the callback that destroys the copy
ARC_LexerTokenRule ARC_LexerTokenRule_CreateAndReturnMatchCharInStringRule(uint32_t id, ARC_String *string){
    //create the token rule
    ARC_LexerTokenRule tokenRule;

    //set the id
    tokenRule.id = id;

    //copy and store the automataData (which is just an ARC_String)
    //start from NULL so the rule never carries an uninitialized pointer if the copy does not produce a string
    ARC_String *automataData = NULL;
    ARC_String_Copy(&automataData, string);
    tokenRule.automataData = (void *)automataData;

    //we can use the ARC_Lexer_AutomataMatchCharInStringFn for this
    tokenRule.automataFn = ARC_Lexer_AutomataMatchCharInStringFn;

    //add the private destroy function
    tokenRule.destroyAutomataDataFn = ARC_LexerTokenRule_DestroyStringAutomataDataFn;

    //return the created tokenRule
    return tokenRule;
}
//registers the basic token rules on the lexer, in this order: null, number, lowercase alpha, uppercase alpha,
//whitespace, and then the single char tokens listed in the table below
void ARC_Lexer_InitBasicTokenRules(ARC_Lexer *lexer){
    //null (the char with value 0)
    ARC_LexerTokenRule nullRule = ARC_LexerTokenRule_CreateAndReturnMatchCharRule(ARC_LEXER_TOKEN_NULL, 0);
    ARC_Lexer_RegisterTokenRule(lexer, nullRule);

    //number, any char from '0' to '9'
    ARC_LexerTokenRule numberRule = ARC_LexerTokenRule_CreateAndReturnMatchCharOrBetween(ARC_LEXER_TOKEN_NUMBER, '0', '9');
    ARC_Lexer_RegisterTokenRule(lexer, numberRule);

    //alpha chars, lowercase then uppercase
    ARC_LexerTokenRule lowerRule = ARC_LexerTokenRule_CreateAndReturnMatchCharOrBetween(ARC_LEXER_TOKEN_ALPHALOWERCHAR, 'a', 'z');
    ARC_Lexer_RegisterTokenRule(lexer, lowerRule);
    ARC_LexerTokenRule upperRule = ARC_LexerTokenRule_CreateAndReturnMatchCharOrBetween(ARC_LEXER_TOKEN_ALPHAUPPERCHAR, 'A', 'Z');
    ARC_Lexer_RegisterTokenRule(lexer, upperRule);

    //whitespace, a space or a tab, the rule copies the string so the local one is destroyed right after
    //TODO: fix this
    ARC_String *whitespaceChars;
    ARC_String_CreateWithStrlen(&whitespaceChars, " \t");
    ARC_LexerTokenRule whitespaceRule = ARC_LexerTokenRule_CreateAndReturnMatchCharInStringRule(ARC_LEXER_TOKEN_WHITESPACE, whitespaceChars);
    ARC_Lexer_RegisterTokenRule(lexer, whitespaceRule);
    ARC_String_Destroy(whitespaceChars);

    //TEMP FIX:
    //ARC_Lexer_RegisterTokenRule(lexer, ARC_LexerTokenRule_CreateAndReturnMatchCharRule(ARC_LEXER_TOKEN_WHITESPACE, ' '));

    //single char tokens, each entry pairs a rule id with the char it matches, registered top to bottom
    const struct {
        uint32_t id;
        char character;
    } singleCharRules[] = {
        { ARC_LEXER_TOKEN_NEWLINE_ID          , ARC_LEXER_TOKEN_NEWLINE_CHAR           },
        { ARC_LEXER_TOKEN_COLON_ID            , ARC_LEXER_TOKEN_COLON_CHAR             },
        { ARC_LEXER_TOKEN_SEMICOLON_ID        , ARC_LEXER_TOKEN_SEMICOLON_CHAR         },
        { ARC_LEXER_TOKEN_COMMA_ID            , ARC_LEXER_TOKEN_COMMA_CHAR             },
        { ARC_LEXER_TOKEN_PERIOD_ID           , ARC_LEXER_TOKEN_PERIOD_CHAR            },
        { ARC_LEXER_TOKEN_FORWARD_SLASH_ID    , ARC_LEXER_TOKEN_FORWARD_SLASH_CHAR     },
        { ARC_LEXER_TOKEN_BACK_SLASH_ID       , ARC_LEXER_TOKEN_BACK_SLASH_CHAR        },
        { ARC_LEXER_TOKEN_LEFT_PARENTHESIS_ID , ARC_LEXER_TOKEN_LEFT_PARENTHESIS_CHAR  },
        { ARC_LEXER_TOKEN_RIGHT_PARENTHESIS_ID, ARC_LEXER_TOKEN_RIGHT_PARENTHESIS_CHAR },
        { ARC_LEXER_TOKEN_LEFT_CURLY_BRACE_ID , ARC_LEXER_TOKEN_LEFT_CURLY_BRACE_CHAR  },
        { ARC_LEXER_TOKEN_RIGHT_CURLY_BRACE_ID, ARC_LEXER_TOKEN_RIGHT_CURLY_BRACE_CHAR },
        { ARC_LEXER_TOKEN_BANG_ID             , ARC_LEXER_TOKEN_BANG_CHAR              },
        { ARC_LEXER_TOKEN_AT_ID               , ARC_LEXER_TOKEN_AT_CHAR                },
        { ARC_LEXER_TOKEN_HASH_ID             , ARC_LEXER_TOKEN_HASH_CHAR              },
        { ARC_LEXER_TOKEN_PERCENT_ID          , ARC_LEXER_TOKEN_PERCENT_CHAR           },
    };
    const uint32_t singleCharRulesSize = (uint32_t)(sizeof(singleCharRules) / sizeof(singleCharRules[0]));

    for(uint32_t index = 0; index < singleCharRulesSize; index++){
        ARC_LexerTokenRule singleCharRule = ARC_LexerTokenRule_CreateAndReturnMatchCharRule(singleCharRules[index].id, singleCharRules[index].character);
        ARC_Lexer_RegisterTokenRule(lexer, singleCharRule);
    }
}