#include "arc/std/lexer.h"

#include "arc/std/bool.h"
#include "arc/std/errno.h"
#include "arc/std/io.h"
#include "arc/std/string.h"
#include "arc/std/vector.h"

#include <inttypes.h>
#include <stdio.h>
#include <stdlib.h>

struct ARC_Lexer {
    ARC_Vector *tokenRules;
    ARC_Vector *tokens;

    //used for checking if a uint32_t is a valid token id: when the registered rule
    //ids form the continuous range [0, tokenRulesMaxVal], ARC_Lexer_IsTokenId can
    //test membership with a single comparison instead of scanning every rule
    ARC_Bool tokenRulesAreContinuous;
    uint32_t tokenRulesMaxVal;
};

//private function for checking if two lexer token rules are the same in a vector (based on id)
ARC_Bool ARC_LexerTokenRule_VectorCompareDataFn(void *dataA, void *dataB){
    ARC_LexerTokenRule *tokenRuleA = (ARC_LexerTokenRule *)dataA;
    ARC_LexerTokenRule *tokenRuleB = (ARC_LexerTokenRule *)dataB;

    return (tokenRuleA->id == tokenRuleB->id) ? ARC_True : ARC_False;
}

//private function for destroying a lexer token rule from a vector; releases the
//rule's automata data through the rule's own destroy callback, then the rule itself
void ARC_LexerTokenRule_VectorDestroyDataFn(void *data){
    ARC_LexerTokenRule *tokenRule = (ARC_LexerTokenRule *)data;

    tokenRule->destroyAutomataDataFn(tokenRule->automataData);
    free(tokenRule);
}

//private function for destroying a lexer token from a vector
//NOTE(review): this frees only the token struct, not token->data -- presumably the
//consumer of the token stream takes ownership of the strings; verify against callers
void ARC_LexerToken_VectorDestroyDataFn(void *data){
    ARC_LexerToken *token = (ARC_LexerToken *)data;
    free(token);
}

//creates a lexer with empty rule and token vectors; caller owns the lexer and must
//release it with ARC_Lexer_Destroy
void ARC_Lexer_Create(ARC_Lexer **lexer){
    //allocate the lexer (NOTE(review): malloc is unchecked, matching the project's
    //existing allocation style -- confirm that is the intended policy)
    *lexer = (ARC_Lexer *)malloc(sizeof(ARC_Lexer));

    //setup token rules vector with compare and destroy functions
    //NOTE(review): the addresses of local function-pointer variables are passed to
    //ARC_Vector_Create; this assumes the vector copies the pointed-to function
    //pointers rather than retaining the (soon dangling) addresses -- verify
    ARC_Vector_CompareDataFn tokenRulesVectorCompareDataFn = ARC_LexerTokenRule_VectorCompareDataFn;
    ARC_Vector_DestroyDataFn tokenRulesVectorDestroyDataFn = ARC_LexerTokenRule_VectorDestroyDataFn;
    ARC_Vector_Create(&(*lexer)->tokenRules, &tokenRulesVectorCompareDataFn, &tokenRulesVectorDestroyDataFn);

    //setup tokens vector with a destroy function only; no compare function because
    //a token's index is used as its id
    ARC_Vector_DestroyDataFn tokenVectorDestroyDataFn = ARC_LexerToken_VectorDestroyDataFn;
    ARC_Vector_Create(&(*lexer)->tokens, NULL, &tokenVectorDestroyDataFn);

    //an empty rule set is trivially continuous
    (*lexer)->tokenRulesAreContinuous = ARC_True;
    (*lexer)->tokenRulesMaxVal        = 0;
}

//destroys the lexer and everything it owns
void ARC_Lexer_Destroy(ARC_Lexer *lexer){
    //free the tokens (the vector's destroy callback frees each token)
    ARC_Vector_Destroy(lexer->tokens);

    //free the token rules (the vector's destroy callback frees each rule)
    ARC_Vector_Destroy(lexer->tokenRules);

    //free the lexer
    free(lexer);
}

//private helper: scan every registered rule and return the smallest id
static uint32_t ARC_Lexer_MinTokenRuleId(ARC_Lexer *lexer){
    uint32_t minValue = ~(uint32_t)0;

    for(uint32_t index = 0; index < ARC_Vector_GetSize(lexer->tokenRules); index++){
        ARC_LexerTokenRule *rule = (ARC_LexerTokenRule *)ARC_Vector_Get(lexer->tokenRules, index);
        if(rule->id < minValue){
            minValue = rule->id;
        }
    }

    return minValue;
}

//registers a copy of tokenRule with the lexer and updates the continuity
//bookkeeping used by ARC_Lexer_IsTokenId
void ARC_Lexer_RegisterTokenRule(ARC_Lexer *lexer, ARC_LexerTokenRule tokenRule){
    //copy the rule into heap storage owned by the tokenRules vector
    ARC_LexerTokenRule *storedTokenRule = (ARC_LexerTokenRule *)malloc(sizeof(ARC_LexerTokenRule));
    *storedTokenRule = tokenRule;

    //add to the vector and check for error (I'd be surprised if the error ever happened because that would most likely mean overflow)
    ARC_Vector_Add(lexer->tokenRules, storedTokenRule);
    if(arc_errno){
        ARC_DEBUG_LOG_ERROR("ARC_Lexer_RegisterTokenRule(lexer, tokenRule), errored when running ARC_Vector_Add(lexer->tokenRules, storedTokenRule);. check logs for more info");
        free(storedTokenRule);
        //BUGFIX: the rule was not stored, so skip the continuity bookkeeping below
        return;
    }

    uint32_t ruleCount = ARC_Vector_GetSize(lexer->tokenRules);

    //first rule: ARC_Lexer_IsTokenId treats a continuous rule set as the id range
    //[0, tokenRulesMaxVal], so a single rule keeps the property only when its id
    //is 0 (BUGFIX: the old code always marked the first rule as non-continuous)
    if(ruleCount == 1){
        lexer->tokenRulesMaxVal        = tokenRule.id;
        lexer->tokenRulesAreContinuous = (tokenRule.id == 0) ? ARC_True : ARC_False;
        return;
    }

    //check if the rule set is still continuous
    if(lexer->tokenRulesAreContinuous == ARC_True){
        //already continuous: the property survives only when the new id extends
        //the range by exactly one
        if(tokenRule.id == lexer->tokenRulesMaxVal + 1){
            lexer->tokenRulesMaxVal = tokenRule.id;
            return;
        }

        //the rules are no longer continuous
        lexer->tokenRulesAreContinuous = ARC_False;
        return;
    }

    //not continuous before: check whether this rule filled the gap
    //TODO: might want to optimize this (O(n^2) walk; fine for small rule sets)
    if(ARC_Lexer_MinTokenRuleId(lexer) != 0){
        //ids must start at 0 for ARC_Lexer_IsTokenId's range check to be valid
        return;
    }

    //walk upwards from 0, confirming each successor id exists
    uint32_t currentValue = 0;
    for(uint32_t foundCount = 1; foundCount < ruleCount; foundCount++){
        ARC_Bool foundSuccessor = ARC_False;

        for(uint32_t index = 0; index < ruleCount; index++){
            ARC_LexerTokenRule *currentTokenRule = (ARC_LexerTokenRule *)ARC_Vector_Get(lexer->tokenRules, index);
            if(currentTokenRule->id == currentValue + 1){
                currentValue   = currentTokenRule->id;
                foundSuccessor = ARC_True;
                break;
            }
        }

        //a gap (or a duplicate id) remains, so the rules stay non-continuous
        if(foundSuccessor == ARC_False){
            return;
        }
    }

    //every id from 0 to currentValue is present
    //(BUGFIX: the old code never restored the continuous flag, so once lost it
    //could never become true again even when the gap was filled)
    lexer->tokenRulesAreContinuous = ARC_True;
    lexer->tokenRulesMaxVal        = currentValue;
}

//discards all tokens produced so far; registered rules are kept
void ARC_Lexer_Clear(ARC_Lexer *lexer){
    //clear the tokens vector
    ARC_Vector_Clear(lexer->tokens);
}

//tokenizes *data, consuming it: on success (and on error) *data is destroyed and
//set to NULL; produced tokens are appended to the lexer's token vector; on failure
//arc_errno is set
void ARC_Lexer_LexString(ARC_Lexer *lexer, ARC_String **data){
    //check if there are any token rules to use
    if(ARC_Vector_GetSize(lexer->tokenRules) == 0){
        arc_errno = ARC_ERRNO_DATA;
        ARC_DEBUG_LOG_ERROR("ARC_Lexer_LexString(lexer, data), no tokens registered to lexer to use");
        return;
    }

    //this will run until the whole string is consumed or there is an error
    while(*data != NULL){
        uint32_t bestTokenLength = 0;
        ARC_LexerToken *token = NULL;

        for(uint32_t index = 0; index < ARC_Vector_GetSize(lexer->tokenRules); index++){
            ARC_LexerTokenRule *tokenRule = ARC_Vector_Get(lexer->tokenRules, index);

            //tokenData should only exist if the automata reports a match, as stated
            //in the header
            ARC_String *tokenData = NULL;
            uint32_t tokenLength = tokenRule->automataFn(&tokenData, *data, tokenRule->automataData);

            //no match from this rule
            if(tokenLength == 0){
                continue;
            }

            //keep only the longest match seen so far (BUGFIX: the old code compared
            //against the *previous* nonzero match instead of the maximum, so a
            //shorter later match could replace a longer earlier one)
            if(tokenLength <= bestTokenLength){
                //BUGFIX: release the match data this automata may have allocated
                if(tokenData != NULL){
                    ARC_String_Destroy(tokenData);
                }
                continue;
            }

            //destroy the previous (shorter) candidate token
            //(BUGFIX: the old code called ARC_LexerTokenRule_VectorDestroyDataFn
            //here, which reinterpreted the token as a rule and invoked a garbage
            //function pointer -- undefined behavior)
            if(token != NULL){
                if(token->data != NULL){
                    ARC_String_Destroy(token->data);
                }
                free(token);
            }

            //create the candidate token to add
            token = (ARC_LexerToken *)malloc(sizeof(ARC_LexerToken));
            token->rule = tokenRule->id;
            token->data = tokenData;

            bestTokenLength = tokenLength;
        }

        //if no token was found, throw an error
        if(token == NULL){
            arc_errno = ARC_ERRNO_DATA;
            ARC_DEBUG_LOG_ERROR_WITH_VARIABLES("ARC_Lexer_LexString(lexer, data), no tokens found with current string: \"%s\"", (*data)->data);

            //clean up errored string
            ARC_String_Destroy(*data);
            *data = NULL;
            return;
        }

        //add the token and check for overflow (which I'd be surprised if that happens)
        ARC_Vector_Add(lexer->tokens, (void *)token);
        if(arc_errno){
            ARC_DEBUG_LOG_ERROR("ARC_Lexer_LexString(lexer, data), errored when running ARC_Vector_Add(lexer->tokens, token);. check logs for more info");

            //BUGFIX: also release the token's match data, not just the token
            if(token->data != NULL){
                ARC_String_Destroy(token->data);
            }
            free(token);

            //clean up errored string
            ARC_String_Destroy(*data);
            *data = NULL;
            return;
        }

        //if the whole remaining string was consumed, destroy it and return
        //(BUGFIX: compare against the matched length, not the stale previous length)
        if(bestTokenLength == (*data)->length){
            ARC_String_Destroy(*data);
            *data = NULL;
            return;
        }

        //strip the consumed token off the front of the string
        ARC_String_ReplaceWithSubstring(data, bestTokenLength, (*data)->length - bestTokenLength);
    }
}

//reads the file at path and tokenizes its contents; on failure arc_errno is set
void ARC_Lexer_LexFile(ARC_Lexer *lexer, ARC_String *path){
    //read file and clean up if it errors
    //(BUGFIX: initialize data -- the old code read it while possibly uninitialized
    //when ARC_IO_FileToStr failed, which is undefined behavior)
    ARC_String *data = NULL;
    ARC_IO_FileToStr(path, &data);
    if(arc_errno){
        ARC_DEBUG_LOG_ERROR("ARC_Lexer_LexFile(lexer, path), errored when running ARC_IO_FileToStr(path, &data);. check logs for more info");

        if(data != NULL){
            ARC_String_Destroy(data);
        }
        return;
    }

    //lex the string and log if there is an error, ARC_Lexer_LexString will clean up the string
    ARC_Lexer_LexString(lexer, &data);
    if(arc_errno){
        ARC_DEBUG_LOG_ERROR("ARC_Lexer_LexFile(lexer, path), errored when running ARC_Lexer_LexString(lexer, data);. check logs for more info");
    }
}

//debug helper: prints every registered rule's id and automata function
void ARC_Lexer_PrintTokenRules(ARC_Lexer *lexer){
    for(uint32_t index = 0; index < ARC_Vector_GetSize(lexer->tokenRules); index++){
        ARC_LexerTokenRule *tokenRule = ARC_Vector_Get(lexer->tokenRules, index);

        //BUGFIX: id is a uint32_t, so use PRIu32 instead of %i; %p requires a
        //void * (casting a function pointer to void * is a common extension)
        printf("Rule: %02" PRIu32 "\tFunction: %p\n", tokenRule->id, (void *)tokenRule->automataFn);
    }
}

//returns a copy of the token at index; on error returns a sentinel token with the
//maximum rule value and a NULL string
ARC_LexerToken ARC_Lexer_GetToken(ARC_Lexer *lexer, uint32_t index){
    //get the token and log if there is an error
    ARC_LexerToken *token = ARC_Vector_Get(lexer->tokens, index);
    if(arc_errno){
        ARC_DEBUG_LOG_ERROR("ARC_Lexer_GetToken(lexer, index), errored when running ARC_Vector_Get(lexer->tokens, index);. check logs for more info");

        //return a token with max rule value, and NULL for the string to signify an error
        return (ARC_LexerToken){ ~(uint32_t)0, NULL };
    }

    //the token was found, so return a copy of it
    return *token;
}

//returns the number of tokens produced so far
uint32_t ARC_Lexer_GetTokensSize(ARC_Lexer *lexer){
    return ARC_Vector_GetSize(lexer->tokens);
}

//returns whether id matches some registered token rule
ARC_Bool ARC_Lexer_IsTokenId(ARC_Lexer *lexer, uint32_t id){
    //if the rules are continuous (ids form the range [0, tokenRulesMaxVal]) a
    //single comparison suffices
    if(lexer->tokenRulesAreContinuous == ARC_True){
        return id <= lexer->tokenRulesMaxVal;
    }

    //the rules are not continuous so we need to check each individually
    for(uint32_t index = 0; index < ARC_Vector_GetSize(lexer->tokenRules); index++){
        ARC_LexerTokenRule *currentTokenRule = (ARC_LexerTokenRule *)ARC_Vector_Get(lexer->tokenRules, index);
        if(currentTokenRule->id == id){
            return ARC_True;
        }
    }

    return ARC_False;
}

//automata: matches the single char pointed to by automataData; on a match the
//token text equals the automata data, so no tokenData is stored
uint32_t ARC_Lexer_AutomataMatchCharFn(ARC_String **tokenData, ARC_String *string, void *automataData){
    *tokenData = NULL;

    //check to see if there is a match with automataData as a char
    if(string->data[0] == *(char *)automataData){
        //the token was found, of length 1
        return 1;
    }

    //no match was found
    return 0;
}

//automata: matches one char in the inclusive range [automataData[0], automataData[1]];
//on a match the matched char is returned through tokenData
uint32_t ARC_Lexer_AutomataMatchCharOrBetweenFn(ARC_String **tokenData, ARC_String *string, void *automataData){
    *tokenData = NULL;

    //check to see if there is a match with automataData as a range of chars
    char *automataDataChars = (char *)automataData;
    if(string->data[0] >= automataDataChars[0] && string->data[0] <= automataDataChars[1]){
        //return the matched char as token data; the token was found, of length 1
        ARC_String_Create(tokenData, string->data, 1);
        return 1;
    }

    //no match was found
    return 0;
}

//automata: matches the exact string held in automataData; on a match the token
//text equals the automata data, so no tokenData is stored
uint32_t ARC_Lexer_AutomataMatchStringFn(ARC_String **tokenData, ARC_String *string, void *automataData){
    *tokenData = NULL;

    //check to see if there is a match with automataData as a string
    ARC_String *automataDataString = (ARC_String *)automataData;
    if(ARC_String_SubstringEquals(string, 0, automataDataString)){
        //the token was found, of the string's length
        //NOTE(review): length is uint64_t; this truncates for patterns longer than
        //UINT32_MAX, which should be unreachable in practice
        return (uint32_t)automataDataString->length;
    }

    //no match was found
    return 0;
}

//automata: matches when the first char of string appears anywhere in the string
//held in automataData; the token text is a known single char, so no tokenData is stored
uint32_t ARC_Lexer_AutomataMatchCharInStringFn(ARC_String **tokenData, ARC_String *string, void *automataData){
    *tokenData = NULL;

    //check to see if there is a char match in automataData as a string
    ARC_String *automataDataString = (ARC_String *)automataData;
    for(uint64_t index = 0; index < automataDataString->length; index++){
        if(string->data[0] == automataDataString->data[index]){
            //the token was found in the string, of length 1
            return 1;
        }
    }

    //no match was found
    return 0;
}
//private function to free automataData stored as a char void ARC_LexerTokenRule_DestroyCharAutomataDataFn(void *automataData){ free((char *)automataData); } ARC_LexerTokenRule ARC_LexerTokenRule_CreateAndReturnMatchCharRule(uint32_t id, char character){ //create the token rule ARC_LexerTokenRule tokenRule; //set the id tokenRule.id = id; //create and store the automataData (which is just a char) char *automataData = (char *)malloc(sizeof(char)); *automataData = character; tokenRule.automataData = (void *)automataData; //we can use the ARC_Lexer_AutomataMatchCharFn for this tokenRule.automataFn = ARC_Lexer_AutomataMatchCharFn; //add the private destroy function tokenRule.destroyAutomataDataFn = ARC_LexerTokenRule_DestroyCharAutomataDataFn; //return the created tokenRule return tokenRule; } ARC_LexerTokenRule ARC_LexerTokenRule_CreateAndReturnMatchCharOrBetween(uint32_t id, char start, char end){ //create the token rule ARC_LexerTokenRule tokenRule; //set the id tokenRule.id = id; //create and store the automataData (which is just two chars (the minumum and manximum)) char *automataData = (char *)malloc(sizeof(char) * 2); automataData[0] = start; automataData[1] = end; tokenRule.automataData = (void *)automataData; //we can use the ARC_Lexer_AutomataMatchCharInStringFn for this tokenRule.automataFn = ARC_Lexer_AutomataMatchCharOrBetweenFn; //add the private destroy function (we can use the char as it destroys a char pointer of any size) tokenRule.destroyAutomataDataFn = ARC_LexerTokenRule_DestroyCharAutomataDataFn; //return the created tokenRule return tokenRule; } //private function to free automataData stored as an ARC_String void ARC_LexerTokenRule_DestroyStringAutomataDataFn(void *automataData){ ARC_String_Destroy((ARC_String *)automataData); } ARC_LexerTokenRule ARC_LexerTokenRule_CreateAndReturnMatchStringRule(uint32_t id, ARC_String *string){ //create the token rule ARC_LexerTokenRule tokenRule; //set the id tokenRule.id = id; //copy and store the 
automataData (which is just an ARC_String) ARC_String *automataData; ARC_String_Copy(&automataData, string); tokenRule.automataData = (void *)automataData; //we can use the ARC_Lexer_AutomataMatchStringFn for this tokenRule.automataFn = ARC_Lexer_AutomataMatchStringFn; //add the private destroy function tokenRule.destroyAutomataDataFn = ARC_LexerTokenRule_DestroyStringAutomataDataFn; //return the created tokenRule return tokenRule; } ARC_LexerTokenRule ARC_LexerTokenRule_CreateAndReturnMatchCharInStringRule(uint32_t id, ARC_String *string){ //create the token rule ARC_LexerTokenRule tokenRule; //set the id tokenRule.id = id; //copy and store the automataData (which is just an ARC_String) ARC_String *automataData; ARC_String_Copy(&automataData, string); tokenRule.automataData = (void *)automataData; //we can use the ARC_Lexer_AutomataMatchCharInStringFn for this tokenRule.automataFn = ARC_Lexer_AutomataMatchCharInStringFn; //add the private destroy function tokenRule.destroyAutomataDataFn = ARC_LexerTokenRule_DestroyStringAutomataDataFn; //return the created tokenRule return tokenRule; } void ARC_Lexer_InitBasicTokenRules(ARC_Lexer *lexer){ //null ARC_Lexer_RegisterTokenRule(lexer, ARC_LexerTokenRule_CreateAndReturnMatchCharRule(ARC_LEXER_TOKEN_NULL, 0)); //number ARC_Lexer_RegisterTokenRule(lexer, ARC_LexerTokenRule_CreateAndReturnMatchCharOrBetween(ARC_LEXER_TOKEN_NUMBER, '0', '9')); //alpha char ARC_Lexer_RegisterTokenRule(lexer, ARC_LexerTokenRule_CreateAndReturnMatchCharOrBetween(ARC_LEXER_TOKEN_ALPHALOWERCHAR, 'a', 'z')); ARC_Lexer_RegisterTokenRule(lexer, ARC_LexerTokenRule_CreateAndReturnMatchCharOrBetween(ARC_LEXER_TOKEN_ALPHAUPPERCHAR, 'A', 'Z')); //whitespace //TODO: fix this ARC_String *whitespaceString; ARC_String_CreateWithStrlen(&whitespaceString, " \t"); ARC_Lexer_RegisterTokenRule(lexer, ARC_LexerTokenRule_CreateAndReturnMatchCharInStringRule(ARC_LEXER_TOKEN_WHITESPACE, whitespaceString)); ARC_String_Destroy(whitespaceString); //TEMP FIX: 
//ARC_Lexer_RegisterTokenRule(lexer, ARC_LexerTokenRule_CreateAndReturnMatchCharRule(ARC_LEXER_TOKEN_WHITESPACE, ' ')); //single char tokens ARC_Lexer_RegisterTokenRule(lexer, ARC_LexerTokenRule_CreateAndReturnMatchCharRule(ARC_LEXER_TOKEN_NEWLINE_ID , ARC_LEXER_TOKEN_NEWLINE_CHAR )); ARC_Lexer_RegisterTokenRule(lexer, ARC_LexerTokenRule_CreateAndReturnMatchCharRule(ARC_LEXER_TOKEN_COLON_ID , ARC_LEXER_TOKEN_COLON_CHAR )); ARC_Lexer_RegisterTokenRule(lexer, ARC_LexerTokenRule_CreateAndReturnMatchCharRule(ARC_LEXER_TOKEN_SEMICOLON_ID , ARC_LEXER_TOKEN_SEMICOLON_CHAR )); ARC_Lexer_RegisterTokenRule(lexer, ARC_LexerTokenRule_CreateAndReturnMatchCharRule(ARC_LEXER_TOKEN_COMMA_ID , ARC_LEXER_TOKEN_COMMA_CHAR )); ARC_Lexer_RegisterTokenRule(lexer, ARC_LexerTokenRule_CreateAndReturnMatchCharRule(ARC_LEXER_TOKEN_PERIOD_ID , ARC_LEXER_TOKEN_PERIOD_CHAR )); ARC_Lexer_RegisterTokenRule(lexer, ARC_LexerTokenRule_CreateAndReturnMatchCharRule(ARC_LEXER_TOKEN_FORWARD_SLASH_ID , ARC_LEXER_TOKEN_FORWARD_SLASH_CHAR )); ARC_Lexer_RegisterTokenRule(lexer, ARC_LexerTokenRule_CreateAndReturnMatchCharRule(ARC_LEXER_TOKEN_BACK_SLASH_ID , ARC_LEXER_TOKEN_BACK_SLASH_CHAR )); ARC_Lexer_RegisterTokenRule(lexer, ARC_LexerTokenRule_CreateAndReturnMatchCharRule(ARC_LEXER_TOKEN_LEFT_PARENTHESIS_ID , ARC_LEXER_TOKEN_LEFT_PARENTHESIS_CHAR )); ARC_Lexer_RegisterTokenRule(lexer, ARC_LexerTokenRule_CreateAndReturnMatchCharRule(ARC_LEXER_TOKEN_RIGHT_PARENTHESIS_ID, ARC_LEXER_TOKEN_RIGHT_PARENTHESIS_CHAR)); ARC_Lexer_RegisterTokenRule(lexer, ARC_LexerTokenRule_CreateAndReturnMatchCharRule(ARC_LEXER_TOKEN_LEFT_CURLY_BRACE_ID , ARC_LEXER_TOKEN_LEFT_CURLY_BRACE_CHAR )); ARC_Lexer_RegisterTokenRule(lexer, ARC_LexerTokenRule_CreateAndReturnMatchCharRule(ARC_LEXER_TOKEN_RIGHT_CURLY_BRACE_ID, ARC_LEXER_TOKEN_RIGHT_CURLY_BRACE_CHAR)); ARC_Lexer_RegisterTokenRule(lexer, ARC_LexerTokenRule_CreateAndReturnMatchCharRule(ARC_LEXER_TOKEN_BANG_ID , ARC_LEXER_TOKEN_BANG_CHAR )); ARC_Lexer_RegisterTokenRule(lexer, 
ARC_LexerTokenRule_CreateAndReturnMatchCharRule(ARC_LEXER_TOKEN_AT_ID , ARC_LEXER_TOKEN_AT_CHAR )); ARC_Lexer_RegisterTokenRule(lexer, ARC_LexerTokenRule_CreateAndReturnMatchCharRule(ARC_LEXER_TOKEN_HASH_ID , ARC_LEXER_TOKEN_HASH_CHAR )); ARC_Lexer_RegisterTokenRule(lexer, ARC_LexerTokenRule_CreateAndReturnMatchCharRule(ARC_LEXER_TOKEN_PERCENT_ID , ARC_LEXER_TOKEN_PERCENT_CHAR )); }