diff --git a/include/arc/std/lexer.h b/include/arc/std/lexer.h index c94d768..d5743ec 100644 --- a/include/arc/std/lexer.h +++ b/include/arc/std/lexer.h @@ -5,7 +5,6 @@ extern "C" { #endif -#include "arc/std/bool.h" #include "arc/std/string.h" #include @@ -25,16 +24,15 @@ typedef struct ARC_LexerToken { /** * @brief checks to see if a string is a type of token * - * @note do not set tokenData if this function returns ARC_False, doing so will create a memory leak + * @note do not set tokenData if this function returns 0, doing so will create a memory leak * - * @param[in/out] string a string to be checked to see if it matches a token, - * this needs to srip the token out for the lexer to avoid an infinite loop - * @param[out] tokenData a place to store token data (like a variable name), can be NULL if not needed - * @param[in] automataData any data that needs to be used for the ARC_Lexer_AutomataFn + * @param[out] tokenData a place to store token data (like a variable name), can be NULL if not needed + * @param[in] string a string to be checked to see if it matches a token + * @param[in] automataData any data that needs to be used for the ARC_Lexer_AutomataFn * - * @return if a token was successfully found ARC_True, otherwise ARC_False + * @return the size of the token found, or 0 if the token was not found */ -typedef ARC_Bool (* ARC_LexerTokenRule_AutomataFn)(ARC_String **string, ARC_String **tokenData, void *automataData); +typedef uint32_t (* ARC_LexerTokenRule_AutomataFn)(ARC_String **tokenData, ARC_String *string, void *automataData); /** * @brief a callback function to clean up ARC_LexerTokenRule's automataData @@ -124,14 +122,13 @@ uint32_t ARC_Lexer_GetTokensSize(ARC_Lexer *lexer); * @note this is intended as a helper callback * @note this function is a ARC_Lexer_AutomataFn callback * - * @param[in/out] string a string to be checked to see if it matches a token, - * this needs to srip the token out for the lexer to avoid an infinite loop - * @param[out] 
tokenData a place to store token data (like a variable name), can be NULL if not needed - * @param[in] automataData any data that needs to be used for the ARC_Lexer_AutomataFn + * @param[out] tokenData a place to store token data (like a variable name), can be NULL if not needed + * @param[in] string a string to be checked to see if it matches a token + * @param[in] automataData any data that needs to be used for the ARC_Lexer_AutomataFn * - * @return if a token was successfully found ARC_True, otherwise ARC_False + * @return the size of the token found, or 0 if the token was not found */ -ARC_Bool ARC_Lexer_AutomataMatchCharFn(ARC_String **string, ARC_String **tokenData, void *automataData); +uint32_t ARC_Lexer_AutomataMatchCharFn(ARC_String **tokenData, ARC_String *string, void *automataData); /** * @brief checks if the substring automataData as an ARC_String matches the first part of string @@ -139,14 +136,27 @@ ARC_Bool ARC_Lexer_AutomataMatchCharFn(ARC_String **string, ARC_String **tokenDa * @note this is intended as a helper callback * @note this function is a ARC_Lexer_AutomataFn callback * - * @param[in/out] string a string to be checked to see if it matches a token, - * this needs to srip the token out for the lexer to avoid an infinite loop - * @param[out] tokenData a place to store token data (like a variable name), can be NULL if not needed - * @param[in] automataData any data that needs to be used for the ARC_Lexer_AutomataFn + * @param[out] tokenData a place to store token data (like a variable name), can be NULL if not needed + * @param[in] string a string to be checked to see if it matches a token + * @param[in] automataData any data that needs to be used for the ARC_Lexer_AutomataFn * - * @return if a token was successfully found ARC_True, otherwise ARC_False + * @return the size of the token found, or 0 if the token was not found */ -ARC_Bool ARC_Lexer_AutomataMatchStringFn(ARC_String **string, ARC_String **tokenData, void *automataData); 
+uint32_t ARC_Lexer_AutomataMatchStringFn(ARC_String **tokenData, ARC_String *string, void *automataData); + +/** + * @brief checks if the first character of string is one of the characters in automataData as an ARC_String + * + * @note this is intended as a helper callback + * @note this function is a ARC_Lexer_AutomataFn callback + * + * @param[out] tokenData a place to store token data (like a variable name), can be NULL if not needed + * @param[in] string a string to be checked to see if it matches a token + * @param[in] automataData any data that needs to be used for the ARC_Lexer_AutomataFn + * + * @return the size of the token found, or 0 if the token was not found +*/ +uint32_t ARC_Lexer_AutomataMatchCharInStringFn(ARC_String **tokenData, ARC_String *string, void *automataData); /** * @brief creates a ARC_LexerTokenRule with a given id and character @@ -173,6 +183,28 @@ ARC_LexerTokenRule ARC_LexerTokenRule_CreateAndReturnMatchCharRule(uint32_t id, */ ARC_LexerTokenRule ARC_LexerTokenRule_CreateAndReturnMatchStringRule(uint32_t id, ARC_String *string); +/** + * @brief creates a ARC_LexerTokenRule with a given id that matches any single character in string + * + * @note this is intended as a helper function + * @note string will not be freed (it will be copied and the copy will be freed) + * + * @param[in] id a tokens id (basically the token value) + * @param[in] string the characters to match against, will be copied + * + * @return a token rule based on the id and string +*/ +ARC_LexerTokenRule ARC_LexerTokenRule_CreateAndReturnMatchCharInStringRule(uint32_t id, ARC_String *string); + +/** + * @brief basic tokens +*/ +#define ARC_LEXER_TOKEN_NULL 0 +#define ARC_LEXER_TOKEN_EOF 1 +#define ARC_LEXER_TOKEN_NUMBER 2 +#define ARC_LEXER_TOKEN_ALPHACHAR 3 +#define ARC_LEXER_TOKEN_WHITESPACE 4 + /** * @brief basic token type ids, chars, and tags */ diff --git a/include/arc/std/parser.h b/include/arc/std/parser.h index e69de29..37657bf 100644 --- a/include/arc/std/parser.h +++ b/include/arc/std/parser.h @@ -0,0 +1,63 @@ +#ifndef 
ARC_STD_PARSER_H_ +#define ARC_STD_PARSER_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +#include "arc/std/string.h" + +/** + * @brief a parser type +*/ +typedef struct ARC_Parser ARC_Parser; + +/** + * @brief a parser node +*/ +typedef struct ARC_ParserNode ARC_ParserNode; + +/** + * @brief creates an ARC_Parser type + * + * @param[out] parser ARC_Parser to create + * @param[in] language the language as a string the parser should use, can be NULL +*/ +void ARC_Parser_Create(ARC_Parser **parser, ARC_String *language); + +/** + * @brief destroys an ARC_Parser type + * + * @param[in] parser ARC_Parser to free +*/ +void ARC_Parser_Destroy(ARC_Parser *parser); + +/** + * @brief sets the definition of the parser, the language itself is parsed and will throw an error if invalid + * + * @param[in] parser ARC_Parser to set the language to + * @param[in] language the language as a string the parser should use +*/ +void ARC_Parser_SetLanguage(ARC_Parser *parser, ARC_String *language); + +/** + * @brief parses a string with the parser (TODO confirm against the implementation) + * + * @param[in] parser ARC_Parser to parse with + * @param[in] data the string to parse +*/ +void ARC_Parser_Parse(ARC_Parser *parser, ARC_String *data); + +/** + * @brief parses the file at path with the parser (TODO confirm against the implementation) + * + * @param[in] parser ARC_Parser to parse with + * @param[in] path the path of the file to parse +*/ +void ARC_Parser_ParseFile(ARC_Parser *parser, ARC_String *path); + +#ifdef __cplusplus +} +#endif + +#endif // !ARC_STD_PARSER_H_ diff --git a/src/std/lexer.c b/src/std/lexer.c index 6a513b0..84396a9 100644 --- a/src/std/lexer.c +++ b/src/std/lexer.c @@ -86,53 +86,74 @@ void ARC_Lexer_LexString(ARC_Lexer *lexer, ARC_String **data){ //this will run untill everything token is stripped or there is an error while(*data != NULL){ - ARC_Bool tokenFound = ARC_False; + uint32_t tokenLength = 0; + uint32_t 
lastTokenLength = 0; + ARC_LexerToken *token = NULL; + for(uint32_t index = 0; index < ARC_Vector_GetSize(lexer->tokenRules); index++){ //check if the token rule is found ARC_LexerTokenRule *tokenRule = ARC_Vector_Get(lexer->tokenRules, index); - //tokenData should only exist if tokenFound is ARC_True as stated in the header + //set the last token length if the last token had a length + if(tokenLength > 0){ + lastTokenLength = tokenLength; + } + + //tokenData should only exist if tokenLength is greater than 0 as stated in the header ARC_String *tokenData; - tokenFound = tokenRule->automataFn(data, &tokenData, tokenRule->automataData); + tokenLength = tokenRule->automataFn(&tokenData, *data, tokenRule->automataData); //check if a token was found if it wasn't continue. I'm doing this to try to cut down on the ammount of indentation - if(tokenFound != ARC_True){ + if(tokenLength == 0){ continue; } - //create the token to add - ARC_LexerToken *token = (ARC_LexerToken *)malloc(sizeof(ARC_LexerToken)); - token->rule = tokenRule->id; - token->data = tokenData; + //check to see if we found a better match + if(tokenLength > lastTokenLength){ + //free the current token if it exists + if(token != NULL){ + ARC_LexerTokenRule_VectorDestroyDataFn((void *)token); + } - //add to the vector and check for error (I'd be surprised if the error ever happened because that would most likely mean overflow) - ARC_Vector_Add(lexer->tokens, (void *)token); - if(arc_errno){ - ARC_DEBUG_LOG_ERROR("ARC_Lexer_LexString(lexer, data), errored when running ARC_Vector_Add(lexer->tokens, token);. 
check logs for more info"); - free(token); - - //clean up errored string - ARC_String_Destroy(*data); - *data = NULL; - return; + //create the token to add + token = (ARC_LexerToken *)malloc(sizeof(ARC_LexerToken)); + token->rule = tokenRule->id; + token->data = tokenData; } - - //the token was added, so break to start checking tokens again - break; } //if no token was found, throw an error - if(tokenFound == ARC_False){ + if(token == NULL){ arc_errno = ARC_ERRNO_DATA; ARC_DEBUG_LOG_ERROR_WITH_VARIABLES("ARC_Lexer_LexString(lexer, data), no tokens found with current string: \"%s\"", (*data)->data); //clean up errored string ARC_String_Destroy(*data); *data = NULL; - - //TODO: might want to do smthn with already tokened data return; } + + //token exists (something must have gone very wrong if it doesn't), so add it and check for overflow (which I'd be surprised if that happens) + ARC_Vector_Add(lexer->tokens, (void *)token); + if(arc_errno){ + ARC_DEBUG_LOG_ERROR("ARC_Lexer_LexString(lexer, data), errored when running ARC_Vector_Add(lexer->tokens, token);. 
check logs for more info"); + free(token); + + //clean up errored string + ARC_String_Destroy(*data); + *data = NULL; + return; + } + + //if the last token was found, destroy the string and return + if(lastTokenLength == (*data)->length){ + ARC_String_Destroy(*data); + *data = NULL; + return; + } + + //strip the string + ARC_String_ReplaceWithSubstring(data, lastTokenLength, (*data)->length - lastTokenLength); } } @@ -177,49 +198,50 @@ uint32_t ARC_Lexer_GetTokensSize(ARC_Lexer *lexer){ return ARC_Vector_GetSize(lexer->tokens); } -ARC_Bool ARC_Lexer_AutomataMatchCharFn(ARC_String **string, ARC_String **tokenData, void *automataData){ +uint32_t ARC_Lexer_AutomataMatchCharFn(ARC_String **tokenData, ARC_String *string, void *automataData){ //if there is a match the token will be the same as automataData, so we don't need to store it again *tokenData = NULL; //check to see if there is a match with automataData as a char - if((*string)->data[0] == *(char *)automataData){ - //to keep from erroring instead of stripping from a 1 character string we can just delete it - if((*string)->length == 1){ - ARC_String_Destroy(*string); - *string = NULL; - return ARC_True; - } - - //strip the charater from the front of the string and return that a match was found - ARC_String_ReplaceWithSubstring(string, 1, (*string)->length - 1); - return ARC_True; + if(string->data[0] == *(char *)automataData){ + //return the token was found of length 1 + return 1; } //no match was found - return ARC_False; + return 0; } -ARC_Bool ARC_Lexer_AutomataMatchStringFn(ARC_String **string, ARC_String **tokenData, void *automataData){ +uint32_t ARC_Lexer_AutomataMatchStringFn(ARC_String **tokenData, ARC_String *string, void *automataData){ //if there is a match the token will be the same as automataData, so we don't need to store it again *tokenData = NULL; //check to see if there is a match with automataData as a string ARC_String *automataDataString = (ARC_String *)automataData; - - //to keep from 
erroring instead of stripping from a same length string we can just delete it - if(ARC_String_Equals(*string, automataDataString)){ - if((*string)->length == automataDataString->length){ - ARC_String_Destroy(*string); - *string = NULL; - } - - //strip the token string from the front of the string and return that a match was found - ARC_String_ReplaceWithSubstring(string, automataDataString->length, (*string)->length - automataDataString->length); - return ARC_True; + if(ARC_String_SubstringEquals(string, 0, automataDataString)){ + //return the token was found of the string length + return automataDataString->length; } //no match was found - return ARC_False; + return 0; +} + +uint32_t ARC_Lexer_AutomataMatchCharInStringFn(ARC_String **tokenData, ARC_String *string, void *automataData){ + //if there is a match the token will be the same as automataData, so we don't need to store it again + *tokenData = NULL; + + //check to see if there is a char match in automataData as a string + ARC_String *automataDataString = (ARC_String *)automataData; + for(uint64_t index = 0; index < automataDataString->length; index++){ + if(string->data[0] == automataDataString->data[index]){ + //return the token was found in the string of length 1 + return 1; + } + } + + //no match was found + return 0; } //private function to free automataData stored as a char @@ -266,8 +288,30 @@ ARC_LexerTokenRule ARC_LexerTokenRule_CreateAndReturnMatchStringRule(uint32_t id ARC_String_Copy(&automataData, string); tokenRule.automataData = (void *)automataData; - //we can use the ARC_Lexer_AutomataMatchCharFn for this - tokenRule.automataFn = ARC_Lexer_AutomataMatchCharFn; + //we can use the ARC_Lexer_AutomataMatchStringFn for this + tokenRule.automataFn = ARC_Lexer_AutomataMatchStringFn; + + //add the private destroy function + tokenRule.destroyAutomataDataFn = ARC_LexerTokenRule_DestroyStringAutomataDataFn; + + //return the created tokenRule + return tokenRule; +} + +ARC_LexerTokenRule 
ARC_LexerTokenRule_CreateAndReturnMatchCharInStringRule(uint32_t id, ARC_String *string){ + //create the token rule + ARC_LexerTokenRule tokenRule; + + //set the id + tokenRule.id = id; + + //copy and store the automataData (which is just an ARC_String) + ARC_String *automataData; + ARC_String_Copy(&automataData, string); + tokenRule.automataData = (void *)automataData; + + //we can use the ARC_Lexer_AutomataMatchCharInStringFn for this + tokenRule.automataFn = ARC_Lexer_AutomataMatchCharInStringFn; //add the private destroy function tokenRule.destroyAutomataDataFn = ARC_LexerTokenRule_DestroyStringAutomataDataFn;