#ifndef ARC_STD_LEXER_H_ #define ARC_STD_LEXER_H_ #ifdef __cplusplus extern "C" { #endif #include "arc/std/string.h" #include /** * @brief a lexer type */ typedef struct ARC_Lexer ARC_Lexer; /** * @brief a lexer token type */ typedef struct ARC_LexerToken { uint32_t rule; ARC_String *data; } ARC_LexerToken; /** * @brief checks to see if a string is a type of token * * @note do not set tokenData if this function returns 0, doing so will create a memory leak * * @param[out] tokenData a place to store token data (like a variable name), can be NULL if not needed * @param[in] string a string to be checked to see if it matches a token * @param[in] automataData any data that needs to be used for the ARC_Lexer_AutomataFn * * @return the size of the token found, or 0 if the token was not found */ typedef uint32_t (* ARC_LexerTokenRule_AutomataFn)(ARC_String **tokenData, ARC_String *string, void *automataData); /** * @brief a callback function to clean up ARC_LexerTokenRule's automataData * * @param automataData the void * automataData to destroy */ typedef void (* ARC_LexerTokenRule_DestroyAutomataDataFn)(void *automataData); /** * @brief a lexer token rule type */ typedef struct ARC_LexerTokenRule { uint32_t id; void *automataData; ARC_LexerTokenRule_AutomataFn automataFn; ARC_LexerTokenRule_DestroyAutomataDataFn destroyAutomataDataFn; } ARC_LexerTokenRule; /** * @brief creates an ARC_Lexer type * * @param[out] lexer ARC_Lexer to create */ void ARC_Lexer_Create(ARC_Lexer **lexer); /** * @brief destroys an ARC_Lexer type * * @param[in] lexer ARC_Lexer to free */ void ARC_Lexer_Destroy(ARC_Lexer *lexer); /** * @brief adds a token rule to a lexer * * @param [in] lexer the lexer to add a token rule to * @param [in] tokenRule the token rule to add */ void ARC_Lexer_RegisterTokenRule(ARC_Lexer *lexer, ARC_LexerTokenRule tokenRule); /** * @brief clears all tokens from a lexer (will not clear token rules) * * @param lexer the lexer to clear tokens from */ void ARC_Lexer_Clear(ARC_Lexer *lexer); /** * @brief creates tokens using a given string with ARC_LexerToken rules * * @param[in] lexer the lexer to get the ARC_LexerTokens from * @param[in/out] data the string to lex, will be freed and set to NULL by the end of this function */ void ARC_Lexer_LexString(ARC_Lexer *lexer, ARC_String **data); /** * @brief reads in and lexs a file * * @note this function will call ARC_Lexer_LexString, so it's notes are applicable to this function * * @param[in] lexer the lexer which holds to rules to use * @param[in] path path of file to read in and lex */ void ARC_Lexer_LexFile(ARC_Lexer *lexer, ARC_String *path); /** * @brief prints rule id and hex of the function name * * @note this is mostly used for debugging * * @param[in] lexer the lexer to print rules from */ void ARC_Lexer_PrintTokenRules(ARC_Lexer *lexer); /** * @brief gets a token at a given index from a lexer * * @note unless you have a very good reason, you probably don't want to mess with the tokens string. * that will probably change the token's string inside the lexer * * @param[in] lexer the lexer to get the token from * @param[in] index the index of the token in the lexer to get * * @return a copy of the token, or a token with max value for rule and NULL for data on error */ ARC_LexerToken ARC_Lexer_GetToken(ARC_Lexer *lexer, uint32_t index); /** * @brief gets a token at a given index from a lexer * * @param[in] lexer the lexer to get the tokens size from * * @return the size of the token array in a lexer */ uint32_t ARC_Lexer_GetTokensSize(ARC_Lexer *lexer); /** * @brief returns a boolean based on if a given id is a stored token rule id * * @param[in] lexer the lexer to check stored token rule ids * @param[in] id the id to check against the token rules * * @return ARC_True if the id is a rule id, ARC_False otherwise */ ARC_Bool ARC_Lexer_IsTokenId(ARC_Lexer *lexer, uint32_t id); /** * @brief checks if the first character of string matches the automataData cast as a char * * @note this is intended as a helper callback * @note this function is a ARC_Lexer_AutomataFn callback * * @param[out] tokenData a place to store token data (like a variable name), can be NULL if not needed * @param[in] string a string to be checked to see if it matches a token * @param[in] automataData any data that needs to be used for the ARC_Lexer_AutomataFn * * @return the size of the token found, or 0 if the token was not found */ uint32_t ARC_Lexer_AutomataMatchCharFn(ARC_String **tokenData, ARC_String *string, void *automataData); /** * @brief checks if the substring automataData as an ARC_String matches the first part of string * * @note this is intended as a helper callback * @note this function is a ARC_Lexer_AutomataFn callback * * @param[out] tokenData a place to store token data (like a variable name), can be NULL if not needed * @param[in] string a string to be checked to see if it matches a token * @param[in] automataData any data that needs to be used for the ARC_Lexer_AutomataFn * * @return the size of the token found, or 0 if the token was not found */ uint32_t ARC_Lexer_AutomataMatchStringFn(ARC_String **tokenData, ARC_String *string, void *automataData); /** * @brief checks if the first part of string is a character in substring * * @note this is intended as a helper callback * @note this function is a ARC_Lexer_AutomataFn callback * * @param[out] tokenData a place to store token data (like a variable name), can be NULL if not needed * @param[in] string a string to be checked to see if it matches a token * @param[in] automataData any data that needs to be used for the ARC_Lexer_AutomataFn * * @return the size of the token found, or 0 if the token was not found */ uint32_t ARC_Lexer_AutomataMatchCharInStringFn(ARC_String **tokenData, ARC_String *string, void *automataData); /** * @brief creates a ARC_LexerTokenRule with a given id and character * * @note this is intended as a helper funtion * * @param[in] id a tokens id (basically the token value) * @param[in] character the character to match against * * @return a token rule based in the id and character */ ARC_LexerTokenRule ARC_LexerTokenRule_CreateAndReturnMatchCharRule(uint32_t id, char character); /** * @brief creates a ARC_LexerTokenRule with a given id and string * * @note this is intended as a helper funtion * #note string will not be freed (it will be copied and the copy will be freed) * * @param[in] id a tokens id (basically the token value) * @param[in] character the string to match against, will be copied * * @return a token rule based in the id and string */ ARC_LexerTokenRule ARC_LexerTokenRule_CreateAndReturnMatchStringRule(uint32_t id, ARC_String *string); /** * @brief creates a ARC_LexerTokenRule with a given id and string * * @note this is intended as a helper funtion * #note string will not be freed (it will be copied and the copy will be freed) * * @param[in] id a tokens id (basically the token value) * @param[in] character the string to match against, will be copied * * @return a token rule based in the id and string */ ARC_LexerTokenRule ARC_LexerTokenRule_CreateAndReturnMatchCharInStringRule(uint32_t id, ARC_String *string); /** * @brief basic tokens */ #define ARC_LEXER_TOKEN_NULL 0 #define ARC_LEXER_TOKEN_NUMBER 1 #define ARC_LEXER_TOKEN_ALPHALOWERCHAR 2 #define ARC_LEXER_TOKEN_ALPHAUPPERCHAR 3 #define ARC_LEXER_TOKEN_WHITESPACE 4 /** * @brief basic token type ids, chars, and tags */ #define ARC_LEXER_TOKEN_NEWLINE_ID 5 #define ARC_LEXER_TOKEN_NEWLINE_CHAR '\n' #define ARC_LEXER_TOKEN_COLON_ID 6 #define ARC_LEXER_TOKEN_COLON_CHAR ':' #define ARC_LEXER_TOKEN_COLON_TAG "COLON" #define ARC_LEXER_TOKEN_SEMICOLON_ID 7 #define ARC_LEXER_TOKEN_SEMICOLON_CHAR ';' #define ARC_LEXER_TOKEN_SEMICOLON_TAG "SEMICOLON" #define ARC_LEXER_TOKEN_COMMA_ID 8 #define ARC_LEXER_TOKEN_COMMA_CHAR ',' #define ARC_LEXER_TOKEN_COMMA_TAG "COMMA" #define ARC_LEXER_TOKEN_PERIOD_ID 9 #define ARC_LEXER_TOKEN_PERIOD_CHAR '.' #define ARC_LEXER_TOKEN_PERIOD_TAG "PERIOD" #define ARC_LEXER_TOKEN_FORWARD_SLASH_ID 10 #define ARC_LEXER_TOKEN_FORWARD_SLASH_CHAR '/' #define ARC_LEXER_TOKEN_FORWARD_SLASH_TAG "FORWARD_SLASH" #define ARC_LEXER_TOKEN_BACK_SLASH_ID 11 #define ARC_LEXER_TOKEN_BACK_SLASH_CHAR '\\' #define ARC_LEXER_TOKEN_BACK_SLASH_TAG "BACK_SLASH" #define ARC_LEXER_TOKEN_LEFT_PARENTHESIS_ID 12 #define ARC_LEXER_TOKEN_LEFT_PARENTHESIS_CHAR '(' #define ARC_LEXER_TOKEN_LEFT_PARENTHESIS_TAG "LEFT_PARENTHESIS" #define ARC_LEXER_TOKEN_RIGHT_PARENTHESIS_ID 13 #define ARC_LEXER_TOKEN_RIGHT_PARENTHESIS_CHAR ')' #define ARC_LEXER_TOKEN_RIGHT_PARENTHESIS_TAG "RIGHT_PARENTHESIS" #define ARC_LEXER_TOKEN_LEFT_CURLY_BRACE_ID 14 #define ARC_LEXER_TOKEN_LEFT_CURLY_BRACE_CHAR '{' #define ARC_LEXER_TOKEN_LEFT_CURLY_BRACE_TAG "LEFT_CURLY_BRACE" #define ARC_LEXER_TOKEN_RIGHT_CURLY_BRACE_ID 15 #define ARC_LEXER_TOKEN_RIGHT_CURLY_BRACE_CHAR '}' #define ARC_LEXER_TOKEN_RIGHT_CURLY_BRACE_TAG "RIGHT_CURLY_BRACE" #define ARC_LEXER_TOKEN_BANG_ID 16 #define ARC_LEXER_TOKEN_BANG_CHAR '!' #define ARC_LEXER_TOKEN_BANG_TAG "BANG" #define ARC_LEXER_TOKEN_AT_ID 17 #define ARC_LEXER_TOKEN_AT_CHAR '!' #define ARC_LEXER_TOKEN_AT_TAG "AT" #define ARC_LEXER_TOKEN_HASH_ID 18 #define ARC_LEXER_TOKEN_HASH_CHAR '#' #define ARC_LEXER_TOKEN_HASH_TAG "HASH" #define ARC_LEXER_TOKEN_PERCENT_ID 19 #define ARC_LEXER_TOKEN_PERCENT_CHAR '%' #define ARC_LEXER_TOKEN_PERCENT_TAG "PERCENT" /** * @brief adds a bunch of basic token rules (matching the BasicTokens above) */ void ARC_Lexer_InitBasicTokenRules(ARC_Lexer *lexer); #ifdef __cplusplus } #endif #endif // !ARC_STD_LEXER_H_