2024-08-27 03:23:29 -06:00
|
|
|
#ifndef ARC_STD_LEXER_H_
|
|
|
|
|
#define ARC_STD_LEXER_H_
|
|
|
|
|
|
|
|
|
|
#ifdef __cplusplus
|
|
|
|
|
extern "C" {
|
|
|
|
|
#endif
|
2024-08-28 20:04:18 -06:00
|
|
|
#include "arc/std/string.h"
|
|
|
|
|
#include <stdint.h>
|
|
|
|
|
|
|
|
|
|
/**
|
2024-08-29 05:04:08 -06:00
|
|
|
* @brief a lexer type
|
|
|
|
|
*/
|
|
|
|
|
typedef struct ARC_Lexer ARC_Lexer;
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
* @brief a lexer token type
|
|
|
|
|
*/
|
|
|
|
|
typedef struct ARC_LexerToken {
|
|
|
|
|
uint32_t rule; /**< NOTE(review): presumably the id of the token rule that matched - confirm; ARC_Lexer_GetToken documents the max value here on error */
|
|
|
|
|
ARC_String *data; /**< token data (like a variable name); ARC_Lexer_GetToken documents NULL here on error */
|
|
|
|
|
} ARC_LexerToken;
|
2024-08-28 20:04:18 -06:00
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
* @brief checks to see if a string is a type of token
|
|
|
|
|
*
|
2024-10-16 17:35:38 -06:00
|
|
|
* @note do not set tokenData if this function returns 0, doing so will create a memory leak
|
2024-08-29 05:04:08 -06:00
|
|
|
*
|
2024-10-16 17:35:38 -06:00
|
|
|
* @param[out] tokenData a place to store token data (like a variable name), can be NULL if not needed
|
|
|
|
|
* @param[in] string a string to be checked to see if it matches a token
|
|
|
|
|
* @param[in]  automataData any data that needs to be used for the ARC_LexerTokenRule_AutomataFn
|
2024-08-28 20:04:18 -06:00
|
|
|
*
|
2024-10-16 17:35:38 -06:00
|
|
|
* @return the size of the token found, or 0 if the token was not found
|
2024-08-28 20:04:18 -06:00
|
|
|
*/
|
2024-10-16 17:35:38 -06:00
|
|
|
typedef uint32_t (* ARC_LexerTokenRule_AutomataFn)(ARC_String **tokenData, ARC_String *string, void *automataData);
|
2024-08-28 20:04:18 -06:00
|
|
|
|
2024-08-27 03:23:29 -06:00
|
|
|
/**
|
2024-08-29 05:04:08 -06:00
|
|
|
* @brief a callback function to clean up ARC_LexerTokenRule's automataData
|
|
|
|
|
*
|
|
|
|
|
* @param automataData the void * automataData to destroy
|
2024-08-27 03:23:29 -06:00
|
|
|
*/
|
2024-08-29 05:04:08 -06:00
|
|
|
typedef void (* ARC_LexerTokenRule_DestroyAutomataDataFn)(void *automataData);
|
|
|
|
|
|
2024-08-28 20:04:18 -06:00
|
|
|
/**
|
2024-08-29 05:04:08 -06:00
|
|
|
* @brief a lexer token rule type
|
2024-08-28 20:04:18 -06:00
|
|
|
*/
|
2024-08-29 05:04:08 -06:00
|
|
|
typedef struct ARC_LexerTokenRule {
|
2024-08-28 20:04:18 -06:00
|
|
|
uint32_t id; /**< the token's id (basically the token value) */
|
|
|
|
|
|
|
|
|
|
void *automataData; /**< any data that needs to be used by automataFn */
|
2024-08-29 05:04:08 -06:00
|
|
|
|
|
|
|
|
ARC_LexerTokenRule_AutomataFn automataFn; /**< callback that checks to see if a string is this type of token */
|
|
|
|
|
ARC_LexerTokenRule_DestroyAutomataDataFn destroyAutomataDataFn; /**< callback to clean up automataData */
|
|
|
|
|
} ARC_LexerTokenRule;
|
2024-08-28 20:04:18 -06:00
|
|
|
|
2024-08-27 03:23:29 -06:00
|
|
|
/**
|
2024-08-29 05:04:08 -06:00
|
|
|
* @brief creates an ARC_Lexer type
|
2024-08-27 03:23:29 -06:00
|
|
|
*
|
2024-10-16 05:14:53 -06:00
|
|
|
* @param[out] lexer ARC_Lexer to create
|
2024-08-27 03:23:29 -06:00
|
|
|
*/
|
|
|
|
|
void ARC_Lexer_Create(ARC_Lexer **lexer);
|
|
|
|
|
|
|
|
|
|
/**
|
2024-08-29 05:04:08 -06:00
|
|
|
* @brief destroys an ARC_Lexer type
|
2024-08-27 03:23:29 -06:00
|
|
|
*
|
|
|
|
|
* @param[in] lexer ARC_Lexer to free
|
|
|
|
|
*/
|
|
|
|
|
void ARC_Lexer_Destroy(ARC_Lexer *lexer);
|
|
|
|
|
|
2024-08-28 20:04:18 -06:00
|
|
|
/**
|
2024-08-29 05:04:08 -06:00
|
|
|
* @brief adds a token rule to a lexer
|
2024-08-28 20:04:18 -06:00
|
|
|
*
|
2024-08-29 05:04:08 -06:00
|
|
|
* @param[in] lexer     the lexer to add a token rule to
|
|
|
|
|
* @param[in] tokenRule the token rule to add
|
2024-08-28 20:04:18 -06:00
|
|
|
*/
|
2024-08-29 05:04:08 -06:00
|
|
|
void ARC_Lexer_RegisterTokenRule(ARC_Lexer *lexer, ARC_LexerTokenRule tokenRule);
|
2024-08-28 20:04:18 -06:00
|
|
|
|
|
|
|
|
/**
|
2024-08-29 05:04:08 -06:00
|
|
|
* @brief creates tokens using a given string with ARC_LexerToken rules
|
2024-08-28 20:04:18 -06:00
|
|
|
*
|
2024-08-29 05:04:08 -06:00
|
|
|
* @param[in] lexer the lexer to get the ARC_LexerTokens from
|
|
|
|
|
* @param[in,out] data the string to lex, will be freed and set to NULL by the end of this function
|
|
|
|
|
*/
|
|
|
|
|
void ARC_Lexer_LexString(ARC_Lexer *lexer, ARC_String **data);
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
* @brief reads in and lexes a file
|
|
|
|
|
*
|
|
|
|
|
* @note this function will call ARC_Lexer_LexString, so its notes are applicable to this function
|
|
|
|
|
*
|
|
|
|
|
* @param[in] lexer the lexer which holds the rules to use
|
|
|
|
|
* @param[in] path path of file to read in and lex
|
2024-08-28 20:04:18 -06:00
|
|
|
*/
|
|
|
|
|
void ARC_Lexer_LexFile(ARC_Lexer *lexer, ARC_String *path);
|
|
|
|
|
|
2024-08-29 05:04:08 -06:00
|
|
|
/**
|
|
|
|
|
* @brief gets a token at a given index from a lexer
|
|
|
|
|
*
|
|
|
|
|
* @note unless you have a very good reason, you probably don't want to mess with the token's string.
|
|
|
|
|
* that will probably change the token's string inside the lexer
|
|
|
|
|
*
|
|
|
|
|
* @param[in] lexer the lexer to get the token from
|
|
|
|
|
* @param[in] index the index of the token in the lexer to get
|
|
|
|
|
*
|
|
|
|
|
* @return a copy of the token, or a token with max value for rule and NULL for data on error
|
|
|
|
|
*/
|
|
|
|
|
ARC_LexerToken ARC_Lexer_GetToken(ARC_Lexer *lexer, uint32_t index);
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
* @brief gets the size of the token array in a lexer
|
|
|
|
|
*
|
|
|
|
|
* @param[in] lexer the lexer to get the tokens size from
|
|
|
|
|
*
|
|
|
|
|
* @return the size of the token array in a lexer
|
|
|
|
|
*/
|
|
|
|
|
uint32_t ARC_Lexer_GetTokensSize(ARC_Lexer *lexer);
|
|
|
|
|
|
2024-10-24 19:56:26 -06:00
|
|
|
/**
|
|
|
|
|
* @brief returns a boolean based on if a given id is a stored token rule id
|
|
|
|
|
*
|
|
|
|
|
* @param[in] lexer the lexer to check stored token rule ids
|
|
|
|
|
* @param[in] id the id to check against the token rules
|
|
|
|
|
*
|
|
|
|
|
* @return ARC_True if the id is a rule id, ARC_False otherwise
|
|
|
|
|
*/
|
|
|
|
|
ARC_Bool ARC_Lexer_IsTokenId(ARC_Lexer *lexer, uint32_t id);
|
|
|
|
|
|
2024-08-28 20:04:18 -06:00
|
|
|
/**
|
|
|
|
|
* @brief checks if the first character of string matches the automataData cast as a char
|
|
|
|
|
*
|
2024-08-29 05:04:08 -06:00
|
|
|
* @note this is intended as a helper callback
|
2024-08-28 20:04:18 -06:00
|
|
|
* @note this function is an ARC_LexerTokenRule_AutomataFn callback
|
|
|
|
|
*
|
2024-10-16 17:35:38 -06:00
|
|
|
* @param[out] tokenData a place to store token data (like a variable name), can be NULL if not needed
|
|
|
|
|
* @param[in] string a string to be checked to see if it matches a token
|
|
|
|
|
* @param[in]  automataData any data that needs to be used for the ARC_LexerTokenRule_AutomataFn
|
2024-08-28 20:04:18 -06:00
|
|
|
*
|
2024-10-16 17:35:38 -06:00
|
|
|
* @return the size of the token found, or 0 if the token was not found
|
2024-08-28 20:04:18 -06:00
|
|
|
*/
|
2024-10-16 17:35:38 -06:00
|
|
|
uint32_t ARC_Lexer_AutomataMatchCharFn(ARC_String **tokenData, ARC_String *string, void *automataData);
|
2024-08-28 20:04:18 -06:00
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
* @brief checks if the substring automataData as an ARC_String matches the first part of string
|
|
|
|
|
*
|
2024-08-29 05:04:08 -06:00
|
|
|
* @note this is intended as a helper callback
|
2024-08-28 20:04:18 -06:00
|
|
|
* @note this function is an ARC_LexerTokenRule_AutomataFn callback
|
|
|
|
|
*
|
2024-10-16 17:35:38 -06:00
|
|
|
* @param[out] tokenData a place to store token data (like a variable name), can be NULL if not needed
|
|
|
|
|
* @param[in] string a string to be checked to see if it matches a token
|
|
|
|
|
* @param[in]  automataData any data that needs to be used for the ARC_LexerTokenRule_AutomataFn
|
2024-08-28 20:04:18 -06:00
|
|
|
*
|
2024-10-16 17:35:38 -06:00
|
|
|
* @return the size of the token found, or 0 if the token was not found
|
2024-08-28 20:04:18 -06:00
|
|
|
*/
|
2024-10-16 17:35:38 -06:00
|
|
|
uint32_t ARC_Lexer_AutomataMatchStringFn(ARC_String **tokenData, ARC_String *string, void *automataData);
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
* @brief checks if the first part of string is a character in substring
|
|
|
|
|
*
|
|
|
|
|
* @note this is intended as a helper callback
|
|
|
|
|
* @note this function is an ARC_LexerTokenRule_AutomataFn callback
|
|
|
|
|
*
|
|
|
|
|
* @param[out] tokenData a place to store token data (like a variable name), can be NULL if not needed
|
|
|
|
|
* @param[in] string a string to be checked to see if it matches a token
|
|
|
|
|
* @param[in]  automataData any data that needs to be used for the ARC_LexerTokenRule_AutomataFn
|
|
|
|
|
*
|
|
|
|
|
* @return the size of the token found, or 0 if the token was not found
|
|
|
|
|
*/
|
|
|
|
|
uint32_t ARC_Lexer_AutomataMatchCharInStringFn(ARC_String **tokenData, ARC_String *string, void *automataData);
|
2024-08-28 20:04:18 -06:00
|
|
|
|
2024-08-29 05:04:08 -06:00
|
|
|
/**
|
|
|
|
|
* @brief creates a ARC_LexerTokenRule with a given id and character
|
|
|
|
|
*
|
|
|
|
|
* @note this is intended as a helper function
|
|
|
|
|
*
|
|
|
|
|
* @param[in] id a tokens id (basically the token value)
|
|
|
|
|
* @param[in] character the character to match against
|
|
|
|
|
*
|
|
|
|
|
* @return a token rule based on the id and character
|
|
|
|
|
*/
|
|
|
|
|
ARC_LexerTokenRule ARC_LexerTokenRule_CreateAndReturnMatchCharRule(uint32_t id, char character);
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
* @brief creates a ARC_LexerTokenRule with a given id and string
|
|
|
|
|
*
|
|
|
|
|
* @note this is intended as a helper function
|
|
|
|
|
* @note string will not be freed (it will be copied and the copy will be freed)
|
|
|
|
|
*
|
|
|
|
|
* @param[in] id a tokens id (basically the token value)
|
|
|
|
|
* @param[in] string the string to match against, will be copied
|
|
|
|
|
*
|
|
|
|
|
* @return a token rule based on the id and string
|
|
|
|
|
*/
|
|
|
|
|
ARC_LexerTokenRule ARC_LexerTokenRule_CreateAndReturnMatchStringRule(uint32_t id, ARC_String *string);
|
|
|
|
|
|
2024-10-16 17:35:38 -06:00
|
|
|
/**
|
|
|
|
|
* @brief creates an ARC_LexerTokenRule with a given id that matches a character found in a given string
|
|
|
|
|
*
|
|
|
|
|
* @note this is intended as a helper function
|
|
|
|
|
* @note string will not be freed (it will be copied and the copy will be freed)
|
|
|
|
|
*
|
|
|
|
|
* @param[in] id a tokens id (basically the token value)
|
|
|
|
|
* @param[in] string the string of characters to match against, will be copied
|
|
|
|
|
*
|
|
|
|
|
* @return a token rule based on the id and string
|
|
|
|
|
*/
|
|
|
|
|
ARC_LexerTokenRule ARC_LexerTokenRule_CreateAndReturnMatchCharInStringRule(uint32_t id, ARC_String *string);
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
* @brief basic tokens
|
|
|
|
|
*/
|
2024-10-30 07:36:43 -06:00
|
|
|
#define ARC_LEXER_TOKEN_NULL 0
|
|
|
|
|
#define ARC_LEXER_TOKEN_NUMBER 1
|
|
|
|
|
#define ARC_LEXER_TOKEN_ALPHALOWERCHAR 2
|
|
|
|
|
#define ARC_LEXER_TOKEN_ALPHAUPPERCHAR 3
|
|
|
|
|
#define ARC_LEXER_TOKEN_WHITESPACE 4
|
2024-10-16 17:35:38 -06:00
|
|
|
|
2024-08-29 05:04:08 -06:00
|
|
|
/**
|
|
|
|
|
* @brief basic token type ids, chars, and tags
|
|
|
|
|
*/
|
2024-10-30 07:36:43 -06:00
|
|
|
#define ARC_LEXER_TOKEN_NEWLINE_ID 5
|
|
|
|
|
#define ARC_LEXER_TOKEN_NEWLINE_CHAR '\n'
|
|
|
|
|
#define ARC_LEXER_TOKEN_COLON_ID 6
|
2024-08-31 06:09:33 -06:00
|
|
|
#define ARC_LEXER_TOKEN_COLON_CHAR ':'
|
|
|
|
|
#define ARC_LEXER_TOKEN_COLON_TAG "COLON"
|
2024-10-30 07:36:43 -06:00
|
|
|
#define ARC_LEXER_TOKEN_SEMICOLON_ID 7
|
2024-08-31 06:09:33 -06:00
|
|
|
#define ARC_LEXER_TOKEN_SEMICOLON_CHAR ';'
|
|
|
|
|
#define ARC_LEXER_TOKEN_SEMICOLON_TAG "SEMICOLON"
|
2024-10-30 07:36:43 -06:00
|
|
|
#define ARC_LEXER_TOKEN_COMMA_ID 8
|
2024-08-31 06:09:33 -06:00
|
|
|
#define ARC_LEXER_TOKEN_COMMA_CHAR ','
|
|
|
|
|
#define ARC_LEXER_TOKEN_COMMA_TAG "COMMA"
|
2024-10-30 07:36:43 -06:00
|
|
|
#define ARC_LEXER_TOKEN_PERIOD_ID 9
|
2024-08-31 06:09:33 -06:00
|
|
|
#define ARC_LEXER_TOKEN_PERIOD_CHAR '.'
|
|
|
|
|
#define ARC_LEXER_TOKEN_PERIOD_TAG "PERIOD"
|
2024-10-30 07:36:43 -06:00
|
|
|
#define ARC_LEXER_TOKEN_FORWARD_SLASH_ID 10
|
2024-08-31 06:09:33 -06:00
|
|
|
#define ARC_LEXER_TOKEN_FORWARD_SLASH_CHAR '/'
|
|
|
|
|
#define ARC_LEXER_TOKEN_FORWARD_SLASH_TAG "FORWARD_SLASH"
|
2024-10-30 07:36:43 -06:00
|
|
|
#define ARC_LEXER_TOKEN_BACK_SLASH_ID 11
|
2024-08-31 06:09:33 -06:00
|
|
|
#define ARC_LEXER_TOKEN_BACK_SLASH_CHAR '\\'
|
|
|
|
|
#define ARC_LEXER_TOKEN_BACK_SLASH_TAG "BACK_SLASH"
|
2024-10-30 07:36:43 -06:00
|
|
|
#define ARC_LEXER_TOKEN_LEFT_PARENTHESIS_ID 12
|
2024-08-31 06:09:33 -06:00
|
|
|
#define ARC_LEXER_TOKEN_LEFT_PARENTHESIS_CHAR '('
|
|
|
|
|
#define ARC_LEXER_TOKEN_LEFT_PARENTHESIS_TAG "LEFT_PARENTHESIS"
|
2024-10-30 07:36:43 -06:00
|
|
|
#define ARC_LEXER_TOKEN_RIGHT_PARENTHESIS_ID 13
|
2024-08-31 06:09:33 -06:00
|
|
|
#define ARC_LEXER_TOKEN_RIGHT_PARENTHESIS_CHAR ')'
|
|
|
|
|
#define ARC_LEXER_TOKEN_RIGHT_PARENTHESIS_TAG "RIGHT_PARENTHESIS"
|
2024-10-30 07:36:43 -06:00
|
|
|
#define ARC_LEXER_TOKEN_LEFT_CURLY_BRACE_ID 14
|
2024-08-31 06:09:33 -06:00
|
|
|
#define ARC_LEXER_TOKEN_LEFT_CURLY_BRACE_CHAR '{'
|
|
|
|
|
#define ARC_LEXER_TOKEN_LEFT_CURLY_BRACE_TAG "LEFT_CURLY_BRACE"
|
2024-10-30 07:36:43 -06:00
|
|
|
#define ARC_LEXER_TOKEN_RIGHT_CURLY_BRACE_ID 15
|
2024-08-31 06:09:33 -06:00
|
|
|
#define ARC_LEXER_TOKEN_RIGHT_CURLY_BRACE_CHAR '}'
|
|
|
|
|
#define ARC_LEXER_TOKEN_RIGHT_CURLY_BRACE_TAG "RIGHT_CURLY_BRACE"
|
2024-10-30 07:36:43 -06:00
|
|
|
#define ARC_LEXER_TOKEN_BANG_ID 16
|
2024-08-31 06:09:33 -06:00
|
|
|
#define ARC_LEXER_TOKEN_BANG_CHAR '!'
|
|
|
|
|
#define ARC_LEXER_TOKEN_BANG_TAG "BANG"
|
2024-10-30 07:36:43 -06:00
|
|
|
#define ARC_LEXER_TOKEN_AT_ID 17
|
2024-08-31 06:09:33 -06:00
|
|
|
/* character for the AT token: '@', distinct from ARC_LEXER_TOKEN_BANG_CHAR ('!') */
#define ARC_LEXER_TOKEN_AT_CHAR '@'
|
|
|
|
|
#define ARC_LEXER_TOKEN_AT_TAG "AT"
|
2024-10-30 07:36:43 -06:00
|
|
|
#define ARC_LEXER_TOKEN_HASH_ID 18
|
2024-08-31 06:09:33 -06:00
|
|
|
#define ARC_LEXER_TOKEN_HASH_CHAR '#'
|
|
|
|
|
#define ARC_LEXER_TOKEN_HASH_TAG "HASH"
|
2024-10-30 07:36:43 -06:00
|
|
|
#define ARC_LEXER_TOKEN_PERCENT_ID 19
|
2024-08-31 06:09:33 -06:00
|
|
|
#define ARC_LEXER_TOKEN_PERCENT_CHAR '%'
|
|
|
|
|
#define ARC_LEXER_TOKEN_PERCENT_TAG "PERCENT"
|
2024-08-29 05:04:08 -06:00
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
* @brief adds a bunch of basic token rules (matching the BasicTokens above)
|
|
|
|
|
*/
|
|
|
|
|
void ARC_Lexer_InitBasicTokenRules(ARC_Lexer *lexer);
|
2024-08-28 20:04:18 -06:00
|
|
|
|
2024-08-27 03:23:29 -06:00
|
|
|
#ifdef __cplusplus
|
|
|
|
|
}
|
|
|
|
|
#endif
|
|
|
|
|
|
|
|
|
|
#endif // !ARC_STD_LEXER_H_
|