older lexer stuff

This commit is contained in:
parent 7bd7cc4aa5
commit 380e74a0e6

3 changed files with 211 additions and 72 deletions
@@ -5,7 +5,6 @@
 extern "C" {
 #endif
 
-#include "arc/std/bool.h"
 #include "arc/std/string.h"
 #include <stdint.h>
 
@@ -25,16 +24,15 @@ typedef struct ARC_LexerToken {
 /**
  * @brief checks to see if a string is a type of token
  *
- * @note do not set tokenData if this function returns ARC_False, doing so will create a memory leak
+ * @note do not set tokenData if this function returns 0, doing so will create a memory leak
  *
- * @param[in/out] string a string to be checked to see if it matches a token,
- *                this needs to srip the token out for the lexer to avoid an infinite loop
- * @param[out] tokenData a place to store token data (like a variable name), can be NULL if not needed
- * @param[in] automataData any data that needs to be used for the ARC_Lexer_AutomataFn
+ * @param[out] tokenData a place to store token data (like a variable name), can be NULL if not needed
+ * @param[in] string a string to be checked to see if it matches a token
+ * @param[in] automataData any data that needs to be used for the ARC_Lexer_AutomataFn
  *
- * @return if a token was successfully found ARC_True, otherwise ARC_False
+ * @return the size of the token found, or 0 if the token was not found
  */
-typedef ARC_Bool (* ARC_LexerTokenRule_AutomataFn)(ARC_String **string, ARC_String **tokenData, void *automataData);
+typedef uint32_t (* ARC_LexerTokenRule_AutomataFn)(ARC_String **tokenData, ARC_String *string, void *automataData);
 
 /**
  * @brief a callback function to clean up ARC_LexerTokenRule's automataData
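The typedef change above is the core of the rework: a rule callback now just reports how many characters at the front of the input it would consume (0 meaning no match) instead of returning an ARC_Bool and stripping the token out of the string itself. A minimal sketch of a custom callback under the new contract, shown for illustration only; the digit rule below is not part of this commit, only the typedef and the ARC_String data/length fields come from the diff:

#include "arc/std/string.h"
#include <stdint.h>

//illustrative rule: reports the length of a leading run of ASCII digits, 0 if there is none
static uint32_t ExampleMatchDigitsFn(ARC_String **tokenData, ARC_String *string, void *automataData){
    (void)automataData; //this rule needs no extra data

    uint32_t length = 0;
    while(length < string->length && string->data[length] >= '0' && string->data[length] <= '9'){
        length++;
    }

    //mirror the built-in callbacks: no extra token data is stored and the input is left untouched
    *tokenData = NULL;
    return length;
}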
@@ -124,14 +122,13 @@ uint32_t ARC_Lexer_GetTokensSize(ARC_Lexer *lexer);
  * @note this is intended as a helper callback
  * @note this function is a ARC_Lexer_AutomataFn callback
  *
- * @param[in/out] string a string to be checked to see if it matches a token,
- *                this needs to srip the token out for the lexer to avoid an infinite loop
- * @param[out] tokenData a place to store token data (like a variable name), can be NULL if not needed
- * @param[in] automataData any data that needs to be used for the ARC_Lexer_AutomataFn
+ * @param[out] tokenData a place to store token data (like a variable name), can be NULL if not needed
+ * @param[in] string a string to be checked to see if it matches a token
+ * @param[in] automataData any data that needs to be used for the ARC_Lexer_AutomataFn
  *
- * @return if a token was successfully found ARC_True, otherwise ARC_False
+ * @return the size of the token found, or 0 if the token was not found
  */
-ARC_Bool ARC_Lexer_AutomataMatchCharFn(ARC_String **string, ARC_String **tokenData, void *automataData);
+uint32_t ARC_Lexer_AutomataMatchCharFn(ARC_String **tokenData, ARC_String *string, void *automataData);
 
 /**
  * @brief checks if the substring automataData as an ARC_String matches the first part of string
@@ -139,14 +136,27 @@ ARC_Bool ARC_Lexer_AutomataMatchCharFn(ARC_String **string, ARC_String **tokenDa
  * @note this is intended as a helper callback
  * @note this function is a ARC_Lexer_AutomataFn callback
  *
- * @param[in/out] string a string to be checked to see if it matches a token,
- *                this needs to srip the token out for the lexer to avoid an infinite loop
- * @param[out] tokenData a place to store token data (like a variable name), can be NULL if not needed
- * @param[in] automataData any data that needs to be used for the ARC_Lexer_AutomataFn
+ * @param[out] tokenData a place to store token data (like a variable name), can be NULL if not needed
+ * @param[in] string a string to be checked to see if it matches a token
+ * @param[in] automataData any data that needs to be used for the ARC_Lexer_AutomataFn
  *
- * @return if a token was successfully found ARC_True, otherwise ARC_False
+ * @return the size of the token found, or 0 if the token was not found
  */
-ARC_Bool ARC_Lexer_AutomataMatchStringFn(ARC_String **string, ARC_String **tokenData, void *automataData);
+uint32_t ARC_Lexer_AutomataMatchStringFn(ARC_String **tokenData, ARC_String *string, void *automataData);
 
+/**
+ * @brief checks if the first part of string is a character in substring
+ *
+ * @note this is intended as a helper callback
+ * @note this function is a ARC_Lexer_AutomataFn callback
+ *
+ * @param[out] tokenData a place to store token data (like a variable name), can be NULL if not needed
+ * @param[in] string a string to be checked to see if it matches a token
+ * @param[in] automataData any data that needs to be used for the ARC_Lexer_AutomataFn
+ *
+ * @return the size of the token found, or 0 if the token was not found
+ */
+uint32_t ARC_Lexer_AutomataMatchCharInStringFn(ARC_String **tokenData, ARC_String *string, void *automataData);
+
 /**
  * @brief creates a ARC_LexerTokenRule with a given id and character
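The new ARC_Lexer_AutomataMatchCharInStringFn gives character-class rules (digits, whitespace, identifier characters) without writing a custom callback: it reports a 1-character token whenever the first character of the input appears anywhere in the rule's set. A usage sketch; the include path and the ARC_String_CreateWithStrlen constructor are assumptions, since neither appears in this diff:

#include "arc/std/lexer.h"  //assumed include path for this header
#include "arc/std/string.h"
#include <stdint.h>
#include <stdio.h>

void ExampleMatchCharInString(void){
    //build the character set and an input string; the constructor name is an assumption
    ARC_String *whitespace = NULL;
    ARC_String *input = NULL;
    ARC_String_CreateWithStrlen(&whitespace, " \t\n");
    ARC_String_CreateWithStrlen(&input, " \tif(x)");

    //the callback only reports a length, it no longer strips the input itself
    ARC_String *tokenData = NULL;
    uint32_t length = ARC_Lexer_AutomataMatchCharInStringFn(&tokenData, input, (void *)whitespace);
    printf("matched %u character(s)\n", (unsigned)length); //prints 1: only the head of the input is tested

    ARC_String_Destroy(whitespace);
    ARC_String_Destroy(input);
}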
@@ -173,6 +183,28 @@ ARC_LexerTokenRule ARC_LexerTokenRule_CreateAndReturnMatchCharRule(uint32_t id,
  */
 ARC_LexerTokenRule ARC_LexerTokenRule_CreateAndReturnMatchStringRule(uint32_t id, ARC_String *string);
 
+/**
+ * @brief creates a ARC_LexerTokenRule with a given id and string
+ *
+ * @note this is intended as a helper funtion
+ * #note string will not be freed (it will be copied and the copy will be freed)
+ *
+ * @param[in] id a tokens id (basically the token value)
+ * @param[in] character the string to match against, will be copied
+ *
+ * @return a token rule based in the id and string
+ */
+ARC_LexerTokenRule ARC_LexerTokenRule_CreateAndReturnMatchCharInStringRule(uint32_t id, ARC_String *string);
+
+/**
+ * @brief basic tokens
+ */
+#define ARC_LEXER_TOKEN_NULL 0
+#define ARC_LEXER_TOKEN_EOF 1
+#define ARC_LEXER_TOKEN_NUMBER 2
+#define ARC_LEXER_TOKEN_ALPHACHAR 3
+#define ARC_LEXER_TOKEN_WHITESPACE 4
+
 /**
  * @brief basic token type ids, chars, and tags
  */
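With the new token ids and the CreateAndReturn helpers, a basic rule set becomes a handful of constructor calls. A sketch of how the shipped ids could be wired up; the string constructor and ARC_Lexer_AddTokenRule are placeholder names, since rule registration is not part of this diff:

#include "arc/std/lexer.h"  //assumed include path for this header
#include "arc/std/string.h"

void ExampleRegisterBasicRules(ARC_Lexer *lexer){
    //character-class rules for numbers and whitespace; the constructor name is an assumption
    ARC_String *digits = NULL;
    ARC_String *spaces = NULL;
    ARC_String_CreateWithStrlen(&digits, "0123456789");
    ARC_String_CreateWithStrlen(&spaces, " \t\r\n");

    ARC_LexerTokenRule numberRule     = ARC_LexerTokenRule_CreateAndReturnMatchCharInStringRule(ARC_LEXER_TOKEN_NUMBER, digits);
    ARC_LexerTokenRule whitespaceRule = ARC_LexerTokenRule_CreateAndReturnMatchCharInStringRule(ARC_LEXER_TOKEN_WHITESPACE, spaces);

    //the rules copy their strings, so the originals can be released right away
    ARC_String_Destroy(digits);
    ARC_String_Destroy(spaces);

    //ARC_Lexer_AddTokenRule is a placeholder: the registration call is not shown in this commit
    ARC_Lexer_AddTokenRule(lexer, &numberRule);
    ARC_Lexer_AddTokenRule(lexer, &whitespaceRule);
}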
@@ -0,0 +1,63 @@
+#ifndef ARC_STD_PARSER_H_
+#define ARC_STD_PARSER_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "arc/std/string.h"
+
+/**
+ * @brief a parser type
+ */
+typedef struct ARC_Parser ARC_Parser;
+
+/**
+ * @brief a parser node
+ */
+typedef struct ARC_ParserNode ARC_ParserNode;
+
+/**
+ * @brief creates an ARC_Parser type
+ *
+ * @param[out] parser
+ * @param[in] language ..., can be NULL
+ */
+void ARC_Parser_Create(ARC_Parser **parser, ARC_String *language);
+
+/**
+ * @brief destroys an ARC_Parser type
+ *
+ * @param[in] parser ARC_Parser to free
+ */
+void ARC_Parser_Destroy(ARC_Parser *parser);
+
+/**
+ * @brief sets the definition of the parser, the language itself is parsed and will throw an error if invalid
+ *
+ * @param[in] parser ARC_Parser to set the language to
+ * @param[in] language the language as a string the parser should use
+ */
+void ARC_Parser_SetLanguage(ARC_Parser *parser, ARC_String *language);
+
+/**
+ * @brief sets the definition of the parser, the language itself is parsed and will throw an error if invalid
+ *
+ * @param[in] parser ARC_Parser to set the language to
+ * @param[in] language the language as a string the parser should use
+ */
+void ARC_Parser_Parse(ARC_Parser *parser, ARC_String *data);
+
+/**
+ * @brief sets the definition of the parser, the language itself is parsed and will throw an error if invalid
+ *
+ * @param[in] parser ARC_Parser to set the language to
+ * @param[in] language the language as a string the parser should use
+ */
+void ARC_Parser_ParseFile(ARC_Parser *parser, ARC_String *path);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif // !ARC_STD_LEXER_H_
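The new parser header is declarations only, but the intended call sequence is already visible: create, set a language, parse a string or a file, destroy. A usage sketch; the include path is assumed and nothing behind these declarations exists yet in this commit:

#include "arc/std/parser.h"  //assumed include path for the new header
#include "arc/std/string.h"

void ExampleParserUsage(ARC_String *language, ARC_String *source){
    //create the parser; the language can also be passed to ARC_Parser_Create directly
    ARC_Parser *parser = NULL;
    ARC_Parser_Create(&parser, NULL);
    ARC_Parser_SetLanguage(parser, language);

    //parse an in-memory string (ARC_Parser_ParseFile would take a path instead)
    ARC_Parser_Parse(parser, source);

    ARC_Parser_Destroy(parser);
}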
148 src/std/lexer.c
@@ -86,53 +86,74 @@ void ARC_Lexer_LexString(ARC_Lexer *lexer, ARC_String **data){
 
     //this will run untill everything token is stripped or there is an error
     while(*data != NULL){
-        ARC_Bool tokenFound = ARC_False;
+        uint32_t tokenLength = 0;
+        uint32_t lastTokenLength = 0;
+        ARC_LexerToken *token = NULL;
 
         for(uint32_t index = 0; index < ARC_Vector_GetSize(lexer->tokenRules); index++){
             //check if the token rule is found
             ARC_LexerTokenRule *tokenRule = ARC_Vector_Get(lexer->tokenRules, index);
 
-            //tokenData should only exist if tokenFound is ARC_True as stated in the header
+            //set the last token length if the last token had a length
+            if(tokenLength > 0){
+                lastTokenLength = tokenLength;
+            }
+
+            //tokenData should only exist if tokenLength is ARC_True as stated in the header
             ARC_String *tokenData;
-            tokenFound = tokenRule->automataFn(data, &tokenData, tokenRule->automataData);
+            tokenLength = tokenRule->automataFn(&tokenData, *data, tokenRule->automataData);
 
             //check if a token was found if it wasn't continue. I'm doing this to try to cut down on the ammount of indentation
-            if(tokenFound != ARC_True){
+            if(tokenLength == 0){
                 continue;
             }
 
-            //create the token to add
-            ARC_LexerToken *token = (ARC_LexerToken *)malloc(sizeof(ARC_LexerToken));
-            token->rule = tokenRule->id;
-            token->data = tokenData;
-
-            //add to the vector and check for error (I'd be surprised if the error ever happened because that would most likely mean overflow)
-            ARC_Vector_Add(lexer->tokens, (void *)token);
-            if(arc_errno){
-                ARC_DEBUG_LOG_ERROR("ARC_Lexer_LexString(lexer, data), errored when running ARC_Vector_Add(lexer->tokens, token);. check logs for more info");
-                free(token);
-
-                //clean up errored string
-                ARC_String_Destroy(*data);
-                *data = NULL;
-                return;
-            }
-
-            //the token was added, so break to start checking tokens again
-            break;
+            //check to see if we found a better match
+            if(tokenLength > lastTokenLength){
+                //free the current token if it exists
+                if(token != NULL){
+                    ARC_LexerTokenRule_VectorDestroyDataFn((void *)token);
+                }
+
+                //create the token to add
+                token = (ARC_LexerToken *)malloc(sizeof(ARC_LexerToken));
+                token->rule = tokenRule->id;
+                token->data = tokenData;
+            }
         }
 
         //if no token was found, throw an error
-        if(tokenFound == ARC_False){
+        if(token == NULL){
             arc_errno = ARC_ERRNO_DATA;
             ARC_DEBUG_LOG_ERROR_WITH_VARIABLES("ARC_Lexer_LexString(lexer, data), no tokens found with current string: \"%s\"", (*data)->data);
 
             //clean up errored string
             ARC_String_Destroy(*data);
             *data = NULL;
 
-            //TODO: might want to do smthn with already tokened data
             return;
         }
+
+        //token exists (something must have gone very wrong if it doesn't), so add it and check for overflow (which I'd be surprised if that happens)
+        ARC_Vector_Add(lexer->tokens, (void *)token);
+        if(arc_errno){
+            ARC_DEBUG_LOG_ERROR("ARC_Lexer_LexString(lexer, data), errored when running ARC_Vector_Add(lexer->tokens, token);. check logs for more info");
+            free(token);
+
+            //clean up errored string
+            ARC_String_Destroy(*data);
+            *data = NULL;
+            return;
+        }
+
+        //if the last token was found, destroy the string and return
+        if(lastTokenLength == (*data)->length){
+            ARC_String_Destroy(*data);
+            *data = NULL;
+            return;
+        }
+
+        //strip the string
+        ARC_String_ReplaceWithSubstring(data, lastTokenLength, (*data)->length - lastTokenLength);
     }
 }
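The rewritten loop asks every rule for a length instead of stopping at the first rule that matches, keeps the longest reported token, and only then strips the input (maximal munch, so a rule for "==" can win over a rule for "="). The selection idea in isolation, as a plain-C illustration that deliberately avoids the ARC types and is not the library's code:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

//stand-in for a string rule: length of the prefix if it matches the start of the input, 0 otherwise
static uint32_t MatchPrefix(const char *input, const char *prefix){
    size_t prefixLength = strlen(prefix);
    return strncmp(input, prefix, prefixLength) == 0 ? (uint32_t)prefixLength : 0;
}

int main(void){
    const char *input = "==3";
    const char *rules[] = { "=", "==" };

    //every rule runs; the longest non-zero match decides how much input to consume
    uint32_t bestLength = 0;
    for(uint32_t index = 0; index < 2; index++){
        uint32_t tokenLength = MatchPrefix(input, rules[index]);
        if(tokenLength > bestLength){
            bestLength = tokenLength;
        }
    }

    printf("consume %u character(s)\n", (unsigned)bestLength); //prints 2, so "==" beats "="
    return 0;
}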
@@ -177,49 +198,50 @@ uint32_t ARC_Lexer_GetTokensSize(ARC_Lexer *lexer){
     return ARC_Vector_GetSize(lexer->tokens);
 }
 
-ARC_Bool ARC_Lexer_AutomataMatchCharFn(ARC_String **string, ARC_String **tokenData, void *automataData){
+uint32_t ARC_Lexer_AutomataMatchCharFn(ARC_String **tokenData, ARC_String *string, void *automataData){
     //if there is a match the token will be the same as automataData, so we don't need to store it again
     *tokenData = NULL;
 
     //check to see if there is a match with automataData as a char
-    if((*string)->data[0] == *(char *)automataData){
-        //to keep from erroring instead of stripping from a 1 character string we can just delete it
-        if((*string)->length == 1){
-            ARC_String_Destroy(*string);
-            *string = NULL;
-            return ARC_True;
-        }
-
-        //strip the charater from the front of the string and return that a match was found
-        ARC_String_ReplaceWithSubstring(string, 1, (*string)->length - 1);
-        return ARC_True;
+    if(string->data[0] == *(char *)automataData){
+        //return the token was found of length 1
+        return 1;
     }
 
     //no match was found
-    return ARC_False;
+    return 0;
 }
 
-ARC_Bool ARC_Lexer_AutomataMatchStringFn(ARC_String **string, ARC_String **tokenData, void *automataData){
+uint32_t ARC_Lexer_AutomataMatchStringFn(ARC_String **tokenData, ARC_String *string, void *automataData){
     //if there is a match the token will be the same as automataData, so we don't need to store it again
     *tokenData = NULL;
 
     //check to see if there is a match with automataData as a string
     ARC_String *automataDataString = (ARC_String *)automataData;
-
-    //to keep from erroring instead of stripping from a same length string we can just delete it
-    if(ARC_String_Equals(*string, automataDataString)){
-        if((*string)->length == automataDataString->length){
-            ARC_String_Destroy(*string);
-            *string = NULL;
-        }
-
-        //strip the token string from the front of the string and return that a match was found
-        ARC_String_ReplaceWithSubstring(string, automataDataString->length, (*string)->length - automataDataString->length);
-        return ARC_True;
+    if(ARC_String_SubstringEquals(string, 0, automataDataString)){
+        //return the token was found of the string length
+        return automataDataString->length;
     }
 
     //no match was found
-    return ARC_False;
+    return 0;
+}
+
+uint32_t ARC_Lexer_AutomataMatchCharInStringFn(ARC_String **tokenData, ARC_String *string, void *automataData){
+    //if there is a match the token will be the same as automataData, so we don't need to store it again
+    *tokenData = NULL;
+
+    //check to see if there is a char match in automataData as a string
+    ARC_String *automataDataString = (ARC_String *)automataData;
+    for(uint64_t index = 0; index < automataDataString->length; index++){
+        if(string->data[0] == automataDataString->data[index]){
+            //return the token was found in the string of length 1
+            return 1;
+        }
+    }
+
+    //no match was found
+    return 0;
 }
 
 //private function to free automataData stored as a char
@@ -266,8 +288,30 @@ ARC_LexerTokenRule ARC_LexerTokenRule_CreateAndReturnMatchStringRule(uint32_t id
     ARC_String_Copy(&automataData, string);
     tokenRule.automataData = (void *)automataData;
 
-    //we can use the ARC_Lexer_AutomataMatchCharFn for this
-    tokenRule.automataFn = ARC_Lexer_AutomataMatchCharFn;
+    //we can use the ARC_Lexer_AutomataMatchStringFn for this
+    tokenRule.automataFn = ARC_Lexer_AutomataMatchStringFn;
+
+    //add the private destroy function
+    tokenRule.destroyAutomataDataFn = ARC_LexerTokenRule_DestroyStringAutomataDataFn;
+
+    //return the created tokenRule
+    return tokenRule;
+}
+
+ARC_LexerTokenRule ARC_LexerTokenRule_CreateAndReturnMatchCharInStringRule(uint32_t id, ARC_String *string){
+    //create the token rule
+    ARC_LexerTokenRule tokenRule;
+
+    //set the id
+    tokenRule.id = id;
+
+    //copy and store the automataData (which is just an ARC_String)
+    ARC_String *automataData;
+    ARC_String_Copy(&automataData, string);
+    tokenRule.automataData = (void *)automataData;
+
+    //we can use the ARC_Lexer_AutomataMatchCharInStringFn for this
+    tokenRule.automataFn = ARC_Lexer_AutomataMatchCharInStringFn;
 
     //add the private destroy function
     tokenRule.destroyAutomataDataFn = ARC_LexerTokenRule_DestroyStringAutomataDataFn;
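Because ARC_LexerTokenRule keeps its callback and data in plain struct fields, a rule built by the new helper can be probed directly before it is handed to a lexer. A small sketch; the include path and the string constructor are assumptions, and freeing the rule's copied data is left to its destroyAutomataDataFn, whose exact signature is not shown in this diff:

#include "arc/std/lexer.h"  //assumed include path for this header
#include "arc/std/string.h"
#include <stdint.h>
#include <stdio.h>

void ExampleProbeRule(ARC_String *input){
    //a whitespace character class; the constructor name is an assumption
    ARC_String *spaces = NULL;
    ARC_String_CreateWithStrlen(&spaces, " \t\r\n");

    ARC_LexerTokenRule rule = ARC_LexerTokenRule_CreateAndReturnMatchCharInStringRule(ARC_LEXER_TOKEN_WHITESPACE, spaces);
    ARC_String_Destroy(spaces); //the rule owns its own copy of the class string

    //drive the rule's callback by hand, the same way ARC_Lexer_LexString does
    ARC_String *tokenData = NULL;
    uint32_t length = rule.automataFn(&tokenData, input, rule.automataData);
    printf("rule %u matched %u character(s)\n", (unsigned)rule.id, (unsigned)length);

    //cleanup of rule.automataData is the job of rule.destroyAutomataDataFn when the rule is torn down
}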