2024-08-28 20:04:18 -06:00
# include "arc/std/lexer.h"
# include "arc/std/bool.h"
2024-08-29 05:04:08 -06:00
# include "arc/std/errno.h"
2024-08-28 20:04:18 -06:00
# include "arc/std/string.h"
# include "arc/std/vector.h"
# include "arc/std/io.h"
# include <stdlib.h>
struct ARC_Lexer {
2024-08-29 05:04:08 -06:00
ARC_Vector * tokenRules ;
2024-08-28 20:04:18 -06:00
ARC_Vector * tokens ;
} ;
2024-08-29 05:04:08 -06:00
//private function for checking if two lexer token rules are the same in a vector (based on id)
ARC_Bool ARC_LexerTokenRule_VectorCompareDataFn ( void * dataA , void * dataB ) {
ARC_LexerTokenRule * tokenRuleA = ( ARC_LexerTokenRule * ) dataA ;
ARC_LexerTokenRule * tokenRuleB = ( ARC_LexerTokenRule * ) dataB ;
if ( tokenRuleA - > id = = tokenRuleB - > id ) {
return ARC_True ;
}
return ARC_False ;
}
//private function for destroying a lexer token rule from a vector
void ARC_LexerTokenRule_VectorDestroyDataFn ( void * data ) {
ARC_LexerTokenRule * tokenRule = ( ARC_LexerTokenRule * ) data ;
tokenRule - > destroyAutomataDataFn ( tokenRule - > automataData ) ;
free ( tokenRule ) ;
}
//private destroy callback for the tokens vector, releases one stored ARC_LexerToken
//NOTE(review): only the token struct is freed, token->data is not released here - confirm who owns that string
void ARC_LexerToken_VectorDestroyDataFn(void *data) {
    free((ARC_LexerToken *)data);
}
2024-08-28 20:04:18 -06:00
//allocates a lexer into *lexer and creates its two (empty) vectors
void ARC_Lexer_Create(ARC_Lexer **lexer) {
    //callbacks handed to the vectors, they are passed by address as ARC_Vector_Create expects pointers to them
    ARC_Vector_CompareDataFn ruleCompareFn = ARC_LexerTokenRule_VectorCompareDataFn;
    ARC_Vector_DestroyDataFn ruleDestroyFn = ARC_LexerTokenRule_VectorDestroyDataFn;
    ARC_Vector_DestroyDataFn tokenDestroyFn = ARC_LexerToken_VectorDestroyDataFn;

    //create the lexer
    ARC_Lexer *created = (ARC_Lexer *)malloc(sizeof(ARC_Lexer));
    *lexer = created;

    //the token rules vector gets both a compare (by rule id) and a destroy function
    ARC_Vector_Create(&(created->tokenRules), &ruleCompareFn, &ruleDestroyFn);

    //the tokens vector only gets a destroy function, no compare function is wanted because a token's index is used as its id
    ARC_Vector_Create(&(created->tokens), NULL, &tokenDestroyFn);
}
//destroys both vectors held by the lexer and then frees the lexer itself
void ARC_Lexer_Destroy(ARC_Lexer *lexer) {
    //the tokens vector was created with a destroy function, so destroying the vector releases every stored token
    ARC_Vector_Destroy(lexer->tokens);

    //same for the token rules vector, its destroy function releases every stored rule
    ARC_Vector_Destroy(lexer->tokenRules);

    //nothing else is held by the lexer, so it can go now
    free(lexer);
}
2024-08-29 05:04:08 -06:00
//stores a heap allocated copy of tokenRule in the lexer's token rules vector
//if arc_errno is set after adding to the vector, the error is logged and the copy is freed again
void ARC_Lexer_RegisterTokenRule(ARC_Lexer *lexer, ARC_LexerTokenRule tokenRule) {
    //the vector stores pointers, so the by-value rule needs a home on the heap
    ARC_LexerTokenRule *heapRule = (ARC_LexerTokenRule *)malloc(sizeof(ARC_LexerTokenRule));
    *heapRule = tokenRule;

    //hand the copy to the vector, on success there is nothing left to do
    ARC_Vector_Add(lexer->tokenRules, heapRule);
    if (!arc_errno) {
        return;
    }

    //the add failed (most likely an overflow), log it and drop the copy so it does not leak
    ARC_DEBUG_LOG_ERROR(" ARC_Lexer_RegisterTokenRule(lexer, tokenRule), errored when running ARC_Vector_Add(lexer->tokenRules, storedTokenRule);. check logs for more info ");
    free(heapRule);
}
//lexes the string in *data into tokens which are appended to the lexer's tokens vector
//
//the registered rules are tried in vector index order against the current string, the first rule whose automataFn returns
//ARC_True produces the next token. this repeats until *data is NULL (the automata functions in this file destroy the
//string and set *data to NULL once they have consumed all of it) or until an error happens
//
//errors:
//  - no rules registered: arc_errno is set to ARC_ERRNO_DATA and *data is left untouched
//  - no rule matches the remaining string: arc_errno is set to ARC_ERRNO_DATA, the string is destroyed and *data is set to NULL
//  - adding a token to the vector fails: the string is destroyed and *data is set to NULL
void ARC_Lexer_LexString(ARC_Lexer *lexer, ARC_String **data) {
    //check if there are any token rules to use
    uint32_t ruleCount = ARC_Vector_GetSize(lexer->tokenRules);
    if (ruleCount == 0) {
        arc_errno = ARC_ERRNO_DATA;
        ARC_DEBUG_LOG_ERROR(" ARC_Lexer_LexString(lexer, data), no tokens registered to lexer to use ");
        return;
    }

    //this will run until every token is stripped or there is an error
    //data itself never changes in here, a fully consumed string is signaled through *data being NULL, so that is what
    //has to be checked (checking only data would hand a NULL string to the next automataFn)
    while (data != NULL && *data != NULL) {
        ARC_Bool tokenFound = ARC_False;

        for (uint32_t index = 0; index < ruleCount; index++) {
            ARC_LexerTokenRule *tokenRule = ARC_Vector_Get(lexer->tokenRules, index);

            //tokenData is filled in by the automataFn, start from NULL so it is never read uninitialized
            ARC_String *tokenData = NULL;
            tokenFound = tokenRule->automataFn(data, &tokenData, tokenRule->automataData);

            //this rule did not match, move on to the next one
            if (tokenFound != ARC_True) {
                continue;
            }

            //create the token to add
            ARC_LexerToken *token = (ARC_LexerToken *)malloc(sizeof(ARC_LexerToken));
            token->rule = tokenRule->id;
            token->data = tokenData;

            //add to the vector and check for error (most likely this would mean overflow)
            ARC_Vector_Add(lexer->tokens, token);
            if (arc_errno) {
                ARC_DEBUG_LOG_ERROR(" ARC_Lexer_LexString(lexer, data), errored when running ARC_Vector_Add(lexer->tokens, token);. check logs for more info ");
                free(token);

                //clean up errored string, the match may have consumed the whole string already so it can be NULL here
                if (*data != NULL) {
                    ARC_String_Destroy(*data);
                }
                *data = NULL;
                return;
            }

            //the token was added, so break to start checking rules again from the first one
            break;
        }

        //if no token was found, throw an error
        if (tokenFound == ARC_False) {
            arc_errno = ARC_ERRNO_DATA;
            ARC_DEBUG_LOG_ERROR_WITH_VARIABLES(" ARC_Lexer_LexString(lexer, data), no tokens found with current string: \" %s \" ", (*data)->data);

            //clean up errored string
            ARC_String_Destroy(*data);
            *data = NULL;

            //TODO: might want to do smthn with already tokened data
            return;
        }
    }
}
//reads the file at path into a string and lexes it with ARC_Lexer_LexString
//errors from reading or lexing are logged, arc_errno is left as set by the failing call
void ARC_Lexer_LexFile(ARC_Lexer *lexer, ARC_String *path) {
    //start from NULL so the cleanup checks below never read an uninitialized pointer if the read fails before setting it
    ARC_String *data = NULL;

    //read file and clean up if it errors
    ARC_IO_FileToStr(path, &data);
    if (arc_errno) {
        ARC_DEBUG_LOG_ERROR(" ARC_Lexer_LexFile(lexer, path), errored when running ARC_IO_FileToStr(path, &data);. check logs for more info ");
        if (data != NULL) {
            ARC_String_Destroy(data);
        }
        return;
    }

    //lex the string and log if there is an error
    ARC_Lexer_LexString(lexer, &data);
    if (arc_errno) {
        ARC_DEBUG_LOG_ERROR(" ARC_Lexer_LexFile(lexer, path), errored when running ARC_Lexer_LexString(lexer, data);. check logs for more info ");
    }

    //ARC_Lexer_LexString sets data to NULL whenever it consumed or destroyed the string, but it returns early without
    //touching it when no rules are registered. the string is local to this function, so release whatever is left
    if (data != NULL) {
        ARC_String_Destroy(data);
    }
}
2024-08-28 20:04:18 -06:00
2024-08-29 05:04:08 -06:00
//returns a copy of the token stored at index
//if arc_errno is set after the lookup, the error is logged and a token with the max rule value and NULL data is returned
ARC_LexerToken ARC_Lexer_GetToken(ARC_Lexer *lexer, uint32_t index) {
    ARC_LexerToken *found = ARC_Vector_Get(lexer->tokens, index);

    //no error, so the lookup worked and a copy of the stored token can be handed back
    if (!arc_errno) {
        return *found;
    }

    ARC_DEBUG_LOG_ERROR(" ARC_Lexer_GetToken(lexer, index), errored when running ARC_Vector_Get(lexer->tokens, index);. check logs for more info ");

    //all bits set for the rule and NULL for the string signify an error
    return (ARC_LexerToken){ ~(uint32_t)0, NULL };
}
//returns the size of the lexer's tokens vector as reported by ARC_Vector_GetSize
uint32_t ARC_Lexer_GetTokensSize(ARC_Lexer *lexer) {
    uint32_t tokensSize = ARC_Vector_GetSize(lexer->tokens);
    return tokensSize;
}
//automata function that matches a single character (automataData is read as a char *) against the first character of *string
//on a match the character is removed from the front of *string (a one character string is destroyed and *string set to
//NULL) and ARC_True is returned, otherwise *string is left alone and ARC_False is returned
//*tokenData is always set to NULL, the matched text is already known from automataData so it is not stored again
ARC_Bool ARC_Lexer_AutomataMatchCharFn(ARC_String **string, ARC_String **tokenData, void *automataData) {
    *tokenData = NULL;

    //bail out early when the first character is not the one this rule looks for
    char wanted = *(char *)automataData;
    if ((*string)->data[0] != wanted) {
        return ARC_False;
    }

    //more than the matched character is left, so strip it from the front and keep the rest
    if ((*string)->length != 1) {
        ARC_String_ReplaceWithSubstring(string, 1, (*string)->length - 1);
        return ARC_True;
    }

    //the match was the whole string, instead of stripping down to nothing the string is just deleted
    ARC_String_Destroy(*string);
    *string = NULL;
    return ARC_True;
}
//automata function that matches a string (automataData is read as an ARC_String *) against *string
//on a match the matched text is removed from *string (if nothing would be left the string is destroyed and *string is
//set to NULL) and ARC_True is returned, otherwise *string is left alone and ARC_False is returned
//*tokenData is always set to NULL, the matched text is already known from automataData so it is not stored again
ARC_Bool ARC_Lexer_AutomataMatchStringFn(ARC_String **string, ARC_String **tokenData, void *automataData) {
    *tokenData = NULL;

    //check to see if there is a match with automataData as a string
    //NOTE(review): if ARC_String_Equals only succeeds on whole-string equality, this rule can never match the front of
    //a longer string and the strip branch below is unreachable - confirm whether a prefix comparison was intended
    ARC_String *automataDataString = (ARC_String *)automataData;
    if (!ARC_String_Equals(*string, automataDataString)) {
        return ARC_False;
    }

    //to keep from erroring instead of stripping from a same length string we can just delete it
    //returning here matters: *string is NULL afterwards, so falling through to the strip below would dereference NULL
    if ((*string)->length == automataDataString->length) {
        ARC_String_Destroy(*string);
        *string = NULL;
        return ARC_True;
    }

    //strip the token string from the front of the string and return that a match was found
    ARC_String_ReplaceWithSubstring(string, automataDataString->length, (*string)->length - automataDataString->length);
    return ARC_True;
}
2024-08-29 05:04:08 -06:00
//private destroy function for automataData that was stored as a single heap allocated char
void ARC_LexerTokenRule_DestroyCharAutomataDataFn(void *automataData) {
    char *storedCharacter = (char *)automataData;
    free(storedCharacter);
}
//builds and returns a token rule with the given id that matches a single character
//the character is stored in a heap allocated char which the rule's destroyAutomataDataFn frees
ARC_LexerTokenRule ARC_LexerTokenRule_CreateAndReturnMatchCharRule(uint32_t id, char character) {
    //the automataData is just the character, it lives on the heap so it can be passed around as a void *
    char *storedCharacter = (char *)malloc(sizeof(char));
    *storedCharacter = character;

    //fill in the rule: id, the stored character, the char matching automata and the matching private destroy function
    ARC_LexerTokenRule rule;
    rule.id = id;
    rule.automataData = (void *)storedCharacter;
    rule.automataFn = ARC_Lexer_AutomataMatchCharFn;
    rule.destroyAutomataDataFn = ARC_LexerTokenRule_DestroyCharAutomataDataFn;

    return rule;
}
//private function to free automataData stored as an ARC_String
void ARC_LexerTokenRule_DestroyStringAutomataDataFn ( void * automataData ) {
ARC_String_Destroy ( ( ARC_String * ) automataData ) ;
}
//builds and returns a token rule with the given id that matches a string
//the rule stores its own copy of string (made with ARC_String_Copy) which the rule's destroyAutomataDataFn destroys,
//the string passed in is not kept
ARC_LexerTokenRule ARC_LexerTokenRule_CreateAndReturnMatchStringRule(uint32_t id, ARC_String *string) {
    //create the token rule
    ARC_LexerTokenRule tokenRule;

    //set the id
    tokenRule.id = id;

    //copy and store the automataData (which is just an ARC_String)
    ARC_String *automataData = NULL;
    ARC_String_Copy(&automataData, string);
    tokenRule.automataData = (void *)automataData;

    //the automataData is an ARC_String, so the string matching automata has to be used. the char matching automata
    //would read the ARC_String struct itself as a char
    tokenRule.automataFn = ARC_Lexer_AutomataMatchStringFn;

    //add the private destroy function
    tokenRule.destroyAutomataDataFn = ARC_LexerTokenRule_DestroyStringAutomataDataFn;

    //return the created tokenRule
    return tokenRule;
}
//registers the basic single character token rules with the lexer
void ARC_Lexer_InitBasicTokenRules(ARC_Lexer *lexer) {
    //id and character of every basic rule, listed in the order they get registered
    const struct {
        uint32_t id;
        char character;
    } basicRules[] = {
        { LEXER_TOKEN_COLON_ID, LEXER_TOKEN_COLON_CHAR },
        { LEXER_TOKEN_SEMICOLON_ID, LEXER_TOKEN_SEMICOLON_CHAR },
        { LEXER_TOKEN_COMMA_ID, LEXER_TOKEN_COMMA_CHAR },
        { LEXER_TOKEN_PERIOD_ID, LEXER_TOKEN_PERIOD_CHAR },
        { LEXER_TOKEN_FORWARD_SLASH_ID, LEXER_TOKEN_FORWARD_SLASH_CHAR },
        { LEXER_TOKEN_BACK_SLASH_ID, LEXER_TOKEN_BACK_SLASH_CHAR },
        { LEXER_TOKEN_LEFT_PARENTHESIS_ID, LEXER_TOKEN_LEFT_PARENTHESIS_CHAR },
        { LEXER_TOKEN_RIGHT_PARENTHESIS_ID, LEXER_TOKEN_RIGHT_PARENTHESIS_CHAR },
        { LEXER_TOKEN_LEFT_CURLY_BRACE_ID, LEXER_TOKEN_LEFT_CURLY_BRACE_CHAR },
        { LEXER_TOKEN_RIGHT_CURLY_BRACE_ID, LEXER_TOKEN_RIGHT_CURLY_BRACE_CHAR },
        { LEXER_TOKEN_BANG_ID, LEXER_TOKEN_BANG_CHAR },
        { LEXER_TOKEN_AT_ID, LEXER_TOKEN_AT_CHAR },
        { LEXER_TOKEN_HASH_ID, LEXER_TOKEN_HASH_CHAR },
        { LEXER_TOKEN_PERCENT_ID, LEXER_TOKEN_PERCENT_CHAR },
    };
    const uint32_t basicRulesSize = sizeof(basicRules) / sizeof(basicRules[0]);

    //create a match char rule for each entry and hand it to the lexer
    for (uint32_t index = 0; index < basicRulesSize; index++) {
        ARC_Lexer_RegisterTokenRule(lexer, ARC_LexerTokenRule_CreateAndReturnMatchCharRule(basicRules[index].id, basicRules[index].character));
    }
}