From b10f9b9123b7538627249f90671d9a2c98631e9b Mon Sep 17 00:00:00 2001
From: herbglitch
Date: Wed, 30 Oct 2024 07:36:43 -0600
Subject: [PATCH] fixed a lot of the lexer and parser (the segfault, and now can do char ranges)

---
 include/arc/std/lexer.h | 40 ++++++++++++++-------------
 src/std/lexer.c         | 60 ++++++++++++++++++++++++++++++++++++++++-
 src/std/parser.c        | 13 ++++++---
 tests/std/parser.c      | 49 ++++++++++++++++-----------------
 4 files changed, 115 insertions(+), 47 deletions(-)

diff --git a/include/arc/std/lexer.h b/include/arc/std/lexer.h
index 2544ef8..fa5c0ce 100644
--- a/include/arc/std/lexer.h
+++ b/include/arc/std/lexer.h
@@ -207,55 +207,57 @@ ARC_LexerTokenRule ARC_LexerTokenRule_CreateAndReturnMatchCharInStringRule(uint3
 /**
  * @brief basic tokens
  */
-#define ARC_LEXER_TOKEN_NULL 0
-#define ARC_LEXER_TOKEN_EOF 1
-#define ARC_LEXER_TOKEN_NUMBER 2
-#define ARC_LEXER_TOKEN_ALPHACHAR 3
-#define ARC_LEXER_TOKEN_WHITESPACE 4
+#define ARC_LEXER_TOKEN_NULL 0
+#define ARC_LEXER_TOKEN_NUMBER 1
+#define ARC_LEXER_TOKEN_ALPHALOWERCHAR 2
+#define ARC_LEXER_TOKEN_ALPHAUPPERCHAR 3
+#define ARC_LEXER_TOKEN_WHITESPACE 4
 
 /**
  * @brief basic token type ids, chars, and tags
  */
-#define ARC_LEXER_TOKEN_COLON_ID 1
+#define ARC_LEXER_TOKEN_NEWLINE_ID 5
+#define ARC_LEXER_TOKEN_NEWLINE_CHAR '\n'
+#define ARC_LEXER_TOKEN_COLON_ID 6
 #define ARC_LEXER_TOKEN_COLON_CHAR ':'
 #define ARC_LEXER_TOKEN_COLON_TAG "COLON"
-#define ARC_LEXER_TOKEN_SEMICOLON_ID 2
+#define ARC_LEXER_TOKEN_SEMICOLON_ID 7
 #define ARC_LEXER_TOKEN_SEMICOLON_CHAR ';'
 #define ARC_LEXER_TOKEN_SEMICOLON_TAG "SEMICOLON"
-#define ARC_LEXER_TOKEN_COMMA_ID 3
+#define ARC_LEXER_TOKEN_COMMA_ID 8
 #define ARC_LEXER_TOKEN_COMMA_CHAR ','
 #define ARC_LEXER_TOKEN_COMMA_TAG "COMMA"
-#define ARC_LEXER_TOKEN_PERIOD_ID 4
+#define ARC_LEXER_TOKEN_PERIOD_ID 9
 #define ARC_LEXER_TOKEN_PERIOD_CHAR '.'
 #define ARC_LEXER_TOKEN_PERIOD_TAG "PERIOD"
-#define ARC_LEXER_TOKEN_FORWARD_SLASH_ID 5
+#define ARC_LEXER_TOKEN_FORWARD_SLASH_ID 10
 #define ARC_LEXER_TOKEN_FORWARD_SLASH_CHAR '/'
 #define ARC_LEXER_TOKEN_FORWARD_SLASH_TAG "FORWARD_SLASH"
-#define ARC_LEXER_TOKEN_BACK_SLASH_ID 6
+#define ARC_LEXER_TOKEN_BACK_SLASH_ID 11
 #define ARC_LEXER_TOKEN_BACK_SLASH_CHAR '\\'
 #define ARC_LEXER_TOKEN_BACK_SLASH_TAG "BACK_SLASH"
-#define ARC_LEXER_TOKEN_LEFT_PARENTHESIS_ID 7
+#define ARC_LEXER_TOKEN_LEFT_PARENTHESIS_ID 12
 #define ARC_LEXER_TOKEN_LEFT_PARENTHESIS_CHAR '('
 #define ARC_LEXER_TOKEN_LEFT_PARENTHESIS_TAG "LEFT_PARENTHESIS"
-#define ARC_LEXER_TOKEN_RIGHT_PARENTHESIS_ID 8
+#define ARC_LEXER_TOKEN_RIGHT_PARENTHESIS_ID 13
 #define ARC_LEXER_TOKEN_RIGHT_PARENTHESIS_CHAR ')'
 #define ARC_LEXER_TOKEN_RIGHT_PARENTHESIS_TAG "RIGHT_PARENTHESIS"
-#define ARC_LEXER_TOKEN_LEFT_CURLY_BRACE_ID 9
+#define ARC_LEXER_TOKEN_LEFT_CURLY_BRACE_ID 14
 #define ARC_LEXER_TOKEN_LEFT_CURLY_BRACE_CHAR '{'
 #define ARC_LEXER_TOKEN_LEFT_CURLY_BRACE_TAG "LEFT_CURLY_BRACE"
-#define ARC_LEXER_TOKEN_RIGHT_CURLY_BRACE_ID 10
+#define ARC_LEXER_TOKEN_RIGHT_CURLY_BRACE_ID 15
 #define ARC_LEXER_TOKEN_RIGHT_CURLY_BRACE_CHAR '}'
 #define ARC_LEXER_TOKEN_RIGHT_CURLY_BRACE_TAG "RIGHT_CURLY_BRACE"
-#define ARC_LEXER_TOKEN_BANG_ID 11
+#define ARC_LEXER_TOKEN_BANG_ID 16
 #define ARC_LEXER_TOKEN_BANG_CHAR '!'
 #define ARC_LEXER_TOKEN_BANG_TAG "BANG"
-#define ARC_LEXER_TOKEN_AT_ID 12
+#define ARC_LEXER_TOKEN_AT_ID 17
 #define ARC_LEXER_TOKEN_AT_CHAR '!'
 #define ARC_LEXER_TOKEN_AT_TAG "AT"
-#define ARC_LEXER_TOKEN_HASH_ID 13
+#define ARC_LEXER_TOKEN_HASH_ID 18
 #define ARC_LEXER_TOKEN_HASH_CHAR '#'
 #define ARC_LEXER_TOKEN_HASH_TAG "HASH"
-#define ARC_LEXER_TOKEN_PERCENT_ID 14
+#define ARC_LEXER_TOKEN_PERCENT_ID 19
 #define ARC_LEXER_TOKEN_PERCENT_CHAR '%'
 #define ARC_LEXER_TOKEN_PERCENT_TAG "PERCENT"
 
diff --git a/src/std/lexer.c b/src/std/lexer.c
index ac39e2f..a842aeb 100644
--- a/src/std/lexer.c
+++ b/src/std/lexer.c
@@ -304,6 +304,22 @@ uint32_t ARC_Lexer_AutomataMatchCharFn(ARC_String **tokenData, ARC_String *strin
     return 0;
 }
 
+uint32_t ARC_Lexer_AutomataMatchCharOrBetweenFn(ARC_String **tokenData, ARC_String *string, void *automataData){
+    //default to no token data, it only gets filled in when a match is found
+    *tokenData = NULL;
+
+    //check to see if there is a match with automataData as a range of chars
+    char *automataDataChars = (char *)automataData;
+    if(string->data[0] >= automataDataChars[0] && string->data[0] <= automataDataChars[1]){
+        //store the matched char as the token data and report a match of length 1
+        ARC_String_Create(tokenData, string->data, 1);
+        return 1;
+    }
+
+    //no match was found
+    return 0;
+}
+
 uint32_t ARC_Lexer_AutomataMatchStringFn(ARC_String **tokenData, ARC_String *string, void *automataData){
     //if there is a match the token will be the same as automataData, so we don't need to store it again
     *tokenData = NULL;
@@ -363,6 +379,28 @@ ARC_LexerTokenRule ARC_LexerTokenRule_CreateAndReturnMatchCharRule(uint32_t id,
     return tokenRule;
 }
 
+ARC_LexerTokenRule ARC_LexerTokenRule_CreateAndReturnMatchCharOrBetween(uint32_t id, char start, char end){
+    //create the token rule
+    ARC_LexerTokenRule tokenRule;
+
+    //set the id
+    tokenRule.id = id;
+
+    //create and store the automataData (which is just two chars: the minimum and maximum of the range)
+    char *automataData = (char *)malloc(sizeof(char) * 2);
+    automataData[0] = start;
+    automataData[1] = end;
+    tokenRule.automataData = (void *)automataData;
+
+    //match with the char range automata function
+    tokenRule.automataFn = ARC_Lexer_AutomataMatchCharOrBetweenFn;
+
+    //add the private destroy function (the char one works here as it frees a char pointer of any size)
+    tokenRule.destroyAutomataDataFn = ARC_LexerTokenRule_DestroyCharAutomataDataFn;
+
+    //return the created tokenRule
+    return tokenRule;
+}
 //private function to free automataData stored as an ARC_String
 void ARC_LexerTokenRule_DestroyStringAutomataDataFn(void *automataData){
     ARC_String_Destroy((ARC_String *)automataData);
@@ -413,7 +451,27 @@ ARC_LexerTokenRule ARC_LexerTokenRule_CreateAndReturnMatchCharInStringRule(uint3
 }
 
 void ARC_Lexer_InitBasicTokenRules(ARC_Lexer *lexer){
-    ARC_Lexer_RegisterTokenRule(lexer, ARC_LexerTokenRule_CreateAndReturnMatchCharRule(ARC_LEXER_TOKEN_NULL        , 0                             ));
+    //null
+    ARC_Lexer_RegisterTokenRule(lexer, ARC_LexerTokenRule_CreateAndReturnMatchCharRule(ARC_LEXER_TOKEN_NULL, 0));
+
+    //number
+    ARC_Lexer_RegisterTokenRule(lexer, ARC_LexerTokenRule_CreateAndReturnMatchCharOrBetween(ARC_LEXER_TOKEN_NUMBER, '0', '9'));
+
+    //alpha char
+    ARC_Lexer_RegisterTokenRule(lexer, ARC_LexerTokenRule_CreateAndReturnMatchCharOrBetween(ARC_LEXER_TOKEN_ALPHALOWERCHAR, 'a', 'z'));
+    ARC_Lexer_RegisterTokenRule(lexer, ARC_LexerTokenRule_CreateAndReturnMatchCharOrBetween(ARC_LEXER_TOKEN_ALPHAUPPERCHAR, 'A', 'Z'));
+
+    //whitespace
+    //TODO: fix this
+    ARC_String *whitespaceString;
+    ARC_String_CreateWithStrlen(&whitespaceString, " \t");
+    ARC_Lexer_RegisterTokenRule(lexer, ARC_LexerTokenRule_CreateAndReturnMatchCharInStringRule(ARC_LEXER_TOKEN_WHITESPACE, whitespaceString));
+    ARC_String_Destroy(whitespaceString);
+    //TEMP FIX:
+    //ARC_Lexer_RegisterTokenRule(lexer, ARC_LexerTokenRule_CreateAndReturnMatchCharRule(ARC_LEXER_TOKEN_WHITESPACE, ' '));
+
+    //single char tokens
+    ARC_Lexer_RegisterTokenRule(lexer, ARC_LexerTokenRule_CreateAndReturnMatchCharRule(ARC_LEXER_TOKEN_NEWLINE_ID  , ARC_LEXER_TOKEN_NEWLINE_CHAR  ));
     ARC_Lexer_RegisterTokenRule(lexer, ARC_LexerTokenRule_CreateAndReturnMatchCharRule(ARC_LEXER_TOKEN_COLON_ID    , ARC_LEXER_TOKEN_COLON_CHAR    ));
     ARC_Lexer_RegisterTokenRule(lexer, ARC_LexerTokenRule_CreateAndReturnMatchCharRule(ARC_LEXER_TOKEN_SEMICOLON_ID, ARC_LEXER_TOKEN_SEMICOLON_CHAR));
     ARC_Lexer_RegisterTokenRule(lexer, ARC_LexerTokenRule_CreateAndReturnMatchCharRule(ARC_LEXER_TOKEN_COMMA_ID    , ARC_LEXER_TOKEN_COMMA_CHAR    ));
 
diff --git a/src/std/parser.c b/src/std/parser.c
index 9b1da5b..8fba3ce 100644
--- a/src/std/parser.c
+++ b/src/std/parser.c
@@ -25,7 +25,7 @@ void ARC_Parser_Create(ARC_Parser **parser, ARC_Array *language, ARC_Parser_Init
         (*parser)->language.size = language->size;
         (*parser)->language.data = malloc(sizeof(ARC_ParserLanguageTag) * language->size);
 
-        memcpy((*parser)->language.data, language->data, language->size);
+        memcpy((*parser)->language.data, language->data, sizeof(ARC_ParserLanguageTag) * language->size);
     }
 
     //create the lexer
@@ -103,6 +103,12 @@ void ARC_Parser_ParseTag(ARC_Parser *parser, uint32_t *lexerIndex, uint32_t tagI
 }
 
 void ARC_Parser_Parse(ARC_Parser *parser, ARC_String **data){
+    //make sure the parser has a language
+    if(parser->language.size == 0){
+        ARC_DEBUG_LOG_ERROR("ARC_Parser_Parse(parser, data), no parser language defined");
+        return;
+    }
+
     //lex the subdata
     ARC_Lexer_LexString(parser->lexer, data);
     if(arc_errno){
@@ -110,11 +116,12 @@ void ARC_Parser_Parse(ARC_Parser *parser, ARC_String **data){
         return;
     }
 
+    //set the lexer index to start and get the first tag
     uint32_t lexerIndex = 0;
-    ARC_ParserLanguageTag startTag = ((ARC_ParserLanguageTag *)parser->language.data)[0];
+    ARC_ParserLanguageTag *startTag = parser->language.data;
 
     //recursivly parse from the inital start tag
-    ARC_Parser_ParseTag(parser, &lexerIndex, startTag.tagId);
+    ARC_Parser_ParseTag(parser, &lexerIndex, startTag->tagId);
     if(arc_errno){
         ARC_DEBUG_LOG_ERROR("ARC_Parser_Parse(parser, data), could not parse the given data");
         return;
diff --git a/tests/std/parser.c b/tests/std/parser.c
index affbefa..50be9c6 100644
--- a/tests/std/parser.c
+++ b/tests/std/parser.c
@@ -2,12 +2,13 @@
 #include "arc/std/errno.h"
 #include "arc/std/parser.h"
 
-#define LAMBDA 0
-#define CHAR 1
-#define NUM 2
-#define CHAR_OR_NUM 3
-#define VARIABLE_NAME 4
-#define VARIABLE 5
+//TODO: fix lambda
+#define LAMBDA 20
+#define CHAR ARC_LEXER_TOKEN_ALPHALOWERCHAR
+#define NUM ARC_LEXER_TOKEN_NUMBER
+#define CHAR_OR_NUM 23
+#define VARIABLE_NAME 24
+#define VARIABLE 25
 
 void TEST_Parser_InitLexerRulesFn(ARC_Lexer *lexer){
     ARC_Lexer_InitBasicTokenRules(lexer);
@@ -21,9 +22,9 @@ ARC_TEST(Parser_Init){
     uint32_t *variableTokensOrTags[] = { (uint32_t[]){ 2, CHAR, VARIABLE_NAME } };
     ARC_ParserLanguageTag testTags[3] = {
         {
-            CHAR_OR_NUM,      //tagId
-            charOrNumTokens,  //tokensOrTags
-            2                 //tokenOrTagsSize
+            VARIABLE,             //tagId
+            variableTokensOrTags, //tokensOrTags
+            1                     //tokenOrTagsSize
         },
         {
             VARIABLE_NAME,        //tagId
@@ -31,9 +32,9 @@ ARC_TEST(Parser_Init){
             2                     //tokenOrTagsSize
         },
         {
-            VARIABLE,             //tagId
-            variableTokensOrTags, //tokensOrTags
-            1                     //tokenOrTagsSize
+            CHAR_OR_NUM,      //tagId
+            charOrNumTokens,  //tokensOrTags
+            2                 //tokenOrTagsSize
         }
     };
 
@@ -57,9 +58,9 @@ ARC_TEST(Parser_Basic_Parse){
     uint32_t *variableTokensOrTags[] = { (uint32_t[]){ 2, CHAR, VARIABLE_NAME } };
     ARC_ParserLanguageTag testTags[3] = {
         {
-            CHAR_OR_NUM,      //tagId
-            charOrNumTokens,  //tokensOrTags
-            2                 //tokenOrTagsSize
+            VARIABLE,             //tagId
+            variableTokensOrTags, //tokensOrTags
+            1                     //tokenOrTagsSize
         },
         {
             VARIABLE_NAME,        //tagId
@@ -67,9 +68,9 @@ ARC_TEST(Parser_Basic_Parse){
             2                     //tokenOrTagsSize
         },
         {
-            VARIABLE,             //tagId
-            variableTokensOrTags, //tokensOrTags
-            1                     //tokenOrTagsSize
+            CHAR_OR_NUM,      //tagId
+            charOrNumTokens,  //tokensOrTags
+            2                 //tokenOrTagsSize
         }
     };
 
@@ -97,9 +98,9 @@ ARC_TEST(Parser_Basic_ParseError){
     uint32_t *variableTokensOrTags[] = { (uint32_t[]){ 2, CHAR, VARIABLE_NAME } };
    ARC_ParserLanguageTag testTags[3] = {
         {
-            CHAR_OR_NUM,      //tagId
-            charOrNumTokens,  //tokensOrTags
-            2                 //tokenOrTagsSize
+            VARIABLE,             //tagId
+            variableTokensOrTags, //tokensOrTags
+            1                     //tokenOrTagsSize
         },
         {
             VARIABLE_NAME,        //tagId
@@ -107,9 +108,9 @@ ARC_TEST(Parser_Basic_ParseError){
             2                     //tokenOrTagsSize
         },
         {
-            VARIABLE,             //tagId
-            variableTokensOrTags, //tokensOrTags
-            1                     //tokenOrTagsSize
+            CHAR_OR_NUM,      //tagId
+            charOrNumTokens,  //tokensOrTags
+            2                 //tokenOrTagsSize
         }
     };
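
Usage note (not part of the patch): a minimal sketch of how the new char-range rule might be registered and driven, mirroring the calls ARC_Lexer_InitBasicTokenRules and ARC_Parser_Parse make above. ARC_Lexer_Create and ARC_Lexer_Destroy are assumed names for the lexer constructor and destructor (they do not appear in this diff); every other identifier is taken from the patch.

#include "arc/std/lexer.h"

void example_lex_char_ranges(void){
    ARC_Lexer *lexer;
    ARC_Lexer_Create(&lexer); //assumed constructor, not shown in this diff

    //same pattern ARC_Lexer_InitBasicTokenRules now uses for digits and letters
    ARC_Lexer_RegisterTokenRule(lexer, ARC_LexerTokenRule_CreateAndReturnMatchCharOrBetween(ARC_LEXER_TOKEN_NUMBER,         '0', '9'));
    ARC_Lexer_RegisterTokenRule(lexer, ARC_LexerTokenRule_CreateAndReturnMatchCharOrBetween(ARC_LEXER_TOKEN_ALPHALOWERCHAR, 'a', 'z'));
    ARC_Lexer_RegisterTokenRule(lexer, ARC_LexerTokenRule_CreateAndReturnMatchCharOrBetween(ARC_LEXER_TOKEN_ALPHAUPPERCHAR, 'A', 'Z'));

    //lex a string the same way ARC_Parser_Parse does in src/std/parser.c
    ARC_String *source;
    ARC_String_CreateWithStrlen(&source, "abc123");
    ARC_Lexer_LexString(lexer, &source);

    //clean up (whether the lexer takes ownership of the lexed string is an assumption here)
    ARC_String_Destroy(source);
    ARC_Lexer_Destroy(lexer); //assumed destructor, not shown in this diff
}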