Fixed several lexer and parser issues (the segfault; char ranges are now supported)

This commit is contained in:
herbglitch 2024-10-30 07:36:43 -06:00
parent 050f7a8452
commit b10f9b9123
4 changed files with 115 additions and 47 deletions

View file

@ -207,55 +207,57 @@ ARC_LexerTokenRule ARC_LexerTokenRule_CreateAndReturnMatchCharInStringRule(uint3
/** /**
* @brief basic tokens * @brief basic tokens
*/ */
#define ARC_LEXER_TOKEN_NULL 0 #define ARC_LEXER_TOKEN_NULL 0
#define ARC_LEXER_TOKEN_EOF 1 #define ARC_LEXER_TOKEN_NUMBER 1
#define ARC_LEXER_TOKEN_NUMBER 2 #define ARC_LEXER_TOKEN_ALPHALOWERCHAR 2
#define ARC_LEXER_TOKEN_ALPHACHAR 3 #define ARC_LEXER_TOKEN_ALPHAUPPERCHAR 3
#define ARC_LEXER_TOKEN_WHITESPACE 4 #define ARC_LEXER_TOKEN_WHITESPACE 4
/** /**
* @brief basic token type ids, chars, and tags * @brief basic token type ids, chars, and tags
*/ */
#define ARC_LEXER_TOKEN_COLON_ID 1 #define ARC_LEXER_TOKEN_NEWLINE_ID 5
#define ARC_LEXER_TOKEN_NEWLINE_CHAR '\n'
#define ARC_LEXER_TOKEN_COLON_ID 6
#define ARC_LEXER_TOKEN_COLON_CHAR ':' #define ARC_LEXER_TOKEN_COLON_CHAR ':'
#define ARC_LEXER_TOKEN_COLON_TAG "COLON" #define ARC_LEXER_TOKEN_COLON_TAG "COLON"
#define ARC_LEXER_TOKEN_SEMICOLON_ID 2 #define ARC_LEXER_TOKEN_SEMICOLON_ID 7
#define ARC_LEXER_TOKEN_SEMICOLON_CHAR ';' #define ARC_LEXER_TOKEN_SEMICOLON_CHAR ';'
#define ARC_LEXER_TOKEN_SEMICOLON_TAG "SEMICOLON" #define ARC_LEXER_TOKEN_SEMICOLON_TAG "SEMICOLON"
#define ARC_LEXER_TOKEN_COMMA_ID 3 #define ARC_LEXER_TOKEN_COMMA_ID 8
#define ARC_LEXER_TOKEN_COMMA_CHAR ',' #define ARC_LEXER_TOKEN_COMMA_CHAR ','
#define ARC_LEXER_TOKEN_COMMA_TAG "COMMA" #define ARC_LEXER_TOKEN_COMMA_TAG "COMMA"
#define ARC_LEXER_TOKEN_PERIOD_ID 4 #define ARC_LEXER_TOKEN_PERIOD_ID 9
#define ARC_LEXER_TOKEN_PERIOD_CHAR '.' #define ARC_LEXER_TOKEN_PERIOD_CHAR '.'
#define ARC_LEXER_TOKEN_PERIOD_TAG "PERIOD" #define ARC_LEXER_TOKEN_PERIOD_TAG "PERIOD"
#define ARC_LEXER_TOKEN_FORWARD_SLASH_ID 5 #define ARC_LEXER_TOKEN_FORWARD_SLASH_ID 10
#define ARC_LEXER_TOKEN_FORWARD_SLASH_CHAR '/' #define ARC_LEXER_TOKEN_FORWARD_SLASH_CHAR '/'
#define ARC_LEXER_TOKEN_FORWARD_SLASH_TAG "FORWARD_SLASH" #define ARC_LEXER_TOKEN_FORWARD_SLASH_TAG "FORWARD_SLASH"
#define ARC_LEXER_TOKEN_BACK_SLASH_ID 6 #define ARC_LEXER_TOKEN_BACK_SLASH_ID 11
#define ARC_LEXER_TOKEN_BACK_SLASH_CHAR '\\' #define ARC_LEXER_TOKEN_BACK_SLASH_CHAR '\\'
#define ARC_LEXER_TOKEN_BACK_SLASH_TAG "BACK_SLASH" #define ARC_LEXER_TOKEN_BACK_SLASH_TAG "BACK_SLASH"
#define ARC_LEXER_TOKEN_LEFT_PARENTHESIS_ID 7 #define ARC_LEXER_TOKEN_LEFT_PARENTHESIS_ID 12
#define ARC_LEXER_TOKEN_LEFT_PARENTHESIS_CHAR '(' #define ARC_LEXER_TOKEN_LEFT_PARENTHESIS_CHAR '('
#define ARC_LEXER_TOKEN_LEFT_PARENTHESIS_TAG "LEFT_PARENTHESIS" #define ARC_LEXER_TOKEN_LEFT_PARENTHESIS_TAG "LEFT_PARENTHESIS"
#define ARC_LEXER_TOKEN_RIGHT_PARENTHESIS_ID 8 #define ARC_LEXER_TOKEN_RIGHT_PARENTHESIS_ID 13
#define ARC_LEXER_TOKEN_RIGHT_PARENTHESIS_CHAR ')' #define ARC_LEXER_TOKEN_RIGHT_PARENTHESIS_CHAR ')'
#define ARC_LEXER_TOKEN_RIGHT_PARENTHESIS_TAG "RIGHT_PARENTHESIS" #define ARC_LEXER_TOKEN_RIGHT_PARENTHESIS_TAG "RIGHT_PARENTHESIS"
#define ARC_LEXER_TOKEN_LEFT_CURLY_BRACE_ID 9 #define ARC_LEXER_TOKEN_LEFT_CURLY_BRACE_ID 14
#define ARC_LEXER_TOKEN_LEFT_CURLY_BRACE_CHAR '{' #define ARC_LEXER_TOKEN_LEFT_CURLY_BRACE_CHAR '{'
#define ARC_LEXER_TOKEN_LEFT_CURLY_BRACE_TAG "LEFT_CURLY_BRACE" #define ARC_LEXER_TOKEN_LEFT_CURLY_BRACE_TAG "LEFT_CURLY_BRACE"
#define ARC_LEXER_TOKEN_RIGHT_CURLY_BRACE_ID 10 #define ARC_LEXER_TOKEN_RIGHT_CURLY_BRACE_ID 15
#define ARC_LEXER_TOKEN_RIGHT_CURLY_BRACE_CHAR '}' #define ARC_LEXER_TOKEN_RIGHT_CURLY_BRACE_CHAR '}'
#define ARC_LEXER_TOKEN_RIGHT_CURLY_BRACE_TAG "RIGHT_CURLY_BRACE" #define ARC_LEXER_TOKEN_RIGHT_CURLY_BRACE_TAG "RIGHT_CURLY_BRACE"
#define ARC_LEXER_TOKEN_BANG_ID 11 #define ARC_LEXER_TOKEN_BANG_ID 16
#define ARC_LEXER_TOKEN_BANG_CHAR '!' #define ARC_LEXER_TOKEN_BANG_CHAR '!'
#define ARC_LEXER_TOKEN_BANG_TAG "BANG" #define ARC_LEXER_TOKEN_BANG_TAG "BANG"
#define ARC_LEXER_TOKEN_AT_ID 12 #define ARC_LEXER_TOKEN_AT_ID 17
#define ARC_LEXER_TOKEN_AT_CHAR '!' #define ARC_LEXER_TOKEN_AT_CHAR '!'
#define ARC_LEXER_TOKEN_AT_TAG "AT" #define ARC_LEXER_TOKEN_AT_TAG "AT"
#define ARC_LEXER_TOKEN_HASH_ID 13 #define ARC_LEXER_TOKEN_HASH_ID 18
#define ARC_LEXER_TOKEN_HASH_CHAR '#' #define ARC_LEXER_TOKEN_HASH_CHAR '#'
#define ARC_LEXER_TOKEN_HASH_TAG "HASH" #define ARC_LEXER_TOKEN_HASH_TAG "HASH"
#define ARC_LEXER_TOKEN_PERCENT_ID 14 #define ARC_LEXER_TOKEN_PERCENT_ID 19
#define ARC_LEXER_TOKEN_PERCENT_CHAR '%' #define ARC_LEXER_TOKEN_PERCENT_CHAR '%'
#define ARC_LEXER_TOKEN_PERCENT_TAG "PERCENT" #define ARC_LEXER_TOKEN_PERCENT_TAG "PERCENT"

View file

@ -304,6 +304,22 @@ uint32_t ARC_Lexer_AutomataMatchCharFn(ARC_String **tokenData, ARC_String *strin
return 0; return 0;
} }
uint32_t ARC_Lexer_AutomataMatchCharOrBetweenFn(ARC_String **tokenData, ARC_String *string, void *automataData){
    //default to no token data; on a match the matched char is copied into a new 1-char token below
    *tokenData = NULL;

    //automataData is a two-char array holding the inclusive range [min, max]
    //(use one cast and index it consistently instead of re-casting inline)
    char *automataDataChars = (char *)automataData;

    //check whether the first char of the input falls inside the range
    if(string->data[0] >= automataDataChars[0] && string->data[0] <= automataDataChars[1]){
        //store the matched char as the token data and report a match of length 1
        ARC_String_Create(tokenData, string->data, 1);
        return 1;
    }

    //no match was found
    return 0;
}
uint32_t ARC_Lexer_AutomataMatchStringFn(ARC_String **tokenData, ARC_String *string, void *automataData){ uint32_t ARC_Lexer_AutomataMatchStringFn(ARC_String **tokenData, ARC_String *string, void *automataData){
//if there is a match the token will be the same as automataData, so we don't need to store it again //if there is a match the token will be the same as automataData, so we don't need to store it again
*tokenData = NULL; *tokenData = NULL;
@ -363,6 +379,28 @@ ARC_LexerTokenRule ARC_LexerTokenRule_CreateAndReturnMatchCharRule(uint32_t id,
return tokenRule; return tokenRule;
} }
ARC_LexerTokenRule ARC_LexerTokenRule_CreateAndReturnMatchCharOrBetween(uint32_t id, char start, char end){
    //create the token rule
    ARC_LexerTokenRule tokenRule;

    //set the id
    tokenRule.id = id;

    //create and store the automataData (just two chars: the inclusive minimum and maximum)
    //guard the writes so an allocation failure doesn't become undefined behavior;
    //on failure automataData stays NULL and free(NULL) in the destroy fn is a no-op
    char *automataData = malloc(sizeof(char) * 2);
    if(automataData != NULL){
        automataData[0] = start;
        automataData[1] = end;
    }
    tokenRule.automataData = (void *)automataData;

    //match with the char-range automata function
    tokenRule.automataFn = ARC_Lexer_AutomataMatchCharOrBetweenFn;

    //add the private destroy function (the char one works here as it frees a char pointer of any size)
    tokenRule.destroyAutomataDataFn = ARC_LexerTokenRule_DestroyCharAutomataDataFn;

    //return the created tokenRule
    return tokenRule;
}
//private function to free automataData stored as an ARC_String //private function to free automataData stored as an ARC_String
void ARC_LexerTokenRule_DestroyStringAutomataDataFn(void *automataData){ void ARC_LexerTokenRule_DestroyStringAutomataDataFn(void *automataData){
ARC_String_Destroy((ARC_String *)automataData); ARC_String_Destroy((ARC_String *)automataData);
@ -413,7 +451,27 @@ ARC_LexerTokenRule ARC_LexerTokenRule_CreateAndReturnMatchCharInStringRule(uint3
} }
void ARC_Lexer_InitBasicTokenRules(ARC_Lexer *lexer){ void ARC_Lexer_InitBasicTokenRules(ARC_Lexer *lexer){
ARC_Lexer_RegisterTokenRule(lexer, ARC_LexerTokenRule_CreateAndReturnMatchCharRule(ARC_LEXER_TOKEN_NULL , 0 )); //null
ARC_Lexer_RegisterTokenRule(lexer, ARC_LexerTokenRule_CreateAndReturnMatchCharRule(ARC_LEXER_TOKEN_NULL, 0));
//number
ARC_Lexer_RegisterTokenRule(lexer, ARC_LexerTokenRule_CreateAndReturnMatchCharOrBetween(ARC_LEXER_TOKEN_NUMBER, '0', '9'));
//alpha char
ARC_Lexer_RegisterTokenRule(lexer, ARC_LexerTokenRule_CreateAndReturnMatchCharOrBetween(ARC_LEXER_TOKEN_ALPHALOWERCHAR, 'a', 'z'));
ARC_Lexer_RegisterTokenRule(lexer, ARC_LexerTokenRule_CreateAndReturnMatchCharOrBetween(ARC_LEXER_TOKEN_ALPHAUPPERCHAR, 'A', 'Z'));
//whitespace
//TODO: fix this
ARC_String *whitespaceString;
ARC_String_CreateWithStrlen(&whitespaceString, " \t");
ARC_Lexer_RegisterTokenRule(lexer, ARC_LexerTokenRule_CreateAndReturnMatchCharInStringRule(ARC_LEXER_TOKEN_WHITESPACE, whitespaceString));
ARC_String_Destroy(whitespaceString);
//TEMP FIX:
//ARC_Lexer_RegisterTokenRule(lexer, ARC_LexerTokenRule_CreateAndReturnMatchCharRule(ARC_LEXER_TOKEN_WHITESPACE, ' '));
//single char tokens
ARC_Lexer_RegisterTokenRule(lexer, ARC_LexerTokenRule_CreateAndReturnMatchCharRule(ARC_LEXER_TOKEN_NEWLINE_ID , ARC_LEXER_TOKEN_NEWLINE_CHAR ));
ARC_Lexer_RegisterTokenRule(lexer, ARC_LexerTokenRule_CreateAndReturnMatchCharRule(ARC_LEXER_TOKEN_COLON_ID , ARC_LEXER_TOKEN_COLON_CHAR )); ARC_Lexer_RegisterTokenRule(lexer, ARC_LexerTokenRule_CreateAndReturnMatchCharRule(ARC_LEXER_TOKEN_COLON_ID , ARC_LEXER_TOKEN_COLON_CHAR ));
ARC_Lexer_RegisterTokenRule(lexer, ARC_LexerTokenRule_CreateAndReturnMatchCharRule(ARC_LEXER_TOKEN_SEMICOLON_ID , ARC_LEXER_TOKEN_SEMICOLON_CHAR )); ARC_Lexer_RegisterTokenRule(lexer, ARC_LexerTokenRule_CreateAndReturnMatchCharRule(ARC_LEXER_TOKEN_SEMICOLON_ID , ARC_LEXER_TOKEN_SEMICOLON_CHAR ));
ARC_Lexer_RegisterTokenRule(lexer, ARC_LexerTokenRule_CreateAndReturnMatchCharRule(ARC_LEXER_TOKEN_COMMA_ID , ARC_LEXER_TOKEN_COMMA_CHAR )); ARC_Lexer_RegisterTokenRule(lexer, ARC_LexerTokenRule_CreateAndReturnMatchCharRule(ARC_LEXER_TOKEN_COMMA_ID , ARC_LEXER_TOKEN_COMMA_CHAR ));

View file

@ -25,7 +25,7 @@ void ARC_Parser_Create(ARC_Parser **parser, ARC_Array *language, ARC_Parser_Init
(*parser)->language.size = language->size; (*parser)->language.size = language->size;
(*parser)->language.data = malloc(sizeof(ARC_ParserLanguageTag) * language->size); (*parser)->language.data = malloc(sizeof(ARC_ParserLanguageTag) * language->size);
memcpy((*parser)->language.data, language->data, language->size); memcpy((*parser)->language.data, language->data, sizeof(ARC_ParserLanguageTag) * language->size);
} }
//create the lexer //create the lexer
@ -103,6 +103,12 @@ void ARC_Parser_ParseTag(ARC_Parser *parser, uint32_t *lexerIndex, uint32_t tagI
} }
void ARC_Parser_Parse(ARC_Parser *parser, ARC_String **data){ void ARC_Parser_Parse(ARC_Parser *parser, ARC_String **data){
//make sure the parser has a language
if(parser->language.size == 0){
ARC_DEBUG_LOG_ERROR("ARC_Parser_Parse(parser, data), no parser language defined");
return;
}
//lex the subdata //lex the subdata
ARC_Lexer_LexString(parser->lexer, data); ARC_Lexer_LexString(parser->lexer, data);
if(arc_errno){ if(arc_errno){
@ -110,11 +116,12 @@ void ARC_Parser_Parse(ARC_Parser *parser, ARC_String **data){
return; return;
} }
//set the lexer index to start and get the first tag
uint32_t lexerIndex = 0; uint32_t lexerIndex = 0;
ARC_ParserLanguageTag startTag = ((ARC_ParserLanguageTag *)parser->language.data)[0]; ARC_ParserLanguageTag *startTag = parser->language.data;
//recursivly parse from the inital start tag //recursivly parse from the inital start tag
ARC_Parser_ParseTag(parser, &lexerIndex, startTag.tagId); ARC_Parser_ParseTag(parser, &lexerIndex, startTag->tagId);
if(arc_errno){ if(arc_errno){
ARC_DEBUG_LOG_ERROR("ARC_Parser_Parse(parser, data), could not parse the given data"); ARC_DEBUG_LOG_ERROR("ARC_Parser_Parse(parser, data), could not parse the given data");
return; return;

View file

@ -2,12 +2,13 @@
#include "arc/std/errno.h" #include "arc/std/errno.h"
#include "arc/std/parser.h" #include "arc/std/parser.h"
#define LAMBDA 0 //TODO: fix lambda
#define CHAR 1 #define LAMBDA 20
#define NUM 2 #define CHAR ARC_LEXER_TOKEN_ALPHALOWERCHAR
#define CHAR_OR_NUM 3 #define NUM ARC_LEXER_TOKEN_NUMBER
#define VARIABLE_NAME 4 #define CHAR_OR_NUM 23
#define VARIABLE 5 #define VARIABLE_NAME 24
#define VARIABLE 25
void TEST_Parser_InitLexerRulesFn(ARC_Lexer *lexer){ void TEST_Parser_InitLexerRulesFn(ARC_Lexer *lexer){
ARC_Lexer_InitBasicTokenRules(lexer); ARC_Lexer_InitBasicTokenRules(lexer);
@ -21,9 +22,9 @@ ARC_TEST(Parser_Init){
uint32_t *variableTokensOrTags[] = { (uint32_t[]){ 2, CHAR, VARIABLE_NAME } }; uint32_t *variableTokensOrTags[] = { (uint32_t[]){ 2, CHAR, VARIABLE_NAME } };
ARC_ParserLanguageTag testTags[3] = { ARC_ParserLanguageTag testTags[3] = {
{ {
CHAR_OR_NUM, //tagId VARIABLE, //tagId
charOrNumTokens, //tokensOrTags variableTokensOrTags, //tokensOrTags
2 //tokenOrTagsSize 1 //tokenOrTagsSize
}, },
{ {
VARIABLE_NAME, //tagId VARIABLE_NAME, //tagId
@ -31,9 +32,9 @@ ARC_TEST(Parser_Init){
2 //tokenOrTagsSize 2 //tokenOrTagsSize
}, },
{ {
VARIABLE, //tagId CHAR_OR_NUM, //tagId
variableTokensOrTags, //tokensOrTags charOrNumTokens, //tokensOrTags
1 //tokenOrTagsSize 2 //tokenOrTagsSize
} }
}; };
@ -57,9 +58,9 @@ ARC_TEST(Parser_Basic_Parse){
uint32_t *variableTokensOrTags[] = { (uint32_t[]){ 2, CHAR, VARIABLE_NAME } }; uint32_t *variableTokensOrTags[] = { (uint32_t[]){ 2, CHAR, VARIABLE_NAME } };
ARC_ParserLanguageTag testTags[3] = { ARC_ParserLanguageTag testTags[3] = {
{ {
CHAR_OR_NUM, //tagId VARIABLE, //tagId
charOrNumTokens, //tokensOrTags variableTokensOrTags, //tokensOrTags
2 //tokenOrTagsSize 1 //tokenOrTagsSize
}, },
{ {
VARIABLE_NAME, //tagId VARIABLE_NAME, //tagId
@ -67,9 +68,9 @@ ARC_TEST(Parser_Basic_Parse){
2 //tokenOrTagsSize 2 //tokenOrTagsSize
}, },
{ {
VARIABLE, //tagId CHAR_OR_NUM, //tagId
variableTokensOrTags, //tokensOrTags charOrNumTokens, //tokensOrTags
1 //tokenOrTagsSize 2 //tokenOrTagsSize
} }
}; };
@ -97,9 +98,9 @@ ARC_TEST(Parser_Basic_ParseError){
uint32_t *variableTokensOrTags[] = { (uint32_t[]){ 2, CHAR, VARIABLE_NAME } }; uint32_t *variableTokensOrTags[] = { (uint32_t[]){ 2, CHAR, VARIABLE_NAME } };
ARC_ParserLanguageTag testTags[3] = { ARC_ParserLanguageTag testTags[3] = {
{ {
CHAR_OR_NUM, //tagId VARIABLE, //tagId
charOrNumTokens, //tokensOrTags variableTokensOrTags, //tokensOrTags
2 //tokenOrTagsSize 1 //tokenOrTagsSize
}, },
{ {
VARIABLE_NAME, //tagId VARIABLE_NAME, //tagId
@ -107,9 +108,9 @@ ARC_TEST(Parser_Basic_ParseError){
2 //tokenOrTagsSize 2 //tokenOrTagsSize
}, },
{ {
VARIABLE, //tagId CHAR_OR_NUM, //tagId
variableTokensOrTags, //tokensOrTags charOrNumTokens, //tokensOrTags
1 //tokenOrTagsSize 2 //tokenOrTagsSize
} }
}; };