/*
 * archeus/src/std/parser/parserlang.c
 *
 * 398 lines
 * 19 KiB
 * C
 */

#include "arc/std/parser/parserlang.h"
#include "arc/std/bool.h"
#include "arc/std/errno.h"
#include "arc/std/lexer.h"
#include "arc/std/parser.h"
#include "arc/std/string.h"
#include "arc/std/vector.h"
#include <stddef.h>
#include <stdlib.h>
//private function that registers every token rule the parser language needs on the given lexer
//the registration order is kept exactly as it was, whether the lexer treats that order as a priority is not visible from this file
void ARC_ParserLang_InitLexerRulesFn(ARC_Lexer *lexer){
    //scratch string shared by the string based rules, it is destroyed straight after each registration
    //(presumably the rule keeps its own copy of the string, confirm against the lexer)
    ARC_String *ruleString;

    //null
    ARC_Lexer_RegisterTokenRule(lexer, ARC_LexerTokenRule_CreateAndReturnMatchCharRule(ARC_PARSERLANG_TOKEN_NULL, 0));

    //number
    ARC_Lexer_RegisterTokenRule(lexer, ARC_LexerTokenRule_CreateAndReturnMatchCharOrBetween(ARC_PARSERLANG_TOKEN_NUMBER, '0', '9'));

    //alpha chars, lower case then upper case
    ARC_Lexer_RegisterTokenRule(lexer, ARC_LexerTokenRule_CreateAndReturnMatchCharOrBetween(ARC_PARSERLANG_TOKEN_ALPHA_LOWER_CHAR, 'a', 'z'));
    ARC_Lexer_RegisterTokenRule(lexer, ARC_LexerTokenRule_CreateAndReturnMatchCharOrBetween(ARC_PARSERLANG_TOKEN_ALPHA_UPPER_CHAR, 'A', 'Z'));

    //whitespace, a space or a tab
    //NOTE(review): this rule is registered as ARC_LEXER_TOKEN_WHITESPACE while the grammar in ARC_ParserLang_CreateAsParser matches ARC_PARSERLANG_TOKEN_WHITESPACE, confirm both ids are the same value
    ARC_String_CreateWithStrlen(&ruleString, " \t");
    ARC_Lexer_RegisterTokenRule(lexer, ARC_LexerTokenRule_CreateAndReturnMatchCharInStringRule(ARC_LEXER_TOKEN_WHITESPACE, ruleString));
    ARC_String_Destroy(ruleString);

    //single char tokens
    ARC_Lexer_RegisterTokenRule(lexer, ARC_LexerTokenRule_CreateAndReturnMatchCharRule(ARC_PARSERLANG_TOKEN_NEWLINE_ID, ARC_PARSERLANG_TOKEN_NEWLINE_CHAR));
    ARC_Lexer_RegisterTokenRule(lexer, ARC_LexerTokenRule_CreateAndReturnMatchCharRule(ARC_PARSERLANG_TOKEN_LESS_THAN_ID, ARC_PARSERLANG_TOKEN_LESS_THAN_CHAR));
    ARC_Lexer_RegisterTokenRule(lexer, ARC_LexerTokenRule_CreateAndReturnMatchCharRule(ARC_PARSERLANG_TOKEN_GREATER_THAN_ID, ARC_PARSERLANG_TOKEN_GREATER_THAN_CHAR));
    ARC_Lexer_RegisterTokenRule(lexer, ARC_LexerTokenRule_CreateAndReturnMatchCharRule(ARC_PARSERLANG_TOKEN_OR_ID, ARC_PARSERLANG_TOKEN_OR_CHAR));
    ARC_Lexer_RegisterTokenRule(lexer, ARC_LexerTokenRule_CreateAndReturnMatchCharRule(ARC_PARSERLANG_TOKEN_UNDERSCORE_ID, ARC_PARSERLANG_TOKEN_UNDERSCORE_CHAR));

    //arrow, the only multi character token
    ARC_String_CreateWithStrlen(&ruleString, ARC_PARSERLANG_TOKEN_ARROW_CSTRING);
    ARC_Lexer_RegisterTokenRule(lexer, ARC_LexerTokenRule_CreateAndReturnMatchStringRule(ARC_PARSERLANG_TOKEN_ARROW_ID, ruleString));
    ARC_String_Destroy(ruleString);
}
//vector destroy callback for elements that are themselves ARC_Vectors
//data: the element to destroy, it is handed to ARC_Vector_Destroy as an ARC_Vector
void ARC_ParserLang_VectorDestroyVectorFn(void *data){
    ARC_Vector_Destroy((ARC_Vector *)data);
}
//vector destroy callback for heap allocated uint32_t elements
//data: the element to release, it is passed straight to free
void ARC_ParserLang_VectorDestroyUInt32Fn(void *data){
    free(data);
}
//vector destroy callback for heap allocated ARC_ParserTag elements
//frees each or array, then the array holding them, then the tag itself
void ARC_ParserLang_VectorDestroyParserTagFn(void *data){
    ARC_ParserTag *tag = (ARC_ParserTag *)data;

    //release the or arrays one at a time, first to last
    uint32_t orIndex = 0;
    while(orIndex < tag->tokensOrTagsSize){
        free(tag->tokensOrTags[orIndex]);
        orIndex++;
    }

    //release the array that held the or arrays
    free(tag->tokensOrTags);

    //release the tag
    free(tag);
}
//appends the text of tagToken onto *data, recursing through its children when it has no token of its own
//data:     the string being built, it is appended to in place
//tagToken: the node to read, either a token or a collection of child tag tokens
void ARC_ParserLangParsedData_RecurseStringAdd(ARC_String **data, ARC_ParserTagToken *tagToken){
    //no token on this node, so gather the text from its children in order
    if(tagToken->token == NULL){
        //TODO: probs don't need this
        if(tagToken->tagTokens == NULL){
            return;
        }

        uint32_t childIndex = 0;
        while(childIndex < ARC_Vector_GetSize(tagToken->tagTokens)){
            ARC_ParserLangParsedData_RecurseStringAdd(data, (ARC_ParserTagToken *)ARC_Vector_Get(tagToken->tagTokens, childIndex));
            childIndex++;
        }

        return;
    }

    //the less than, greater than, and underscore rules are added manually as a single character
    //(the less than and greater than rules do not have a string), every other token appends its own data
    char singleChar;
    if(tagToken->token->rule == ARC_PARSERLANG_TOKEN_LESS_THAN_ID){
        singleChar = ARC_PARSERLANG_TOKEN_LESS_THAN_CHAR;
    }
    else if(tagToken->token->rule == ARC_PARSERLANG_TOKEN_GREATER_THAN_ID){
        singleChar = ARC_PARSERLANG_TOKEN_GREATER_THAN_CHAR;
    }
    else if(tagToken->token->rule == ARC_PARSERLANG_TOKEN_UNDERSCORE_ID){
        singleChar = ARC_PARSERLANG_TOKEN_UNDERSCORE_CHAR;
    }
    else {
        ARC_String_Append(data, tagToken->token->data);
        return;
    }

    ARC_String_AppendCString(data, &singleChar, 1);
}
//creates *tagString as an empty string and fills it with the text of every child of tagToken
//tagString: out parameter for the newly created string
//tagToken:  the node whose children are flattened into the string
void ARC_ParserLangParsedData_CreateTagString(ARC_String **tagString, ARC_ParserTagToken *tagToken){
    //start from an empty string
    ARC_String_Create(tagString, NULL, 0);

    //append each child in order
    uint32_t childIndex = 0;
    while(childIndex < ARC_Vector_GetSize(tagToken->tagTokens)){
        ARC_ParserLangParsedData_RecurseStringAdd(tagString, (ARC_ParserTagToken *)ARC_Vector_Get(tagToken->tagTokens, childIndex));
        childIndex++;
    }
}
/*
    <argument> -> <tagOrConstant> WHITESPACE <argument> | <tagOrConstant>

    walks an <argument> node and adds the id of every tag/constant found to orTokensOrTags
    orTokensOrTags: vector that receives one heap allocated uint32_t id per tag/constant
    tagToken:       the <argument> node to walk
    getStringIdFn:  callback used to turn the tag/constant text into its id
*/
void ARC_ParserLangParsedData_GetArgumentTag(ARC_Vector *orTokensOrTags, ARC_ParserTagToken *tagToken, ARC_Parser_GetStringIdFn *getStringIdFn){
    for(uint32_t childIndex = 0; childIndex < ARC_Vector_GetSize(tagToken->tagTokens); childIndex++){
        ARC_ParserTagToken *child = (ARC_ParserTagToken *)ARC_Vector_Get(tagToken->tagTokens, childIndex);

        //a nested argument holds the rest of the list, recurse to check all the arguments
        if(child->id == ARC_PARSERLANG_ARGUMENT){
            ARC_ParserLangParsedData_GetArgumentTag(orTokensOrTags, child, getStringIdFn);
            continue;
        }

        //anything that is not a tag/constant should only be whitespace, so skip it
        if(child->id != ARC_PARSERLANG_TAG_OR_CONSTANT){
            continue;
        }

        //build the text of the tag/constant
        ARC_String *name = NULL;
        ARC_String_Create(&name, NULL, 0);
        ARC_ParserLangParsedData_RecurseStringAdd(&name, child);

        //look up the id and store a heap copy of it in the matching or vector
        //NOTE(review): the malloc result is used without a NULL check
        uint32_t *id = (uint32_t *)malloc(sizeof(uint32_t));
        *id = (*getStringIdFn)(name);
        ARC_Vector_Add(orTokensOrTags, (void *)id);

        //the text is only needed for the lookup
        ARC_String_Destroy(name);
    }
}
/*
    <arguments> -> <argument> WHITESPACE OR WHITESPACE <arguments> | <argument>

    walks an <arguments> node, filling the last vector in tokensOrTags and starting a new one at every OR
    tokensOrTags:  vector of vectors, each inner vector holds the ids of one or section
    tagToken:      the <arguments> node to walk
    getStringIdFn: callback forwarded to ARC_ParserLangParsedData_GetArgumentTag
*/
void ARC_ParserLangParsedData_GetArgumentsTag(ARC_Vector *tokensOrTags, ARC_ParserTagToken *tagToken, ARC_Parser_GetStringIdFn *getStringIdFn){
    for(uint32_t childIndex = 0; childIndex < ARC_Vector_GetSize(tagToken->tagTokens); childIndex++){
        ARC_ParserTagToken *child = (ARC_ParserTagToken *)ARC_Vector_Get(tagToken->tagTokens, childIndex);

        //destroy callback given to any or vector created for this child
        ARC_Vector_DestroyDataFn destroyUint32Fn = ARC_ParserLang_VectorDestroyUInt32Fn;

        if(child->id == ARC_PARSERLANG_ARGUMENT){
            //the tag/constants of this argument belong to the last vector within tokensOrTags
            uint32_t orCount = ARC_Vector_GetSize(tokensOrTags);
            ARC_Vector *lastOrTokensOrTags = (ARC_Vector *)ARC_Vector_Get(tokensOrTags, orCount - 1);
            ARC_ParserLangParsedData_GetArgumentTag(lastOrTokensOrTags, child, getStringIdFn);
        }
        else if(child->id == ARC_PARSERLANG_TOKEN_OR_ID){
            //an or statement starts a new vector that the following argument will fill
            ARC_Vector *newOrTokensOrTags = NULL;
            ARC_Vector_Create(&newOrTokensOrTags, NULL, &destroyUint32Fn);
            ARC_Vector_Add(tokensOrTags, (void *)newOrTokensOrTags);
        }
        else if(child->id == ARC_PARSERLANG_ARGUMENTS){
            //recurse to check the rest of the arguments
            ARC_ParserLangParsedData_GetArgumentsTag(tokensOrTags, child, getStringIdFn);
        }

        //anything else should only be whitespace and is skipped
    }
}
//creates a heap allocated ARC_ParserTag from a <body> node
//tag:           out parameter that receives the new tag
//tagToken:      the <body> node, child 0 is read as the tag id and child 4 as the arguments
//getStringIdFn: callback used to turn tag and constant names into ids
//NOTE(review): none of the malloc results in this function are checked for NULL
void ARC_ParserLangParsedData_CreateBodyTag(ARC_ParserTag **tag, ARC_ParserTagToken *tagToken, ARC_Parser_GetStringIdFn *getStringIdFn){
    //the tag that the body is stored in
    ARC_ParserTag *newTag = (ARC_ParserTag *)malloc(sizeof(ARC_ParserTag));

    /* ~ Tag Id ~ */

    //the first child is always the tag id, the rule is already validated so it is not checked for NULL
    ARC_ParserTagToken *nameToken = (ARC_ParserTagToken *)ARC_Vector_Get(tagToken->tagTokens, 0);

    //the name is only needed as a string long enough to look up its id
    ARC_String *name;
    ARC_ParserLangParsedData_CreateTagString(&name, nameToken);
    newTag->tagId = (*getStringIdFn)(name);
    ARC_String_Destroy(name);

    /* ~ Tokens Or Tags Array ~ */

    //callbacks used to clean up the temporary vectors
    ARC_Vector_DestroyDataFn destroyVectorFn = ARC_ParserLang_VectorDestroyVectorFn;
    ARC_Vector_DestroyDataFn destroyUint32Fn = ARC_ParserLang_VectorDestroyUInt32Fn;

    //outer vector, one inner vector per or section
    ARC_Vector *orSections;
    ARC_Vector_Create(&orSections, NULL, &destroyVectorFn);

    //there is always at least one or section, so add it up front
    ARC_Vector *orSection;
    ARC_Vector_Create(&orSection, NULL, &destroyUint32Fn);
    ARC_Vector_Add(orSections, (void *)orSection);

    //skipping the whitespace and arrow children, the arguments are at index 4
    ARC_ParserTagToken *argumentsToken = (ARC_ParserTagToken *)ARC_Vector_Get(tagToken->tagTokens, 4);
    ARC_ParserLangParsedData_GetArgumentsTag(orSections, argumentsToken, getStringIdFn);

    //size the array of or sections to match the vector
    newTag->tokensOrTagsSize = ARC_Vector_GetSize(orSections);
    newTag->tokensOrTags = (uint32_t **)malloc(sizeof(uint32_t *) * newTag->tokensOrTagsSize);

    //flatten each or section into a plain array
    for(uint32_t sectionIndex = 0; sectionIndex < newTag->tokensOrTagsSize; sectionIndex++){
        orSection = (ARC_Vector *)ARC_Vector_Get(orSections, sectionIndex);
        uint32_t idCount = ARC_Vector_GetSize(orSection);

        //slot 0 stores the number of ids, the ids themselves follow from slot 1
        uint32_t *flatSection = (uint32_t *)malloc(sizeof(uint32_t) * (idCount + 1));
        flatSection[0] = idCount;

        for(uint32_t idIndex = 0; idIndex < idCount; idIndex++){
            flatSection[idIndex + 1] = *(uint32_t *)ARC_Vector_Get(orSection, idIndex);
        }

        newTag->tokensOrTags[sectionIndex] = flatSection;
    }

    //hand the finished tag back
    *tag = newTag;

    //the vectors were only scratch space, destroying the outer one runs the destroy callbacks on the inner ones
    ARC_Vector_Destroy(orSections);
}
//walks a <line> node, adding one ARC_ParserTag to tags for every <body> found
//tags:          vector that receives the created tags
//tagToken:      the <line> node to walk
//getStringIdFn: callback forwarded to ARC_ParserLangParsedData_CreateBodyTag
void ARC_ParserLangParsedData_RunLineTag(ARC_Vector *tags, ARC_ParserTagToken *tagToken, ARC_Parser_GetStringIdFn *getStringIdFn){
    for(uint32_t childIndex = 0; childIndex < ARC_Vector_GetSize(tagToken->tagTokens); childIndex++){
        ARC_ParserTagToken *child = (ARC_ParserTagToken *)ARC_Vector_Get(tagToken->tagTokens, childIndex);

        //a nested line holds the following lines, recurse to run them
        if(child->id == ARC_PARSERLANG_LINE){
            ARC_ParserLangParsedData_RunLineTag(tags, child, getStringIdFn);
            continue;
        }

        //a body becomes a tag, every other child is ignored
        if(child->id == ARC_PARSERLANG_BODY){
            ARC_ParserTag *bodyTag;
            ARC_ParserLangParsedData_CreateBodyTag(&bodyTag, child, getStringIdFn);
            ARC_Vector_Add(tags, (void *)bodyTag);
        }
    }
}
//private function to create the saved data for the language
//data:       out parameter, receives a vector of ARC_ParserTag pointers built from the parsed data
//parsedData: the parsed language, when NULL arc_errno is set to ARC_ERRNO_NULL and *data is left untouched
//userData:   expected to point at an ARC_Parser_GetStringIdFn, it is used to turn tag and constant names into ids
void ARC_ParserLang_CreateDataFn(void **data, ARC_ParserTagToken *parsedData, void *userData){
    ARC_Parser_GetStringIdFn *getStringIdFn = (ARC_Parser_GetStringIdFn *)userData;

    //make sure there is parsed data to use
    if(parsedData == NULL){
        arc_errno = ARC_ERRNO_NULL;
        ARC_DEBUG_LOG_ERROR("ARC_ParserLang_CreateDataFn(void **data, ARC_ParserTagToken *parsedData, void *userData), parsedData was NULL when passed in");
        return;
    }

    //function callback to cleanup added tags
    ARC_Vector_DestroyDataFn destroyParserTagFn = ARC_ParserLang_VectorDestroyParserTagFn;

    //I don't see a reason to have a comparison function right now. this might change in the future
    ARC_Vector_Create((ARC_Vector **)data, NULL, &destroyParserTagFn);

    //check if there are any tags (these could be empty if a file is blank)
    if(parsedData->tagTokens == NULL){
        return;
    }

    //the callback is called for every tag and constant while loading, so it has to exist before recursing
    //on failure the empty vector created above is left in *data and the error is reported through arc_errno
    if(getStringIdFn == NULL || *getStringIdFn == NULL){
        arc_errno = ARC_ERRNO_NULL;
        ARC_DEBUG_LOG_ERROR("ARC_ParserLang_CreateDataFn(void **data, ARC_ParserTagToken *parsedData, void *userData), userData did not hold a getStringIdFn when passed in");
        return;
    }

    //load the language into a vector recursively
    ARC_ParserLangParsedData_RunLineTag(*((ARC_Vector **)data), parsedData, getStringIdFn);
}
//private function to destroy the saved data for the language
//data:     the vector created by ARC_ParserLang_CreateDataFn, it may be NULL
//clear:    when ARC_False (full destroy) the vector is destroyed, otherwise it is left alone
//userData: the heap copy of the getStringIdFn, it is freed on every call
//NOTE(review): userData is freed even when clear is not ARC_False, if the parser reuses userData after a clear it would be left dangling, confirm against ARC_Parser
void ARC_ParserLang_DestroyDataFn(void *data, ARC_Bool clear, void *userData){
    //free does nothing when given NULL, so no check is needed
    free(userData);

    //the vector is only destroyed on a full destroy, and only when there is one
    if(clear != ARC_False || data == NULL){
        return;
    }

    ARC_Vector_Destroy((ARC_Vector *)data);
}
//creates the parser that reads the parser language itself
//parser:        out parameter handed to ARC_Parser_Create, it is set to NULL when the callback copy cannot be allocated
//getStringIdFn: callback that maps a tag or constant name to its id, a heap copy of it is passed to the parser as userData
void ARC_ParserLang_CreateAsParser(ARC_Parser **parser, ARC_Parser_GetStringIdFn getStringIdFn){
    //heap copy of the callback, this will be cleaned up by the destroyDataFn
    ARC_Parser_GetStringIdFn *newGetStringIdFn = (ARC_Parser_GetStringIdFn *)malloc(sizeof(ARC_Parser_GetStringIdFn));
    if(newGetStringIdFn == NULL){
        //nothing can be built without the callback copy, report the failure through the out parameter instead of writing through NULL
        if(parser != NULL){
            *parser = NULL;
        }
        return;
    }
    *newGetStringIdFn = getStringIdFn;

    //each or section below is an array whose first value is the number of ids that follow it

    //<line> -> <body> NEWLINE <line> | <body> | NEWLINE <line> | LAMBDA
    uint32_t *line[] = {
        (uint32_t[]){ 3, ARC_PARSERLANG_BODY, ARC_PARSERLANG_TOKEN_NEWLINE_ID, ARC_PARSERLANG_LINE },
        (uint32_t[]){ 1, ARC_PARSERLANG_BODY },
        (uint32_t[]){ 2, ARC_PARSERLANG_TOKEN_NEWLINE_ID, ARC_PARSERLANG_LINE },
        (uint32_t[]){ 1, ARC_PARSERLANG_LAMBDA }
    };

    //<body> -> <tag> <whitespace> ARROW <whitespace> <arguments>
    uint32_t *body[] = {
        (uint32_t[]){ 5, ARC_PARSERLANG_TAG, ARC_PARSERLANG_WHITESPACE, ARC_PARSERLANG_TOKEN_ARROW_ID, ARC_PARSERLANG_WHITESPACE, ARC_PARSERLANG_ARGUMENTS }
    };

    //<arguments> -> <argument> <whitespace> OR <whitespace> <arguments> | <argument>
    uint32_t *arguments[] = {
        (uint32_t[]){ 5, ARC_PARSERLANG_ARGUMENT, ARC_PARSERLANG_WHITESPACE, ARC_PARSERLANG_TOKEN_OR_ID, ARC_PARSERLANG_WHITESPACE, ARC_PARSERLANG_ARGUMENTS },
        (uint32_t[]){ 1, ARC_PARSERLANG_ARGUMENT }
    };

    //<argument> -> <tagOrConstant> <whitespace> <argument> | <tagOrConstant>
    uint32_t *argument[] = {
        (uint32_t[]){ 3, ARC_PARSERLANG_TAG_OR_CONSTANT, ARC_PARSERLANG_WHITESPACE, ARC_PARSERLANG_ARGUMENT },
        (uint32_t[]){ 1, ARC_PARSERLANG_TAG_OR_CONSTANT }
    };

    //<tagOrConstant> -> <tag> | <constant>
    uint32_t *tagOrConstant[] = {
        (uint32_t[]){ 1, ARC_PARSERLANG_TAG },
        (uint32_t[]){ 1, ARC_PARSERLANG_CONSTANT }
    };

    //<constant> -> ALPHA_UPPER_CHAR <constantBody>
    uint32_t *constant[] = {
        (uint32_t[]){ 2, ARC_PARSERLANG_TOKEN_ALPHA_UPPER_CHAR, ARC_PARSERLANG_CONSTANT_BODY }
    };

    //<constantBody> -> <constantChar> <constantBody> | LAMBDA
    uint32_t *constantBody[] = {
        (uint32_t[]){ 2, ARC_PARSERLANG_CONSTANT_CHAR, ARC_PARSERLANG_CONSTANT_BODY },
        (uint32_t[]){ 1, ARC_PARSERLANG_LAMBDA }
    };

    //<constantChar> -> ALPHA_UPPER_CHAR | UNDERSCORE
    uint32_t *constantChar[] = {
        (uint32_t[]){ 1, ARC_PARSERLANG_TOKEN_ALPHA_UPPER_CHAR },
        (uint32_t[]){ 1, ARC_PARSERLANG_TOKEN_UNDERSCORE_ID }
    };

    //<tag> -> LESS_THAN <variable> GREATER_THAN
    uint32_t *tag[] = {
        (uint32_t[]){ 3, ARC_PARSERLANG_TOKEN_LESS_THAN_ID, ARC_PARSERLANG_VARIABLE, ARC_PARSERLANG_TOKEN_GREATER_THAN_ID }
    };

    //<variable> -> <alphaChar> <variableBody> | UNDERSCORE <variableBody>
    uint32_t *variable[] = {
        (uint32_t[]){ 2, ARC_PARSERLANG_ALPHA_CHAR, ARC_PARSERLANG_VARIABLE_BODY },
        (uint32_t[]){ 2, ARC_PARSERLANG_TOKEN_UNDERSCORE_ID, ARC_PARSERLANG_VARIABLE_BODY }
    };

    //<variableBody> -> <variableChar> <variableBody> | LAMBDA
    uint32_t *variableBody[] = {
        (uint32_t[]){ 2, ARC_PARSERLANG_VARIABLE_CHAR, ARC_PARSERLANG_VARIABLE_BODY },
        (uint32_t[]){ 1, ARC_PARSERLANG_LAMBDA }
    };

    //<variableChar> -> <alphaChar> | NUMBER | UNDERSCORE
    uint32_t *variableChar[] = {
        (uint32_t[]){ 1, ARC_PARSERLANG_ALPHA_CHAR },
        (uint32_t[]){ 1, ARC_PARSERLANG_TOKEN_NUMBER },
        (uint32_t[]){ 1, ARC_PARSERLANG_TOKEN_UNDERSCORE_ID }
    };

    //<alphaChar> -> ALPHA_LOWER_CHAR | ALPHA_UPPER_CHAR
    uint32_t *alphaChar[] = {
        (uint32_t[]){ 1, ARC_PARSERLANG_TOKEN_ALPHA_LOWER_CHAR },
        (uint32_t[]){ 1, ARC_PARSERLANG_TOKEN_ALPHA_UPPER_CHAR }
    };

    //<whitespace> -> WHITESPACE <whitespace> | WHITESPACE
    uint32_t *whitespace[] = {
        (uint32_t[]){ 2, ARC_PARSERLANG_TOKEN_WHITESPACE, ARC_PARSERLANG_WHITESPACE },
        (uint32_t[]){ 1, ARC_PARSERLANG_TOKEN_WHITESPACE }
    };

    //every tag of the language: its id, its or sections, and how many or sections it has
    ARC_ParserTag parserLangTags[] = {
        { ARC_PARSERLANG_LINE           , line         , 4 },
        { ARC_PARSERLANG_BODY           , body         , 1 },
        { ARC_PARSERLANG_ARGUMENTS      , arguments    , 2 },
        { ARC_PARSERLANG_ARGUMENT       , argument     , 2 },
        { ARC_PARSERLANG_TAG_OR_CONSTANT, tagOrConstant, 2 },
        { ARC_PARSERLANG_CONSTANT       , constant     , 1 },
        { ARC_PARSERLANG_CONSTANT_BODY  , constantBody , 2 },
        { ARC_PARSERLANG_CONSTANT_CHAR  , constantChar , 2 },
        { ARC_PARSERLANG_TAG            , tag          , 1 },
        { ARC_PARSERLANG_VARIABLE       , variable     , 2 },
        { ARC_PARSERLANG_VARIABLE_BODY  , variableBody , 2 },
        { ARC_PARSERLANG_VARIABLE_CHAR  , variableChar , 3 },
        { ARC_PARSERLANG_ALPHA_CHAR     , alphaChar    , 2 },
        { ARC_PARSERLANG_WHITESPACE     , whitespace   , 2 }
    };

    //the size is taken from the table itself so it cannot drift when a tag is added or removed
    ARC_Array parserLanguageArray = {
        sizeof(parserLangTags) / sizeof(parserLangTags[0]), //size
        parserLangTags                                      //data
    };

    ARC_ParserData_CreateFn  createDataFn  = ARC_ParserLang_CreateDataFn;
    ARC_ParserData_DestroyFn destroyDataFn = ARC_ParserLang_DestroyDataFn;

    //create the parserlang
    //NOTE(review): everything above except newGetStringIdFn lives on the stack, so ARC_Parser_Create is presumed to copy what it keeps, confirm against ARC_Parser
    ARC_Parser_Create(parser, &parserLanguageArray, ARC_ParserLang_InitLexerRulesFn, &createDataFn, &destroyDataFn, (void *)newGetStringIdFn);
}