archeus/src/std/parser.c

#include "arc/std/parser.h"
#include "arc/std/bool.h"
#include "arc/std/errno.h"
#include "arc/std/lexer.h"
//#include "arc/std/vector.h"
#include <stdint.h>
#include <stdlib.h>
#include <string.h>

//the state of a parser: the language it matches against and the lexer that tokenizes the input
struct ARC_Parser {
    //array of ARC_ParserLanguageTag allocated by ARC_Parser_Create, ARC_Parser_Parse starts from the first tag in it
    ARC_Array language;

    //lexer created in ARC_Parser_Create and destroyed in ARC_Parser_Destroy
    ARC_Lexer *lexer;
};

void ARC_Parser_Create(ARC_Parser **parser, ARC_Array *language, ARC_Parser_InitLexerRulesFn initLexerRulesFn){
    *parser = (ARC_Parser *)malloc(sizeof(ARC_Parser));

    //set the language size to 0 and data to NULL in case the language is NULL
    (*parser)->language.size = 0;
    (*parser)->language.data = NULL;

    //if the language exists, copy the language
    if(language != NULL){
        (*parser)->language.size = language->size;
        (*parser)->language.data = malloc(sizeof(ARC_ParserLanguageTag) * language->size);

        memcpy((*parser)->language.data, language->data, sizeof(ARC_ParserLanguageTag) * language->size);
        for(uint32_t index = 0; index < language->size; index++){
            ARC_ParserLanguageTag *languageTag = ((ARC_ParserLanguageTag *)language->data) + index;
            ARC_ParserLanguageTag *currentTag  = ((ARC_ParserLanguageTag *)(*parser)->language.data) + index;

            //copy the currentTag's tokenOrTags
            memcpy(currentTag->tokensOrTags, languageTag->tokensOrTags,  languageTag->tokensOrTagsSize);
        }
    }

    //create the lexer
    ARC_Lexer_Create(&((*parser)->lexer));

    //register instructions to the lexer
    initLexerRulesFn(((*parser)->lexer));
}

void ARC_Parser_CreateFromString(ARC_Parser **parser, ARC_String *languageString, ARC_Parser_InitLexerRulesFn initLexerRulesFn){
}

void ARC_Parser_Destroy(ARC_Parser *parser){
    //clear all the copied token or tags from memory
    for(uint32_t index = 0; index < parser->language.size; index++){
        ARC_ParserLanguageTag *currentTag  = ((ARC_ParserLanguageTag *)parser->language.data) + index;
        free(currentTag->tokensOrTags);
    }

    //clear the copied language from memory
    free(parser->language.data);

    ARC_Lexer_Destroy(parser->lexer);

    free(parser);
}

//private recursive function to parse a tag
//
//finds the tag with the given tagId in the parser's language and tries each of its or sections in order
//against the lexed tokens, starting at *lexerIndex, the first or section that matches wins
//
//parser     the parser holding the language and the lexer with the lexed tokens
//lexerIndex in: index of the first token to match, out: index one past the last token the tag consumed, it
//           is only written when the tag matched so a failed match leaves it untouched
//tagId      the id of the tag to match
//
//returns ARC_True if an or section matched, otherwise ARC_False, when no tag with tagId exists arc_errno is
//set to ARC_ERRNO_NULL
ARC_Bool ARC_Parser_ParseTag(ARC_Parser *parser, uint32_t *lexerIndex, uint32_t tagId){
    //get the current tag
    ARC_ParserLanguageTag *tag = NULL;
    for(uint32_t index = 0; index < parser->language.size; index++){
        ARC_ParserLanguageTag *foundTag = ((ARC_ParserLanguageTag *)parser->language.data) + index;
        if(foundTag->tagId == tagId){
            tag = foundTag;
            break;
        }
    }

    //if the tag was not found can't do much, so throw an error
    if(tag == NULL){
        arc_errno = ARC_ERRNO_NULL;
        ARC_DEBUG_LOG_ERROR_WITH_VARIABLES("ARC_Parser_ParseTag(parser, subdata, tagId), could not find tag with id: %u", tagId);
        return ARC_False;
    }

    //loop through each or section of the tags and tokens
    for(uint32_t orIndex = 0; orIndex < tag->tokensOrTagsSize; orIndex++){
        //every or section starts matching from where this tag started, so a failed section consumes nothing
        uint32_t lexerCheckIndex = *lexerIndex;
        ARC_Bool foundRule = ARC_True;

        //the first entry of an or section is the number of tokens or tags that follow it
        for(uint32_t tokenOrTagIndex = 1; tokenOrTagIndex < tag->tokensOrTags[orIndex][0] + 1; tokenOrTagIndex++){
            uint32_t tokenOrTag = tag->tokensOrTags[orIndex][tokenOrTagIndex];

            //lambda matches without consuming a token, so the or section is done
            if(tokenOrTag == ARC_PARSER_TAG_LAMBDA){
                break;
            }

            //if the value isn't a token it is a tag, so recurse
            if(ARC_Lexer_IsTokenId(parser->lexer, tokenOrTag) == ARC_False){
                //the nested tag has to continue from the tokens already matched in this or section and report
                //what it consumed back into lexerCheckIndex, passing lexerIndex here would restart the nested
                //tag at the beginning of this tag and its progress would be overwritten below
                foundRule = ARC_Parser_ParseTag(parser, &lexerCheckIndex, tokenOrTag);
                if(foundRule == ARC_False){
                    break;
                }

                //the nested tag matched, move on to the next token or tag of this or section
                continue;
            }

            //check if there is another token that can be used
            if(lexerCheckIndex >= ARC_Lexer_GetTokensSize(parser->lexer)){
                //out of tokens so the current or does not work, so break
                foundRule = ARC_False;
                break;
            }

            //get the next token in the lexer and increment the check index
            ARC_LexerToken token = ARC_Lexer_GetToken(parser->lexer, lexerCheckIndex);
            lexerCheckIndex++;

            //if the lexed token's rule is not the one this or section expects, the or section does not match
            if(token.rule != tokenOrTag){
                foundRule = ARC_False;
                break;
            }
        }

        //if the rule is found we don't need to check anymore so we can return out
        if(foundRule == ARC_True){
            *lexerIndex = lexerCheckIndex;
            //TODO: set tag into datastructure
            return ARC_True;
        }
    }

    //no rule was found, so return false
    return ARC_False;
}

//lexes the given data and checks that the lexed tokens match the parser's language, matching starts from
//the first tag in the language and every lexed token has to be consumed
//
//parser the parser to use, its lexer is cleared again before this function returns from lexing or parsing
//data   the data to parse, it is handed to ARC_Lexer_LexString as is
//
//errors are reported through arc_errno: ARC_ERRNO_NULL when the parser has no language, whatever the lexer
//set when lexing fails, and ARC_ERRNO_DATA when the tokens do not match the language
void ARC_Parser_Parse(ARC_Parser *parser, ARC_String **data){
    //make sure the parser has a language
    if(parser->language.size == 0){
        //the function returns nothing, so arc_errno is the only way a caller can see this failure
        arc_errno = ARC_ERRNO_NULL;
        ARC_DEBUG_LOG_ERROR("ARC_Parser_Parse(parser, data), no parser language defined");
        return;
    }

    //lex the data
    ARC_Lexer_LexString(parser->lexer, data);
    if(arc_errno){
        ARC_DEBUG_LOG_ERROR("ARC_Parser_Parse(parser, data), could not lex the given data");
        ARC_Lexer_Clear(parser->lexer);
        return;
    }

    //set the lexer index to start and get the first tag
    uint32_t lexerIndex = 0;
    ARC_ParserLanguageTag *startTag = parser->language.data;

    //recursively parse from the initial start tag
    ARC_Bool parsed = ARC_Parser_ParseTag(parser, &lexerIndex, startTag->tagId);

    //the token count has to be read before the lexer is cleared
    ARC_Bool allTokensParsed = lexerIndex == ARC_Lexer_GetTokensSize(parser->lexer);
    ARC_Lexer_Clear(parser->lexer);

    //the parse failed if no rule matched, tokens were left over, or an error was raised while parsing
    if(parsed == ARC_False || allTokensParsed == ARC_False || arc_errno){
        arc_errno = ARC_ERRNO_DATA;
        ARC_DEBUG_LOG_ERROR_WITH_VARIABLES("ARC_Parser_Parse(parser, data), could not parse the given data at lexer index: %u", lexerIndex);
    }
}

//TODO: not implemented yet, the body is empty so calling this does nothing with parser or path
void ARC_Parser_ParseFile(ARC_Parser *parser, ARC_String *path){

}
creation and destruction in parser, also added some stuff to gitingore 2024-10-16 05:14:53 -06:00			`#include "arc/std/parser.h"`
added ability to check if token id is a lexer rule and wrote parser, still need to test 2024-10-24 19:56:26 -06:00			`#include "arc/std/bool.h"`
working on parsing recursively 2024-10-16 23:46:16 -06:00			`#include "arc/std/errno.h"`
creation and destruction in parser, also added some stuff to gitingore 2024-10-16 05:14:53 -06:00			`#include "arc/std/lexer.h"`
added ability to check if token id is a lexer rule and wrote parser, still need to test 2024-10-24 19:56:26 -06:00			`//#include "arc/std/vector.h"`
creation and destruction in parser, also added some stuff to gitingore 2024-10-16 05:14:53 -06:00			`#include <stdint.h>`
			`#include <stdlib.h>`
			`#include <string.h>`

			`struct ARC_Parser {`
			`ARC_Array language;`

			`ARC_Lexer *lexer;`
			`};`

merged with old parser stuff, and worked on parser a bit more 2024-10-16 18:00:52 -06:00			`void ARC_Parser_Create(ARC_Parser *parser, ARC_Array language, ARC_Parser_InitLexerRulesFn initLexerRulesFn){`
creation and destruction in parser, also added some stuff to gitingore 2024-10-16 05:14:53 -06:00			`parser = (ARC_Parser )malloc(sizeof(ARC_Parser));`

			`//set the language size to 0 and data to NULL in case the language is NULL`
			`(*parser)->language.size = 0;`
			`(*parser)->language.data = NULL;`

			`//if the language exists, copy the language`
			`if(language != NULL){`
			`(*parser)->language.size = language->size;`
			`(parser)->language.data = malloc(sizeof(ARC_ParserLanguageTag) language->size);`

fixed a lot of the lexer and parser (the segfault, and now can do char ranges) 2024-10-30 07:36:43 -06:00			`memcpy((parser)->language.data, language->data, sizeof(ARC_ParserLanguageTag) language->size);`
copies tagOrtoken and frees it now 2024-11-12 13:07:23 -07:00			`for(uint32_t index = 0; index < language->size; index++){`
			`ARC_ParserLanguageTag languageTag = ((ARC_ParserLanguageTag )language->data) + index;`
			`ARC_ParserLanguageTag currentTag = ((ARC_ParserLanguageTag )(*parser)->language.data) + index;`

			`//copy the currentTag's tokenOrTags`
			`memcpy(currentTag->tokensOrTags, languageTag->tokensOrTags, languageTag->tokensOrTagsSize);`
			`}`
creation and destruction in parser, also added some stuff to gitingore 2024-10-16 05:14:53 -06:00			`}`

			`//create the lexer`
			`ARC_Lexer_Create(&((*parser)->lexer));`

merged with old parser stuff, and worked on parser a bit more 2024-10-16 18:00:52 -06:00			`//register instructions to the lexer`
			`initLexerRulesFn(((*parser)->lexer));`
			`}`

			`void ARC_Parser_CreateFromString(ARC_Parser *parser, ARC_String languageString, ARC_Parser_InitLexerRulesFn initLexerRulesFn){`
creation and destruction in parser, also added some stuff to gitingore 2024-10-16 05:14:53 -06:00			`}`

			`void ARC_Parser_Destroy(ARC_Parser *parser){`
copies tagOrtoken and frees it now 2024-11-12 13:07:23 -07:00			`//clear all the copied token or tags from memory`
			`for(uint32_t index = 0; index < parser->language.size; index++){`
			`ARC_ParserLanguageTag currentTag = ((ARC_ParserLanguageTag )parser->language.data) + index;`
			`free(currentTag->tokensOrTags);`
			`}`

			`//clear the copied language from memory`
creation and destruction in parser, also added some stuff to gitingore 2024-10-16 05:14:53 -06:00			`free(parser->language.data);`

			`ARC_Lexer_Destroy(parser->lexer);`

			`free(parser);`
			`}`
merged with old parser stuff, and worked on parser a bit more 2024-10-16 18:00:52 -06:00
working on parsing recursively 2024-10-16 23:46:16 -06:00			`//private recusive function to parse a tag`
updated to fix breaking recusrion when a good tag was found, it now segfaults though 2024-10-31 19:58:11 -06:00			`ARC_Bool ARC_Parser_ParseTag(ARC_Parser parser, uint32_t lexerIndex, uint32_t tagId){`
working on parsing recursively 2024-10-16 23:46:16 -06:00			`//get the current tag`
			`ARC_ParserLanguageTag *tag = NULL;`
			`for(uint32_t index = 0; index < parser->language.size; index++){`
			`ARC_ParserLanguageTag foundTag = ((ARC_ParserLanguageTag )parser->language.data) + index;`
			`if(foundTag->tagId == tagId){`
			`tag = foundTag;`
			`break;`
			`}`
			`}`

			`//if the tag was not found can't do much, so throw an error`
			`if(tag == NULL){`
			`arc_errno = ARC_ERRNO_NULL;`
			`ARC_DEBUG_LOG_ERROR_WITH_VARIABLES("ARC_Parser_ParseTag(parser, subdata, tagId), could not find tag with id: %u", tagId);`
updated to fix breaking recusrion when a good tag was found, it now segfaults though 2024-10-31 19:58:11 -06:00			`return ARC_False;`
working on parsing recursively 2024-10-16 23:46:16 -06:00			`}`
a bit more work on parser, pushing to work on it at school 2024-10-21 13:36:45 -06:00
			`//loop through each or section of the tags and tokens`
			`for(uint32_t orIndex = 0; orIndex < tag->tokensOrTagsSize; orIndex++){`
			`//loop through each token or tag to check if the lexed data matches`
added ability to check if token id is a lexer rule and wrote parser, still need to test 2024-10-24 19:56:26 -06:00			`uint32_t lexerCheckIndex = *lexerIndex;`
			`ARC_Bool foundRule = ARC_True;`
a bit more work on parser, pushing to work on it at school 2024-10-21 13:36:45 -06:00			`for(uint32_t tokenOrTagIndex = 1; tokenOrTagIndex < tag->tokensOrTags[orIndex][0] + 1; tokenOrTagIndex++){`
updated to fix breaking recusrion when a good tag was found, it now segfaults though 2024-10-31 19:58:11 -06:00			`//check if it is lambda (can return safely)`
parser mostly fixed, still needs a lot more testing though, and need to store parsed values in a datatype 2024-11-01 04:39:45 -06:00			`if(tag->tokensOrTags[orIndex][tokenOrTagIndex] == ARC_PARSER_TAG_LAMBDA){`
			`break;`
updated to fix breaking recusrion when a good tag was found, it now segfaults though 2024-10-31 19:58:11 -06:00			`}`

added ability to check if token id is a lexer rule and wrote parser, still need to test 2024-10-24 19:56:26 -06:00			`//if the value isn't a token it is a tag, so recurs if it isn't a token`
			`ARC_Bool isToken = ARC_Lexer_IsTokenId(parser->lexer, tag->tokensOrTags[orIndex][tokenOrTagIndex]);`
			`if(isToken == ARC_False){`
updated to fix breaking recusrion when a good tag was found, it now segfaults though 2024-10-31 19:58:11 -06:00			`//check if the tag works if not break to continue checking next or`
parser mostly fixed, still needs a lot more testing though, and need to store parsed values in a datatype 2024-11-01 04:39:45 -06:00			`uint32_t nextTagId = tag->tokensOrTags[orIndex][tokenOrTagIndex];`
			`foundRule = ARC_Parser_ParseTag(parser, lexerIndex, nextTagId);`
			`if(foundRule == ARC_False){`
updated to fix breaking recusrion when a good tag was found, it now segfaults though 2024-10-31 19:58:11 -06:00			`break;`
			`}`
parser mostly fixed, still needs a lot more testing though, and need to store parsed values in a datatype 2024-11-01 04:39:45 -06:00
			`//this will probably never be called as lambda is usually the last instruction, but just in case we can continue instead of break`
			`continue;`
			`}`

			`//check if there is another token that can be used`
			`if(lexerCheckIndex >= ARC_Lexer_GetTokensSize(parser->lexer)){`
			`//out of tokens to the current or does not work, so break`
			`foundRule = ARC_False;`
			`break;`
added ability to check if token id is a lexer rule and wrote parser, still need to test 2024-10-24 19:56:26 -06:00			`}`

			`//get the next token in the lexer and increment the lexers index`
			`ARC_LexerToken token = ARC_Lexer_GetToken(parser->lexer, lexerCheckIndex);`
			`lexerCheckIndex++;`

			`//if the token rule does not match the current token in the current or statement the token rule could not be found for the current or index so break`
			`if(token.rule != tag->tokensOrTags[orIndex][tokenOrTagIndex]){`
			`foundRule = ARC_False;`
			`break;`
			`}`
			`}`

			`//if the rule is found we don't need to check anymore so we can return out`
			`if(foundRule == ARC_True){`
			`*lexerIndex = lexerCheckIndex;`
segfault when testing parser 2024-10-28 21:00:48 -06:00			`//TODO: set tag into datastructure`
updated to fix breaking recusrion when a good tag was found, it now segfaults though 2024-10-31 19:58:11 -06:00			`return ARC_True;`
a bit more work on parser, pushing to work on it at school 2024-10-21 13:36:45 -06:00			`}`
			`}`
added ability to check if token id is a lexer rule and wrote parser, still need to test 2024-10-24 19:56:26 -06:00
parser mostly fixed, still needs a lot more testing though, and need to store parsed values in a datatype 2024-11-01 04:39:45 -06:00			`//no rule was found, so return false`
updated to fix breaking recusrion when a good tag was found, it now segfaults though 2024-10-31 19:58:11 -06:00			`return ARC_False;`
working on parsing recursively 2024-10-16 23:46:16 -06:00			`}`
merged with old parser stuff, and worked on parser a bit more 2024-10-16 18:00:52 -06:00
added ability to check if token id is a lexer rule and wrote parser, still need to test 2024-10-24 19:56:26 -06:00			`void ARC_Parser_Parse(ARC_Parser parser, ARC_String *data){`
fixed a lot of the lexer and parser (the segfault, and now can do char ranges) 2024-10-30 07:36:43 -06:00			`//make sure the parser has a language`
			`if(parser->language.size == 0){`
			`ARC_DEBUG_LOG_ERROR("ARC_Parser_Parse(parser, data), no parser language defined");`
			`return;`
			`}`

added ability to check if token id is a lexer rule and wrote parser, still need to test 2024-10-24 19:56:26 -06:00			`//lex the subdata`
			`ARC_Lexer_LexString(parser->lexer, data);`
			`if(arc_errno){`
			`ARC_DEBUG_LOG_ERROR("ARC_Parser_Parse(parser, data), could not lex the given data");`
parser clears lexer when done parsing 2024-10-30 18:41:01 -06:00			`ARC_Lexer_Clear(parser->lexer);`
segfault when testing parser 2024-10-28 21:00:48 -06:00			`return;`
added ability to check if token id is a lexer rule and wrote parser, still need to test 2024-10-24 19:56:26 -06:00			`}`

fixed a lot of the lexer and parser (the segfault, and now can do char ranges) 2024-10-30 07:36:43 -06:00			`//set the lexer index to start and get the first tag`
added ability to check if token id is a lexer rule and wrote parser, still need to test 2024-10-24 19:56:26 -06:00			`uint32_t lexerIndex = 0;`
fixed a lot of the lexer and parser (the segfault, and now can do char ranges) 2024-10-30 07:36:43 -06:00			`ARC_ParserLanguageTag *startTag = parser->language.data;`
added ability to check if token id is a lexer rule and wrote parser, still need to test 2024-10-24 19:56:26 -06:00
parser mostly fixed, still needs a lot more testing though, and need to store parsed values in a datatype 2024-11-01 04:39:45 -06:00			`//TODO: handle error checks for if parsing fails`
added ability to check if token id is a lexer rule and wrote parser, still need to test 2024-10-24 19:56:26 -06:00			`//recursivly parse from the inital start tag`
parser mostly fixed, still needs a lot more testing though, and need to store parsed values in a datatype 2024-11-01 04:39:45 -06:00			`ARC_Bool parsed = ARC_Parser_ParseTag(parser, &lexerIndex, startTag->tagId);`
			`ARC_Bool allTokensParsed = lexerIndex == ARC_Lexer_GetTokensSize(parser->lexer);`
parser clears lexer when done parsing 2024-10-30 18:41:01 -06:00			`ARC_Lexer_Clear(parser->lexer);`
parser mostly fixed, still needs a lot more testing though, and need to store parsed values in a datatype 2024-11-01 04:39:45 -06:00			`if(parsed == ARC_False \|\| allTokensParsed == ARC_False \|\| arc_errno){`
			`arc_errno = ARC_ERRNO_DATA;`
			`ARC_DEBUG_LOG_ERROR_WITH_VARIABLES("ARC_Parser_Parse(parser, data), could not parse the given data at lexer index: %u", lexerIndex);`
segfault when testing parser 2024-10-28 21:00:48 -06:00			`return;`
added ability to check if token id is a lexer rule and wrote parser, still need to test 2024-10-24 19:56:26 -06:00			`}`
working on parsing recursively 2024-10-16 23:46:16 -06:00			`}`

			`void ARC_Parser_ParseFile(ARC_Parser parser, ARC_String path){`

			`}`