added ability to check if token id is a lexer rule and wrote parser, still need to test
This commit is contained in:
parent
7a3495f7ae
commit
d8d1a1a107
7 changed files with 211 additions and 13 deletions
|
|
@ -128,7 +128,7 @@ if(ARCHEUS_STD_TESTS)
|
|||
tests/test.c
|
||||
|
||||
#tests/std/vector.c
|
||||
#tests/std/lexer.c
|
||||
tests/std/lexer.c
|
||||
tests/std/parser.c
|
||||
|
||||
${ARCHEUS_STD_SOURCES}
|
||||
|
|
|
|||
|
|
@ -4,7 +4,6 @@
|
|||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
#include "arc/std/string.h"
|
||||
#include <stdint.h>
|
||||
|
||||
|
|
@ -115,6 +114,16 @@ ARC_LexerToken ARC_Lexer_GetToken(ARC_Lexer *lexer, uint32_t index);
|
|||
*/
|
||||
uint32_t ARC_Lexer_GetTokensSize(ARC_Lexer *lexer);
|
||||
|
||||
/**
|
||||
* @brief returns a boolean based on if a given id is a stored token rule id
|
||||
*
|
||||
* @param[in] lexer the lexer to check stored token rule ids
|
||||
* @param[in] id the id to check against the token rules
|
||||
*
|
||||
* @return ARC_True if the id is a rule id, ARC_False otherwise
|
||||
*/
|
||||
ARC_Bool ARC_Lexer_IsTokenId(ARC_Lexer *lexer, uint32_t id);
|
||||
|
||||
/**
|
||||
* @brief checks if the first character of string matches the automataData cast as a char
|
||||
*
|
||||
|
|
|
|||
|
|
@ -78,9 +78,9 @@ void ARC_Parser_Destroy(ARC_Parser *parser);
|
|||
* @brief
|
||||
*
|
||||
* @param[in] parser
|
||||
* @param[in] language
|
||||
* @param[in/out] data the string to parse, will be freed and set to NULL by the end of this function
|
||||
*/
|
||||
void ARC_Parser_Parse(ARC_Parser *parser, ARC_String *data);
|
||||
void ARC_Parser_Parse(ARC_Parser *parser, ARC_String **data);
|
||||
|
||||
/**
|
||||
* @brief
|
||||
|
|
|
|||
|
|
@ -10,6 +10,10 @@
|
|||
//internal state of a lexer; the definition is private to this translation unit
struct ARC_Lexer {
    //rules registered via ARC_Lexer_RegisterTokenRule (elements carry a uint32_t id)
    ARC_Vector *tokenRules;
    //tokens produced by lexing input strings
    ARC_Vector *tokens;

    //these are used for checking if an uint32_t is a value, if token rules are continuous we can just check the max token value
    ARC_Bool tokenRulesAreContinuous;
    uint32_t tokenRulesMaxVal;
};
|
||||
|
||||
//private function for checking if two lexer token rules are the same in a vector (based on id)
|
||||
|
|
@ -50,6 +54,10 @@ void ARC_Lexer_Create(ARC_Lexer **lexer){
|
|||
//setup tokens vector with delete function, we don't want a deleteDataFn because their index will be used as the id
|
||||
ARC_Vector_DestroyDataFn tokenVectorDestroyDataFn = ARC_LexerToken_VectorDestroyDataFn;
|
||||
ARC_Vector_Create(&(*lexer)->tokens, NULL, &tokenVectorDestroyDataFn);
|
||||
|
||||
//set token rules to continuous and initialize the token rules max value
|
||||
(*lexer)->tokenRulesAreContinuous = ARC_True;
|
||||
(*lexer)->tokenRulesMaxVal = 0;
|
||||
}
|
||||
|
||||
void ARC_Lexer_Destroy(ARC_Lexer *lexer){
|
||||
|
|
@ -74,6 +82,73 @@ void ARC_Lexer_RegisterTokenRule(ARC_Lexer *lexer, ARC_LexerTokenRule tokenRule)
|
|||
ARC_DEBUG_LOG_ERROR("ARC_Lexer_RegisterTokenRule(lexer, tokenRule), errored when running ARC_Vector_Add(lexer->tokenRules, storedTokenRule);. check logs for more info");
|
||||
free(storedTokenRule);
|
||||
}
|
||||
|
||||
//check if the value still is continuous
|
||||
if(lexer->tokenRulesAreContinuous == ARC_True){
|
||||
//if it is already continuous we just check if it is one value above the tokens already in the vector
|
||||
for(uint32_t tokenRuleIndex = ARC_Vector_GetSize(lexer->tokenRules) - 1; tokenRuleIndex > 0; tokenRuleIndex--){
|
||||
//get the current token rule
|
||||
ARC_LexerTokenRule *currentTokenRule = (ARC_LexerTokenRule *)ARC_Vector_Get(lexer->tokenRules, tokenRuleIndex - 1);
|
||||
|
||||
//check if the token rule is continuous (then next max value by one)
|
||||
if(tokenRule.id - currentTokenRule->id == 1){
|
||||
//the token rule is already continuous so we can update the max value and return
|
||||
lexer->tokenRulesMaxVal = tokenRule.id;
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
//the token is no longer continuous
|
||||
lexer->tokenRulesAreContinuous = ARC_False;
|
||||
return;
|
||||
}
|
||||
|
||||
//check to see if this value makes the token rule continuous again
|
||||
//TODO: might want to optimize this
|
||||
uint32_t minValue = ~(uint32_t)0;
|
||||
for(uint32_t tokenRuleIndex = 0; tokenRuleIndex < ARC_Vector_GetSize(lexer->tokenRules); tokenRuleIndex++){
|
||||
//get the current token rule
|
||||
ARC_LexerTokenRule *currentTokenRule = (ARC_LexerTokenRule *)ARC_Vector_Get(lexer->tokenRules, tokenRuleIndex);
|
||||
|
||||
//check each token to find the minimum one
|
||||
if(currentTokenRule->id < minValue){
|
||||
minValue = currentTokenRule->id;
|
||||
}
|
||||
}
|
||||
|
||||
//loop through until either all the values are checked and in order or the token rule is not continuous
|
||||
//TODO: might want to optimize this
|
||||
for(uint32_t foundSize = 0; foundSize != ARC_Vector_GetSize(lexer->tokenRules); foundSize++){
|
||||
//check all current rules
|
||||
ARC_Bool currentAreContinuous = ARC_False;
|
||||
for(uint32_t tokenRuleIndex = 0; tokenRuleIndex < ARC_Vector_GetSize(lexer->tokenRules); tokenRuleIndex++){
|
||||
//get the current token rule
|
||||
ARC_LexerTokenRule *currentTokenRule = (ARC_LexerTokenRule *)ARC_Vector_Get(lexer->tokenRules, tokenRuleIndex);
|
||||
|
||||
//check if the value is smaller than or equal to the minimum value and if it is we can skip it
|
||||
if(currentTokenRule->id <= minValue){
|
||||
continue;
|
||||
}
|
||||
|
||||
//check if the value is continuous
|
||||
if(currentTokenRule->id - minValue == 1){
|
||||
//set the token rule max val to the next most continuous value
|
||||
lexer->tokenRulesMaxVal = currentTokenRule->id;
|
||||
|
||||
//set the next smallest value to check to the next most continuous value
|
||||
minValue = currentTokenRule->id;
|
||||
currentAreContinuous = ARC_True;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
//the current values are not continuous so we can return as token rules are continuous is already set to false
|
||||
if(currentAreContinuous == ARC_False){
|
||||
return;
|
||||
}
|
||||
|
||||
//a continuous value was found so loop to next value
|
||||
}
|
||||
}
|
||||
|
||||
void ARC_Lexer_LexString(ARC_Lexer *lexer, ARC_String **data){
|
||||
|
|
@ -198,6 +273,23 @@ uint32_t ARC_Lexer_GetTokensSize(ARC_Lexer *lexer){
|
|||
return ARC_Vector_GetSize(lexer->tokens);
|
||||
}
|
||||
|
||||
/**
 * @brief checks whether the given id matches one of the lexer's registered token rule ids
 *
 * @param[in] lexer the lexer whose token rules are checked
 * @param[in] id    the id to look for
 *
 * @return ARC_True if some registered rule has this id, ARC_False otherwise
 */
ARC_Bool ARC_Lexer_IsTokenId(ARC_Lexer *lexer, uint32_t id){
    //fast path: when the rule ids form a continuous run we only need to compare
    //against the largest registered id
    //NOTE(review): this assumes the continuous run starts at 0 — confirm that
    //rules registered with a nonzero minimum id cannot take this path
    if(lexer->tokenRulesAreContinuous == ARC_True){
        return id <= lexer->tokenRulesMaxVal;
    }

    //slow path: the ids are sparse, so scan every stored rule for an exact match
    uint32_t ruleCount = ARC_Vector_GetSize(lexer->tokenRules);
    for(uint32_t i = 0; i < ruleCount; i++){
        ARC_LexerTokenRule *rule = (ARC_LexerTokenRule *)ARC_Vector_Get(lexer->tokenRules, i);

        if(rule->id == id){
            return ARC_True;
        }
    }

    //no rule carries this id
    return ARC_False;
}
|
||||
|
||||
uint32_t ARC_Lexer_AutomataMatchCharFn(ARC_String **tokenData, ARC_String *string, void *automataData){
|
||||
//if there is a match the token will be the same as automataData, so we don't need to store it again
|
||||
*tokenData = NULL;
|
||||
|
|
@ -321,6 +413,7 @@ ARC_LexerTokenRule ARC_LexerTokenRule_CreateAndReturnMatchCharInStringRule(uint3
|
|||
}
|
||||
|
||||
void ARC_Lexer_InitBasicTokenRules(ARC_Lexer *lexer){
|
||||
ARC_Lexer_RegisterTokenRule(lexer, ARC_LexerTokenRule_CreateAndReturnMatchCharRule(ARC_LEXER_TOKEN_NULL , 0 ));
|
||||
ARC_Lexer_RegisterTokenRule(lexer, ARC_LexerTokenRule_CreateAndReturnMatchCharRule(ARC_LEXER_TOKEN_COLON_ID , ARC_LEXER_TOKEN_COLON_CHAR ));
|
||||
ARC_Lexer_RegisterTokenRule(lexer, ARC_LexerTokenRule_CreateAndReturnMatchCharRule(ARC_LEXER_TOKEN_SEMICOLON_ID , ARC_LEXER_TOKEN_SEMICOLON_CHAR ));
|
||||
ARC_Lexer_RegisterTokenRule(lexer, ARC_LexerTokenRule_CreateAndReturnMatchCharRule(ARC_LEXER_TOKEN_COMMA_ID , ARC_LEXER_TOKEN_COMMA_CHAR ));
|
||||
|
|
|
|||
|
|
@ -1,7 +1,8 @@
|
|||
#include "arc/std/parser.h"
|
||||
#include "arc/std/bool.h"
|
||||
#include "arc/std/errno.h"
|
||||
#include "arc/std/lexer.h"
|
||||
#include "arc/std/vector.h"
|
||||
//#include "arc/std/vector.h"
|
||||
#include <stdint.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
|
|
@ -46,7 +47,7 @@ void ARC_Parser_Destroy(ARC_Parser *parser){
|
|||
}
|
||||
|
||||
//private recursive function to parse a tag
|
||||
void ARC_Parser_ParseTag(ARC_Parser *parser, uint32_t lexerIndex, uint32_t tagId){
|
||||
void ARC_Parser_ParseTag(ARC_Parser *parser, uint32_t *lexerIndex, uint32_t tagId){
|
||||
//get the current tag
|
||||
ARC_ParserLanguageTag *tag = NULL;
|
||||
for(uint32_t index = 0; index < parser->language.size; index++){
|
||||
|
|
@ -67,16 +68,54 @@ void ARC_Parser_ParseTag(ARC_Parser *parser, uint32_t lexerIndex, uint32_t tagId
|
|||
//loop through each or section of the tags and tokens
|
||||
for(uint32_t orIndex = 0; orIndex < tag->tokensOrTagsSize; orIndex++){
|
||||
//loop through each token or tag to check if the lexed data matches
|
||||
uint32_t lexerCheckIndex = lexerIndex;
|
||||
uint32_t lexerCheckIndex = *lexerIndex;
|
||||
ARC_Bool foundRule = ARC_True;
|
||||
for(uint32_t tokenOrTagIndex = 1; tokenOrTagIndex < tag->tokensOrTags[orIndex][0] + 1; tokenOrTagIndex++){
|
||||
//if the value isn't a token it is a tag, so recurs if it isn't a token
|
||||
ARC_Bool isToken = ARC_Lexer_IsTokenId(parser->lexer, tag->tokensOrTags[orIndex][tokenOrTagIndex]);
|
||||
if(isToken == ARC_False){
|
||||
ARC_Parser_ParseTag(parser, lexerIndex, tag->tokensOrTags[orIndex][tokenOrTagIndex]);
|
||||
return;
|
||||
}
|
||||
|
||||
//get the next token in the lexer and increment the lexers index
|
||||
ARC_LexerToken token = ARC_Lexer_GetToken(parser->lexer, lexerCheckIndex);
|
||||
lexerCheckIndex++;
|
||||
|
||||
//if the token rule does not match the current token in the current or statement the token rule could not be found for the current or index so break
|
||||
if(token.rule != tag->tokensOrTags[orIndex][tokenOrTagIndex]){
|
||||
foundRule = ARC_False;
|
||||
break;
|
||||
}
|
||||
//
|
||||
}
|
||||
|
||||
//if the rule is found we don't need to check anymore so we can return out
|
||||
if(foundRule == ARC_True){
|
||||
*lexerIndex = lexerCheckIndex;
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
//no rule was found, so set an error and log
|
||||
arc_errno = ARC_ERRNO_DATA;
|
||||
ARC_DEBUG_LOG_ERROR_WITH_VARIABLES("ARC_Parser_ParseTag(parser, lexerIndex, tagId), tag id: %u could not find a matching rule at token index %u", tagId, *lexerIndex);
|
||||
}
|
||||
|
||||
/**
 * @brief lexes the given string and recursively parses the resulting tokens,
 *        starting from the language's first tag
 *
 * @param[in]     parser the parser holding the lexer and language definition
 * @param[in/out] data   the string to parse, will be freed and set to NULL by the end of this function
 */
void ARC_Parser_Parse(ARC_Parser *parser, ARC_String **data){
    //lex the subdata
    ARC_Lexer_LexString(parser->lexer, data);
    if(arc_errno){
        ARC_DEBUG_LOG_ERROR("ARC_Parser_Parse(parser, data), could not lex the given data");

        //bail out: parsing the tokens of a failed lex would report misleading
        //follow-on errors on top of the one already set in arc_errno
        return;
    }

    uint32_t lexerIndex = 0;
    ARC_ParserLanguageTag startTag = ((ARC_ParserLanguageTag *)parser->language.data)[0];

    //recursively parse from the initial start tag
    ARC_Parser_ParseTag(parser, &lexerIndex, startTag.tagId);
    if(arc_errno){
        ARC_DEBUG_LOG_ERROR("ARC_Parser_Parse(parser, data), could not parse the given data");
    }
}
|
||||
|
||||
void ARC_Parser_ParseFile(ARC_Parser *parser, ARC_String *path){
|
||||
|
|
|
|||
|
|
@ -37,3 +37,60 @@ ARC_TEST(Lexer_Char_Match){
|
|||
|
||||
ARC_Lexer_Destroy(lexer);
|
||||
}
|
||||
|
||||
//verifies ARC_Lexer_IsTokenId when rules are registered in ascending, continuous
//order (ids 0-4): registered ids are reported as rules, unregistered ids are not
ARC_TEST(Lexer_Check_Id_Basic){
    ARC_Lexer *lexer;
    ARC_Lexer_Create(&lexer);

    ARC_Lexer_RegisterTokenRule(lexer, ARC_LexerTokenRule_CreateAndReturnMatchCharRule(0, 0 ));
    ARC_Lexer_RegisterTokenRule(lexer, ARC_LexerTokenRule_CreateAndReturnMatchCharRule(1, ':'));
    ARC_Lexer_RegisterTokenRule(lexer, ARC_LexerTokenRule_CreateAndReturnMatchCharRule(2, ':'));
    ARC_Lexer_RegisterTokenRule(lexer, ARC_LexerTokenRule_CreateAndReturnMatchCharRule(3, ':'));
    ARC_Lexer_RegisterTokenRule(lexer, ARC_LexerTokenRule_CreateAndReturnMatchCharRule(4, ':'));

    //ids 0-4 were registered; 5 and 7 were not
    ARC_CHECK(ARC_Lexer_IsTokenId(lexer, 0) == ARC_True );
    ARC_CHECK(ARC_Lexer_IsTokenId(lexer, 7) == ARC_False);
    ARC_CHECK(ARC_Lexer_IsTokenId(lexer, 2) == ARC_True );
    ARC_CHECK(ARC_Lexer_IsTokenId(lexer, 4) == ARC_True );
    ARC_CHECK(ARC_Lexer_IsTokenId(lexer, 5) == ARC_False);

    ARC_Lexer_Destroy(lexer);
}
|
||||
|
||||
//verifies ARC_Lexer_IsTokenId when ids 0-4 are registered out of order but still
//form a continuous range (exercises the lexer's continuity re-check path)
ARC_TEST(Lexer_Check_Id_Unordered_But_Continious){
    ARC_Lexer *lexer;
    ARC_Lexer_Create(&lexer);

    ARC_Lexer_RegisterTokenRule(lexer, ARC_LexerTokenRule_CreateAndReturnMatchCharRule(2, ':'));
    ARC_Lexer_RegisterTokenRule(lexer, ARC_LexerTokenRule_CreateAndReturnMatchCharRule(0, 0 ));
    ARC_Lexer_RegisterTokenRule(lexer, ARC_LexerTokenRule_CreateAndReturnMatchCharRule(3, ':'));
    ARC_Lexer_RegisterTokenRule(lexer, ARC_LexerTokenRule_CreateAndReturnMatchCharRule(1, ':'));
    ARC_Lexer_RegisterTokenRule(lexer, ARC_LexerTokenRule_CreateAndReturnMatchCharRule(4, ':'));

    //ids 0-4 were registered (in shuffled order); 5 and 7 were not
    ARC_CHECK(ARC_Lexer_IsTokenId(lexer, 0) == ARC_True );
    ARC_CHECK(ARC_Lexer_IsTokenId(lexer, 7) == ARC_False);
    ARC_CHECK(ARC_Lexer_IsTokenId(lexer, 2) == ARC_True );
    ARC_CHECK(ARC_Lexer_IsTokenId(lexer, 4) == ARC_True );
    ARC_CHECK(ARC_Lexer_IsTokenId(lexer, 5) == ARC_False);

    ARC_Lexer_Destroy(lexer);
}
|
||||
|
||||
//verifies ARC_Lexer_IsTokenId when the registered ids (1,2,3,4,8) have a gap,
//forcing the per-rule linear scan instead of the continuous fast path
ARC_TEST(Lexer_Check_Id_Unordered_Not_Continious){
    ARC_Lexer *lexer;
    ARC_Lexer_Create(&lexer);

    ARC_Lexer_RegisterTokenRule(lexer, ARC_LexerTokenRule_CreateAndReturnMatchCharRule(2, ':'));
    ARC_Lexer_RegisterTokenRule(lexer, ARC_LexerTokenRule_CreateAndReturnMatchCharRule(8, 0 ));
    ARC_Lexer_RegisterTokenRule(lexer, ARC_LexerTokenRule_CreateAndReturnMatchCharRule(3, ':'));
    ARC_Lexer_RegisterTokenRule(lexer, ARC_LexerTokenRule_CreateAndReturnMatchCharRule(1, ':'));
    ARC_Lexer_RegisterTokenRule(lexer, ARC_LexerTokenRule_CreateAndReturnMatchCharRule(4, ':'));

    //8 is registered even though it breaks continuity; 5 and 7 fall in the gap
    ARC_CHECK(ARC_Lexer_IsTokenId(lexer, 8) == ARC_True );
    ARC_CHECK(ARC_Lexer_IsTokenId(lexer, 7) == ARC_False);
    ARC_CHECK(ARC_Lexer_IsTokenId(lexer, 2) == ARC_True );
    ARC_CHECK(ARC_Lexer_IsTokenId(lexer, 4) == ARC_True );
    ARC_CHECK(ARC_Lexer_IsTokenId(lexer, 5) == ARC_False);

    ARC_Lexer_Destroy(lexer);
}
|
||||
|
|
|
|||
|
|
@ -12,7 +12,7 @@ void TEST_Parser_InitLexerRulesFn(ARC_Lexer *lexer){
|
|||
ARC_Lexer_InitBasicTokenRules(lexer);
|
||||
}
|
||||
|
||||
ARC_TEST(Lexer_Char_Match){
|
||||
ARC_TEST(Parser_Init){
|
||||
ARC_Parser *parser;
|
||||
|
||||
uint32_t *charOrNumTokens[] = { (uint32_t[]){ 1, CHAR }, (uint32_t[]){ 1, NUM } };
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue