archeus/src/std/lexer.c

#include "arc/std/lexer.h"

#include "arc/std/bool.h"
#include "arc/std/errno.h"
#include "arc/std/string.h"
#include "arc/std/vector.h"
#include "arc/std/io.h"
#include <stdlib.h>

//internal state of a lexer: the registered token rules and the tokens produced by lexing
struct ARC_Lexer {
    //registered rules, each element is a heap allocated ARC_LexerTokenRule
    ARC_Vector *tokenRules;
    //tokens found while lexing, each element is a heap allocated ARC_LexerToken
    ARC_Vector *tokens;

    //these are used for checking if a uint32_t is a valid token id, if the token rules are continuous we can just compare against the max token value
    ARC_Bool tokenRulesAreContinuous;
    uint32_t tokenRulesMaxVal;
};

//private vector callback, two lexer token rules are considered equal when their ids match
ARC_Bool ARC_LexerTokenRule_VectorCompareDataFn(void *dataA, void *dataB){
    //the vector hands the elements back as void pointers, both are ARC_LexerTokenRule
    const ARC_LexerTokenRule *lhs = (const ARC_LexerTokenRule *)dataA;
    const ARC_LexerTokenRule *rhs = (const ARC_LexerTokenRule *)dataB;

    return (lhs->id == rhs->id) ? ARC_True : ARC_False;
}

//private vector callback for destroying a lexer token rule, releases the rule's automataData and then the rule itself
void ARC_LexerTokenRule_VectorDestroyDataFn(void *data){
    ARC_LexerTokenRule *tokenRule = (ARC_LexerTokenRule *)data;

    //only call the destroy callback if one was provided, calling a NULL function pointer would crash
    if(tokenRule->destroyAutomataDataFn != NULL){
        tokenRule->destroyAutomataDataFn(tokenRule->automataData);
    }

    free(tokenRule);
}


//private function for destroying a lexer token from a vector
void ARC_LexerToken_VectorDestroyDataFn(void *data){
    ARC_LexerToken *token = (ARC_LexerToken *)data;
    free(token);
}

//allocates a lexer with empty token rule and token vectors and stores it in *lexer
void ARC_Lexer_Create(ARC_Lexer **lexer){
    //allocate the lexer and hand it back through the out parameter
    ARC_Lexer *created = (ARC_Lexer *)malloc(sizeof(ARC_Lexer));
    *lexer = created;

    //no rules are registered yet, start out as continuous with a max value of zero
    created->tokenRulesAreContinuous = ARC_True;
    created->tokenRulesMaxVal        = 0;

    //token rules are compared by id and fully destroyed when removed from the vector
    //NOTE(review): the callbacks are passed by address of a local, presumably ARC_Vector_Create copies them, confirm against the vector implementation
    ARC_Vector_CompareDataFn compareTokenRulesFn = ARC_LexerTokenRule_VectorCompareDataFn;
    ARC_Vector_DestroyDataFn destroyTokenRulesFn = ARC_LexerTokenRule_VectorDestroyDataFn;
    ARC_Vector_Create(&created->tokenRules, &compareTokenRulesFn, &destroyTokenRulesFn);

    //tokens only get a destroy function, no compare function is passed because tokens are looked up by their index
    ARC_Vector_DestroyDataFn destroyTokensFn = ARC_LexerToken_VectorDestroyDataFn;
    ARC_Vector_Create(&created->tokens, NULL, &destroyTokensFn);
}

//destroys a lexer along with every token and token rule it holds, passing NULL is a no-op
void ARC_Lexer_Destroy(ARC_Lexer *lexer){
    //nothing to destroy, this mirrors free(NULL) doing nothing
    if(lexer == NULL){
        return;
    }

    //free the tokens (the vector was created with a destroy function, so each token is freed)
    ARC_Vector_Destroy(lexer->tokens);

    //free the token rules (the vector was created with a destroy function, so each token rule is freed)
    ARC_Vector_Destroy(lexer->tokenRules);

    //free the lexer
    free(lexer);
}

//registers a copy of tokenRule with the lexer and updates the bookkeeping used by the fast path of ARC_Lexer_IsTokenId
//
//the token rules count as continuous when every id from 0 up to the largest registered id is registered, this is
//what the check "id <= tokenRulesMaxVal" in ARC_Lexer_IsTokenId relies on
//
//on failure of ARC_Vector_Add the error is logged, the copy is freed and the lexer is left unchanged
//NOTE: tokenRule.automataData is not destroyed on failure, the caller still holds it through its own copy of tokenRule
void ARC_Lexer_RegisterTokenRule(ARC_Lexer *lexer, ARC_LexerTokenRule tokenRule){
    //copy the token rule to a new pointer so the vector can own it
    ARC_LexerTokenRule *storedTokenRule = (ARC_LexerTokenRule *)malloc(sizeof(ARC_LexerTokenRule));
    *storedTokenRule = tokenRule;

    //TODO: add warning here for if arc_errno is already set

    //add to the vector and check for error (I'd be surprised if the error ever happened because that would most likely mean overflow)
    ARC_Vector_Add(lexer->tokenRules, storedTokenRule);
    if(arc_errno){
        ARC_DEBUG_LOG_ERROR("ARC_Lexer_RegisterTokenRule(lexer, tokenRule), errored when running ARC_Vector_Add(lexer->tokenRules, storedTokenRule);. check logs for more info");
        free(storedTokenRule);

        //the rule was not stored, so the continuity bookkeeping must not be updated with it
        return;
    }

    uint32_t tokenRulesSize = ARC_Vector_GetSize(lexer->tokenRules);

    //find the largest registered id
    uint32_t maxId = 0;
    for(uint32_t tokenRuleIndex = 0; tokenRuleIndex < tokenRulesSize; tokenRuleIndex++){
        ARC_LexerTokenRule *currentTokenRule = (ARC_LexerTokenRule *)ARC_Vector_Get(lexer->tokenRules, tokenRuleIndex);
        if(currentTokenRule->id > maxId){
            maxId = currentTokenRule->id;
        }
    }

    lexer->tokenRulesMaxVal = maxId;

    //covering 0..maxId needs maxId + 1 distinct ids, so with fewer rules than that there has to be a gap
    if(maxId >= tokenRulesSize){
        lexer->tokenRulesAreContinuous = ARC_False;
        return;
    }

    //check that every id in 0..maxId is registered (maxId < tokenRulesSize here, so this loop is bounded by the number of rules)
    //TODO: might want to optimize this
    for(uint32_t id = 0; id <= maxId; id++){
        ARC_Bool idFound = ARC_False;
        for(uint32_t tokenRuleIndex = 0; tokenRuleIndex < tokenRulesSize; tokenRuleIndex++){
            ARC_LexerTokenRule *currentTokenRule = (ARC_LexerTokenRule *)ARC_Vector_Get(lexer->tokenRules, tokenRuleIndex);
            if(currentTokenRule->id == id){
                idFound = ARC_True;
                break;
            }
        }

        //a gap was found, so ids have to be checked individually
        if(idFound == ARC_False){
            lexer->tokenRulesAreContinuous = ARC_False;
            return;
        }
    }

    //every id from 0 to maxId is registered
    lexer->tokenRulesAreContinuous = ARC_True;
}

//removes the tokens held by the lexer by clearing the tokens vector, the registered token rules are not touched
void ARC_Lexer_Clear(ARC_Lexer *lexer){
    //clear the tokens vector
    ARC_Vector_Clear(lexer->tokens);
}

//lexes *data into tokens that are appended to the lexer's tokens vector
//
//each pass runs every registered token rule against the start of the string, keeps the longest match (the earliest
//registered rule wins a tie), stores it as a token and strips the matched characters from the string
//
//on success, and on any error after lexing started, the string is destroyed and *data is set to NULL
//if no token rules are registered arc_errno is set to ARC_ERRNO_DATA and the string is left untouched
//if nothing matches the current start of the string arc_errno is set to ARC_ERRNO_DATA
void ARC_Lexer_LexString(ARC_Lexer *lexer, ARC_String **data){
    //check if there are any token rules to use, note the string is not destroyed on this path
    if(ARC_Vector_GetSize(lexer->tokenRules) == 0){
        arc_errno = ARC_ERRNO_DATA;
        ARC_DEBUG_LOG_ERROR("ARC_Lexer_LexString(lexer, data), no tokens registered to lexer to use");
        return;
    }

    //this will run until every token is stripped or there is an error
    while(*data != NULL){
        uint32_t lastTokenLength = 0;
        ARC_LexerToken *token = NULL;

        for(uint32_t index = 0; index < ARC_Vector_GetSize(lexer->tokenRules); index++){
            ARC_LexerTokenRule *tokenRule = ARC_Vector_Get(lexer->tokenRules, index);

            //initialized so an automata function that does not set it can't leave a garbage pointer behind
            ARC_String *tokenData = NULL;
            uint32_t tokenLength = tokenRule->automataFn(&tokenData, *data, tokenRule->automataData);

            //check if a token was found, if it wasn't continue
            if(tokenLength == 0){
                continue;
            }

            //not a better match than the one already held, release the data it produced so it does not leak
            if(tokenLength <= lastTokenLength){
                if(tokenData != NULL){
                    ARC_String_Destroy(tokenData);
                }

                continue;
            }

            //a better match was found, reuse the token if one exists (releasing the data of the worse match), otherwise create it
            //this has to be cleaned up as an ARC_LexerToken, the token rule destroy function must not be used on it
            if(token == NULL){
                token = (ARC_LexerToken *)malloc(sizeof(ARC_LexerToken));
            }
            else if(token->data != NULL){
                ARC_String_Destroy(token->data);
            }

            token->rule = tokenRule->id;
            token->data = tokenData;

            //update the last found tokenLength to the max length
            lastTokenLength = tokenLength;
        }

        //if no token was found, throw an error
        if(token == NULL){
            arc_errno = ARC_ERRNO_DATA;
            ARC_DEBUG_LOG_ERROR_WITH_VARIABLES("ARC_Lexer_LexString(lexer, data), no tokens found with current string: \"%s\"", (*data)->data);

            //clean up errored string
            ARC_String_Destroy(*data);
            *data = NULL;
            return;
        }

        //add the token and check for overflow (which I'd be surprised if that happens)
        ARC_Vector_Add(lexer->tokens, (void *)token);
        if(arc_errno){
            ARC_DEBUG_LOG_ERROR("ARC_Lexer_LexString(lexer, data), errored when running ARC_Vector_Add(lexer->tokens, token);. check logs for more info");

            //the token was not stored, so release it along with its data
            if(token->data != NULL){
                ARC_String_Destroy(token->data);
            }
            free(token);

            //clean up errored string
            ARC_String_Destroy(*data);
            *data = NULL;
            return;
        }

        //if the last token was found, destroy the string and return
        //>= instead of == so a rule reporting more than what is left can't underflow the remaining length below
        if(lastTokenLength >= (*data)->length){
            ARC_String_Destroy(*data);
            *data = NULL;
            return;
        }

        //strip the matched token from the front of the string
        ARC_String_ReplaceWithSubstring(data, lastTokenLength, (*data)->length - lastTokenLength);
    }
}

//reads the file at path into a string and lexes it with ARC_Lexer_LexString
//errors from reading or lexing are logged, arc_errno is left as set by the failing call
void ARC_Lexer_LexFile(ARC_Lexer *lexer, ARC_String *path){
    //initialized to NULL so the cleanup check below is valid even if ARC_IO_FileToStr fails without setting it
    ARC_String *data = NULL;

    //read file and clean up if it errors
    ARC_IO_FileToStr(path, &data);
    if(arc_errno){
        ARC_DEBUG_LOG_ERROR("ARC_Lexer_LexFile(lexer, path), errored when running ARC_IO_FileToStr(path, &data);. check logs for more info");
        if(data != NULL){
            ARC_String_Destroy(data);
        }

        return;
    }

    //lex the string and log if there is an error
    ARC_Lexer_LexString(lexer, &data);
    if(arc_errno){
        ARC_DEBUG_LOG_ERROR("ARC_Lexer_LexFile(lexer, path), errored when running ARC_Lexer_LexString(lexer, data);. check logs for more info");
    }

    //ARC_Lexer_LexString sets data to NULL whenever it destroyed the string, it does not do so when it
    //returns early because no token rules are registered, so release whatever is left
    if(data != NULL){
        ARC_String_Destroy(data);
    }
}

//prints the id and the automata function address of every registered token rule to stdout, one rule per line
void ARC_Lexer_PrintTokenRules(ARC_Lexer *lexer){
    uint32_t tokenRulesSize = ARC_Vector_GetSize(lexer->tokenRules);

    for(uint32_t index = 0; index < tokenRulesSize; index++){
        ARC_LexerTokenRule *tokenRule = ARC_Vector_Get(lexer->tokenRules, index);

        //the id is unsigned so it is printed as unsigned (unsigned long is at least 32 bits wide)
        //%p expects a void pointer, so the function pointer is converted explicitly (a common extension, guaranteed on POSIX systems)
        printf("Rule: %02lu\tFunction: %p\n", (unsigned long)tokenRule->id, (void *)tokenRule->automataFn);
    }
}

//returns a copy of the token stored at index
//if arc_errno is set after the lookup, the error is logged and a token with the max rule value and NULL data is returned instead
ARC_LexerToken ARC_Lexer_GetToken(ARC_Lexer *lexer, uint32_t index){
    ARC_LexerToken *storedToken = ARC_Vector_Get(lexer->tokens, index);

    //no error, hand back a copy of the stored token
    if(!arc_errno){
        return *storedToken;
    }

    ARC_DEBUG_LOG_ERROR("ARC_Lexer_GetToken(lexer, index), errored when running ARC_Vector_Get(lexer->tokens, index);. check logs for more info");

    //max rule value and a NULL string signify an error
    ARC_LexerToken errorToken = {
        .rule = ~(uint32_t)0,
        .data = NULL
    };
    return errorToken;
}

//returns the number of tokens currently stored in the lexer's tokens vector
uint32_t ARC_Lexer_GetTokensSize(ARC_Lexer *lexer){
    return ARC_Vector_GetSize(lexer->tokens);
}

//returns ARC_True if id is the id of a registered token rule, otherwise ARC_False
ARC_Bool ARC_Lexer_IsTokenId(ARC_Lexer *lexer, uint32_t id){
    uint32_t tokenRulesSize = ARC_Vector_GetSize(lexer->tokenRules);

    //a lexer without rules has no valid ids, without this check a new lexer (continuous with a max value of 0) would report id 0 as valid
    if(tokenRulesSize == 0){
        return ARC_False;
    }

    //if the rules are continuous we can just check the id against the max rules value
    if(lexer->tokenRulesAreContinuous == ARC_True){
        return (id <= lexer->tokenRulesMaxVal) ? ARC_True : ARC_False;
    }

    //the rules are not continuous so we need to check each individually
    for(uint32_t index = 0; index < tokenRulesSize; index++){
        ARC_LexerTokenRule *currentTokenRule = (ARC_LexerTokenRule *)ARC_Vector_Get(lexer->tokenRules, index);
        if(currentTokenRule->id == id){
            return ARC_True;
        }
    }

    return ARC_False;
}

//automata function that matches the first char of string against the single char stored in automataData
//returns 1 (the length of the match) on a match and 0 otherwise, *tokenData is always set to NULL
uint32_t ARC_Lexer_AutomataMatchCharFn(ARC_String **tokenData, ARC_String *string, void *automataData){
    //if there is a match the token will be the same as automataData, so we don't need to store it again
    *tokenData = NULL;

    //an empty string can't hold a one char token, and string->data[0] must not be read when there are no chars
    if(string->length == 0){
        return 0;
    }

    //check to see if there is a match with automataData as a char
    const char expected = *(const char *)automataData;
    return (string->data[0] == expected) ? 1 : 0;
}

//automata function that matches the first char of string against an inclusive range of chars
//automataData holds two chars, the lower bound followed by the upper bound
//returns 1 (the length of the match) on a match and 0 otherwise, *tokenData is always set to NULL
uint32_t ARC_Lexer_AutomataMatchCharOrBetweenFn(ARC_String **tokenData, ARC_String *string, void *automataData){
    //no token data is stored for now, so which char of the range matched is not reported through tokenData
    //TODO: fix this, the matched char should be returned as token data
    //ARC_String_Create(tokenData, string->data, 1);
    *tokenData = NULL;

    //an empty string can't hold a one char token, and string->data[0] must not be read when there are no chars
    if(string->length == 0){
        return 0;
    }

    //check to see if the first char is within the range stored in automataData
    const char *range = (const char *)automataData;
    const char first  = string->data[0];
    if(first >= range[0] && first <= range[1]){
        //the token was found with a length of 1
        return 1;
    }

    //no match was found
    return 0;
}

//automata function that matches the start of string against the ARC_String stored in automataData
//returns the length of the stored string on a match and 0 otherwise, *tokenData is always set to NULL
uint32_t ARC_Lexer_AutomataMatchStringFn(ARC_String **tokenData, ARC_String *string, void *automataData){
    //a matched token is identical to automataData, so nothing is stored as token data
    *tokenData = NULL;

    ARC_String *expected = (ARC_String *)automataData;

    //bail out when the string does not start with the expected string
    if(!ARC_String_SubstringEquals(string, 0, expected)){
        return 0;
    }

    //the match is as long as the expected string
    return expected->length;
}

//automata function that matches the first char of string against any char of the ARC_String stored in automataData
//returns 1 (the length of the match) on a match and 0 otherwise, *tokenData is always set to NULL
uint32_t ARC_Lexer_AutomataMatchCharInStringFn(ARC_String **tokenData, ARC_String *string, void *automataData){
    //no token data is stored, so which char of automataData matched is not reported through tokenData
    *tokenData = NULL;

    //an empty string can't hold a one char token, and string->data[0] must not be read when there are no chars
    if(string->length == 0){
        return 0;
    }

    //check to see if the first char matches any char in automataData as a string
    const ARC_String *automataDataString = (const ARC_String *)automataData;
    const char first = string->data[0];
    for(uint64_t index = 0; index < automataDataString->length; index++){
        if(first == automataDataString->data[index]){
            //the token was found with a length of 1
            return 1;
        }
    }

    //no match was found
    return 0;
}

//private destroy callback for automataData that was allocated as a block of chars
void ARC_LexerTokenRule_DestroyCharAutomataDataFn(void *automataData){
    //the chars were allocated with malloc, so the pointer can be handed to free as is
    free(automataData);
}

ARC_LexerTokenRule ARC_LexerTokenRule_CreateAndReturnMatchCharRule(uint32_t id, char character){
    //create the token rule
    ARC_LexerTokenRule tokenRule;

    //set the id
    tokenRule.id = id;

    //create and store the automataData (which is just a char)
    char *automataData = (char *)malloc(sizeof(char));
    *automataData = character;
    tokenRule.automataData = (void *)automataData;

    //we can use the ARC_Lexer_AutomataMatchCharFn for this
    tokenRule.automataFn = ARC_Lexer_AutomataMatchCharFn;

    //add the private destroy function
    tokenRule.destroyAutomataDataFn = ARC_LexerTokenRule_DestroyCharAutomataDataFn;

    //return the created tokenRule
    return tokenRule;
}

ARC_LexerTokenRule ARC_LexerTokenRule_CreateAndReturnMatchCharOrBetween(uint32_t id, char start, char end){
    //create the token rule
    ARC_LexerTokenRule tokenRule;

    //set the id
    tokenRule.id = id;

    //create and store the automataData (which is just two chars (the minumum and manximum))
    char *automataData = (char *)malloc(sizeof(char) * 2);
    automataData[0] = start;
    automataData[1] = end;
    tokenRule.automataData = (void *)automataData;

    //we can use the ARC_Lexer_AutomataMatchCharInStringFn for this
    tokenRule.automataFn = ARC_Lexer_AutomataMatchCharOrBetweenFn;

    //add the private destroy function (we can use the char as it destroys a char pointer of any size)
    tokenRule.destroyAutomataDataFn = ARC_LexerTokenRule_DestroyCharAutomataDataFn;

    //return the created tokenRule
    return tokenRule;
}

//private destroy callback for automataData stored as an ARC_String, used by the rules that hold a copied string
void ARC_LexerTokenRule_DestroyStringAutomataDataFn(void *automataData){
    //automataData arrives as a void pointer, it is destroyed as the ARC_String it was stored as
    ARC_String_Destroy((ARC_String *)automataData);
}

//creates a token rule with the given id that matches the whole of string at the start of the input
//string is copied, so the caller keeps ownership of the string it passed in
ARC_LexerTokenRule ARC_LexerTokenRule_CreateAndReturnMatchStringRule(uint32_t id, ARC_String *string){
    //the automataData is a copy of the string to match
    ARC_String *matchString;
    ARC_String_Copy(&matchString, string);

    //matching is done by ARC_Lexer_AutomataMatchStringFn, cleanup by the private string destroy function
    ARC_LexerTokenRule tokenRule = {
        .id                    = id,
        .automataData          = (void *)matchString,
        .automataFn            = ARC_Lexer_AutomataMatchStringFn,
        .destroyAutomataDataFn = ARC_LexerTokenRule_DestroyStringAutomataDataFn
    };

    return tokenRule;
}

//creates a token rule with the given id that matches one character if it is any of the chars in string
//string is copied, so the caller keeps ownership of the string it passed in
ARC_LexerTokenRule ARC_LexerTokenRule_CreateAndReturnMatchCharInStringRule(uint32_t id, ARC_String *string){
    //the automataData is a copy of the string holding the chars to match
    ARC_String *matchChars;
    ARC_String_Copy(&matchChars, string);

    //matching is done by ARC_Lexer_AutomataMatchCharInStringFn, cleanup by the private string destroy function
    ARC_LexerTokenRule tokenRule = {
        .id                    = id,
        .automataData          = (void *)matchChars,
        .automataFn            = ARC_Lexer_AutomataMatchCharInStringFn,
        .destroyAutomataDataFn = ARC_LexerTokenRule_DestroyStringAutomataDataFn
    };

    return tokenRule;
}

//registers the basic token rules with the lexer: null, number, lower and upper case alpha chars, whitespace and the single char tokens
void ARC_Lexer_InitBasicTokenRules(ARC_Lexer *lexer){
    //null, matches the char with value 0
    ARC_LexerTokenRule nullRule = ARC_LexerTokenRule_CreateAndReturnMatchCharRule(ARC_LEXER_TOKEN_NULL, 0);
    ARC_Lexer_RegisterTokenRule(lexer, nullRule);

    //number
    ARC_LexerTokenRule numberRule = ARC_LexerTokenRule_CreateAndReturnMatchCharOrBetween(ARC_LEXER_TOKEN_NUMBER, '0', '9');
    ARC_Lexer_RegisterTokenRule(lexer, numberRule);

    //alpha char, lower case followed by upper case
    ARC_LexerTokenRule alphaLowerRule = ARC_LexerTokenRule_CreateAndReturnMatchCharOrBetween(ARC_LEXER_TOKEN_ALPHALOWERCHAR, 'a', 'z');
    ARC_Lexer_RegisterTokenRule(lexer, alphaLowerRule);
    ARC_LexerTokenRule alphaUpperRule = ARC_LexerTokenRule_CreateAndReturnMatchCharOrBetween(ARC_LEXER_TOKEN_ALPHAUPPERCHAR, 'A', 'Z');
    ARC_Lexer_RegisterTokenRule(lexer, alphaUpperRule);

    //whitespace (space or tab), the rule copies the string so the temporary can be destroyed right after registering
    //TODO: fix this (carried over from the original, what needs fixing is not stated)
    ARC_String *whitespaceChars;
    ARC_String_CreateWithStrlen(&whitespaceChars, " \t");
    ARC_Lexer_RegisterTokenRule(lexer, ARC_LexerTokenRule_CreateAndReturnMatchCharInStringRule(ARC_LEXER_TOKEN_WHITESPACE, whitespaceChars));
    ARC_String_Destroy(whitespaceChars);

    //single char tokens, registered in the order they are listed
    const struct {
        uint32_t id;
        char     character;
    } singleCharRules[] = {
        { ARC_LEXER_TOKEN_NEWLINE_ID          , ARC_LEXER_TOKEN_NEWLINE_CHAR           },
        { ARC_LEXER_TOKEN_COLON_ID            , ARC_LEXER_TOKEN_COLON_CHAR             },
        { ARC_LEXER_TOKEN_SEMICOLON_ID        , ARC_LEXER_TOKEN_SEMICOLON_CHAR         },
        { ARC_LEXER_TOKEN_COMMA_ID            , ARC_LEXER_TOKEN_COMMA_CHAR             },
        { ARC_LEXER_TOKEN_PERIOD_ID           , ARC_LEXER_TOKEN_PERIOD_CHAR            },
        { ARC_LEXER_TOKEN_FORWARD_SLASH_ID    , ARC_LEXER_TOKEN_FORWARD_SLASH_CHAR     },
        { ARC_LEXER_TOKEN_BACK_SLASH_ID       , ARC_LEXER_TOKEN_BACK_SLASH_CHAR        },
        { ARC_LEXER_TOKEN_LEFT_PARENTHESIS_ID , ARC_LEXER_TOKEN_LEFT_PARENTHESIS_CHAR  },
        { ARC_LEXER_TOKEN_RIGHT_PARENTHESIS_ID, ARC_LEXER_TOKEN_RIGHT_PARENTHESIS_CHAR },
        { ARC_LEXER_TOKEN_LEFT_CURLY_BRACE_ID , ARC_LEXER_TOKEN_LEFT_CURLY_BRACE_CHAR  },
        { ARC_LEXER_TOKEN_RIGHT_CURLY_BRACE_ID, ARC_LEXER_TOKEN_RIGHT_CURLY_BRACE_CHAR },
        { ARC_LEXER_TOKEN_BANG_ID             , ARC_LEXER_TOKEN_BANG_CHAR              },
        { ARC_LEXER_TOKEN_AT_ID               , ARC_LEXER_TOKEN_AT_CHAR                },
        { ARC_LEXER_TOKEN_HASH_ID             , ARC_LEXER_TOKEN_HASH_CHAR              },
        { ARC_LEXER_TOKEN_PERCENT_ID          , ARC_LEXER_TOKEN_PERCENT_CHAR           }
    };
    const uint32_t singleCharRulesSize = sizeof(singleCharRules) / sizeof(singleCharRules[0]);

    for(uint32_t ruleIndex = 0; ruleIndex < singleCharRulesSize; ruleIndex++){
        ARC_Lexer_RegisterTokenRule(lexer, ARC_LexerTokenRule_CreateAndReturnMatchCharRule(singleCharRules[ruleIndex].id, singleCharRules[ruleIndex].character));
    }
}