older lexer stuff

2024-10-16 17:35:38 -06:00 · 2024-10-16 17:35:38 -06:00 · 380e74a0e6
commit 380e74a0e6
parent 7bd7cc4aa5
3 changed files with 211 additions and 72 deletions
--- a/include/arc/std/lexer.h
+++ b/include/arc/std/lexer.h
@ -5,7 +5,6 @@
 extern "C" {
 #endif

-#include "arc/std/bool.h"
 #include "arc/std/string.h"
 #include <stdint.h>

@ -25,16 +24,15 @@ typedef struct ARC_LexerToken {
 /**
 * @brief checks to see if a string is a type of token
 *
- * @note do not set tokenData if this function returns ARC_False, doing so will create a memory leak
+ * @note do not set tokenData if this function returns 0, doing so will create a memory leak
 *
- * @param[in/out] string       a string to be checked to see if it matches a token,
- *                             this needs to srip the token out for the lexer to avoid an infinite loop
- * @param[out]    tokenData    a place to store token data (like a variable name), can be NULL if not needed
- * @param[in]     automataData any data that needs to be used for the ARC_Lexer_AutomataFn
+ * @param[out] tokenData    a place to store token data (like a variable name), can be NULL if not needed
+ * @param[in]  string       a string to be checked to see if it matches a token
+ * @param[in]  automataData any data that needs to be used for the ARC_Lexer_AutomataFn
 *
- * @return if a token was successfully found ARC_True, otherwise ARC_False
+ * @return the size of the token found, or 0 if the token was not found
 */
-typedef ARC_Bool (* ARC_LexerTokenRule_AutomataFn)(ARC_String **string, ARC_String **tokenData, void *automataData);
+typedef uint32_t (* ARC_LexerTokenRule_AutomataFn)(ARC_String **tokenData, ARC_String *string, void *automataData);

 /**
 * @brief a callback function to clean up ARC_LexerTokenRule's automataData
@ -124,14 +122,13 @@ uint32_t ARC_Lexer_GetTokensSize(ARC_Lexer *lexer);
 * @note this is intended as a helper callback
 * @note this function is a ARC_Lexer_AutomataFn callback
 *
- * @param[in/out] string       a string to be checked to see if it matches a token,
- *                             this needs to srip the token out for the lexer to avoid an infinite loop
- * @param[out]    tokenData    a place to store token data (like a variable name), can be NULL if not needed
- * @param[in]     automataData any data that needs to be used for the ARC_Lexer_AutomataFn
+ * @param[out] tokenData    a place to store token data (like a variable name), can be NULL if not needed
+ * @param[in]  string       a string to be checked to see if it matches a token
+ * @param[in]  automataData any data that needs to be used for the ARC_Lexer_AutomataFn
 *
- * @return if a token was successfully found ARC_True, otherwise ARC_False
+ * @return the size of the token found, or 0 if the token was not found
 */
-ARC_Bool ARC_Lexer_AutomataMatchCharFn(ARC_String **string, ARC_String **tokenData, void *automataData);
+uint32_t ARC_Lexer_AutomataMatchCharFn(ARC_String **tokenData, ARC_String *string, void *automataData);

 /**
 * @brief checks if the substring automataData as an ARC_String matches the first part of string
@ -139,14 +136,27 @@ ARC_Bool ARC_Lexer_AutomataMatchCharFn(ARC_String **string, ARC_String **tokenDa
 * @note this is intended as a helper callback
 * @note this function is a ARC_Lexer_AutomataFn callback
 *
- * @param[in/out] string       a string to be checked to see if it matches a token,
- *                             this needs to srip the token out for the lexer to avoid an infinite loop
- * @param[out]    tokenData    a place to store token data (like a variable name), can be NULL if not needed
- * @param[in]     automataData any data that needs to be used for the ARC_Lexer_AutomataFn
+ * @param[out] tokenData    a place to store token data (like a variable name), can be NULL if not needed
+ * @param[in]  string       a string to be checked to see if it matches a token
+ * @param[in]  automataData any data that needs to be used for the ARC_Lexer_AutomataFn
 *
- * @return if a token was successfully found ARC_True, otherwise ARC_False
+ * @return the size of the token found, or 0 if the token was not found
 */
-ARC_Bool ARC_Lexer_AutomataMatchStringFn(ARC_String **string, ARC_String **tokenData, void *automataData);
+uint32_t ARC_Lexer_AutomataMatchStringFn(ARC_String **tokenData, ARC_String *string, void *automataData);
+
+/**
+ * @brief checks if the first character of string matches any character in automataData as an ARC_String
+ *
+ * @note this is intended as a helper callback
+ * @note this function is a ARC_Lexer_AutomataFn callback
+ *
+ * @param[out] tokenData    a place to store token data (like a variable name), can be NULL if not needed
+ * @param[in]  string       a string to be checked to see if it matches a token
+ * @param[in]  automataData any data that needs to be used for the ARC_Lexer_AutomataFn
+ *
+ * @return the size of the token found, or 0 if the token was not found
+*/
+uint32_t ARC_Lexer_AutomataMatchCharInStringFn(ARC_String **tokenData, ARC_String *string, void *automataData);

 /**
 * @brief creates a ARC_LexerTokenRule with a given id and character
@ -173,6 +183,28 @@ ARC_LexerTokenRule ARC_LexerTokenRule_CreateAndReturnMatchCharRule(uint32_t id,
 */
 ARC_LexerTokenRule ARC_LexerTokenRule_CreateAndReturnMatchStringRule(uint32_t id, ARC_String *string);

+/**
+ * @brief creates a ARC_LexerTokenRule with a given id that matches any single character found in string
+ *
+ * @note this is intended as a helper function
+ * @note string will not be freed (it will be copied and the copy will be freed)
+ *
+ * @param[in] id     a tokens id (basically the token value)
+ * @param[in] string the characters to match against, will be copied
+ *
+ * @return a token rule based on the id and string
+*/
+ARC_LexerTokenRule ARC_LexerTokenRule_CreateAndReturnMatchCharInStringRule(uint32_t id, ARC_String *string);
+
+/**
+ * @brief basic tokens
+*/
+#define ARC_LEXER_TOKEN_NULL       0
+#define ARC_LEXER_TOKEN_EOF        1
+#define ARC_LEXER_TOKEN_NUMBER     2
+#define ARC_LEXER_TOKEN_ALPHACHAR  3
+#define ARC_LEXER_TOKEN_WHITESPACE 4
+
 /**
 * @brief basic token type ids, chars, and tags
 */
--- a/include/arc/std/parser.h
+++ b/include/arc/std/parser.h
@ -0,0 +1,63 @@
+#ifndef ARC_STD_PARSER_H_
+#define ARC_STD_PARSER_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "arc/std/string.h"
+
+/**
+ * @brief a parser type
+*/
+typedef struct ARC_Parser ARC_Parser;
+
+/**
+ * @brief a parser node
+*/
+typedef struct ARC_ParserNode ARC_ParserNode;
+
+/**
+ * @brief creates an ARC_Parser type
+ *
+ * @param[out] parser
+ * @param[in]  language ..., can be NULL
+*/
+void ARC_Parser_Create(ARC_Parser **parser, ARC_String *language);
+
+/**
+ * @brief destroys an ARC_Parser type
+ *
+ * @param[in] parser ARC_Parser to free
+*/
+void ARC_Parser_Destroy(ARC_Parser *parser);
+
+/**
+ * @brief sets the definition of the parser, the language itself is parsed and will throw an error if invalid
+ *
+ * @param[in] parser   ARC_Parser to set the language to
+ * @param[in] language the language as a string the parser should use
+*/
+void ARC_Parser_SetLanguage(ARC_Parser *parser, ARC_String *language);
+
+/**
+ * @brief parses the given data with the parser
+ *
+ * @param[in] parser ARC_Parser to parse the data with
+ * @param[in] data   the data as a string to be parsed
+*/
+void ARC_Parser_Parse(ARC_Parser *parser, ARC_String *data);
+
+/**
+ * @brief parses the file at the given path with the parser
+ *
+ * @param[in] parser ARC_Parser to parse the file with
+ * @param[in] path   the path of the file to be parsed
+*/
+void ARC_Parser_ParseFile(ARC_Parser *parser, ARC_String *path);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif // !ARC_STD_PARSER_H_
--- a/src/std/lexer.c
+++ b/src/std/lexer.c
@ -86,53 +86,74 @@ void ARC_Lexer_LexString(ARC_Lexer *lexer, ARC_String **data){

    //this will run untill everything token is stripped or there is an error
    while(*data != NULL){
-        ARC_Bool tokenFound = ARC_False;
+        uint32_t tokenLength = 0;
+        uint32_t lastTokenLength = 0;
+        ARC_LexerToken *token = NULL;
+
        for(uint32_t index = 0; index < ARC_Vector_GetSize(lexer->tokenRules); index++){
            //check if the token rule is found
            ARC_LexerTokenRule *tokenRule = ARC_Vector_Get(lexer->tokenRules, index);

-            //tokenData should only exist if tokenFound is ARC_True as stated in the header
+            //set the last token length if the last token had a length
+            if(tokenLength > 0){
+                lastTokenLength = tokenLength;
+            }
+
+            //tokenData should only exist if tokenLength is greater than 0 as stated in the header
            ARC_String *tokenData;
-            tokenFound = tokenRule->automataFn(data, &tokenData, tokenRule->automataData);
+            tokenLength = tokenRule->automataFn(&tokenData, *data, tokenRule->automataData);

            //check if a token was found if it wasn't continue. I'm doing this to try to cut down on the ammount of indentation
-            if(tokenFound != ARC_True){
+            if(tokenLength == 0){
                continue;
            }

-            //create the token to add
-            ARC_LexerToken *token = (ARC_LexerToken *)malloc(sizeof(ARC_LexerToken));
-            token->rule = tokenRule->id;
-            token->data = tokenData;
+            //check to see if we found a better match
+            if(tokenLength > lastTokenLength){
+                //free the current token if it exists
+                if(token != NULL){
+                    ARC_LexerTokenRule_VectorDestroyDataFn((void *)token);
+                }

-            //add to the vector and check for error (I'd be surprised if the error ever happened because that would most likely mean overflow)
-            ARC_Vector_Add(lexer->tokens, (void *)token);
-            if(arc_errno){
-                ARC_DEBUG_LOG_ERROR("ARC_Lexer_LexString(lexer, data), errored when running ARC_Vector_Add(lexer->tokens, token);. check logs for more info");
-                free(token);
-
-                //clean up errored string
-                ARC_String_Destroy(*data);
-                *data = NULL;
-                return;
+                //create the token to add
+                token = (ARC_LexerToken *)malloc(sizeof(ARC_LexerToken));
+                token->rule = tokenRule->id;
+                token->data = tokenData;
            }
-
-            //the token was added, so break to start checking tokens again
-            break;
        }

        //if no token was found, throw an error
-        if(tokenFound == ARC_False){
+        if(token == NULL){
            arc_errno = ARC_ERRNO_DATA;
            ARC_DEBUG_LOG_ERROR_WITH_VARIABLES("ARC_Lexer_LexString(lexer, data), no tokens found with current string: \"%s\"", (*data)->data);

            //clean up errored string
            ARC_String_Destroy(*data);
            *data = NULL;
-
-            //TODO: might want to do smthn with already tokened data
            return;
        }
+
+        //token exists (something must have gone very wrong if it doesn't), so add it and check for overflow (which I'd be surprised if that happens)
+        ARC_Vector_Add(lexer->tokens, (void *)token);
+        if(arc_errno){
+            ARC_DEBUG_LOG_ERROR("ARC_Lexer_LexString(lexer, data), errored when running ARC_Vector_Add(lexer->tokens, token);. check logs for more info");
+            free(token);
+
+            //clean up errored string
+            ARC_String_Destroy(*data);
+            *data = NULL;
+            return;
+        }
+
+        //if the last token was found, destroy the string and return
+        if(lastTokenLength == (*data)->length){
+            ARC_String_Destroy(*data);
+            *data = NULL;
+            return;
+        }
+
+        //strip the string
+        ARC_String_ReplaceWithSubstring(data, lastTokenLength, (*data)->length - lastTokenLength);
    }
 }

@ -177,49 +198,50 @@ uint32_t ARC_Lexer_GetTokensSize(ARC_Lexer *lexer){
    return ARC_Vector_GetSize(lexer->tokens);
 }

-ARC_Bool ARC_Lexer_AutomataMatchCharFn(ARC_String **string, ARC_String **tokenData, void *automataData){
+uint32_t ARC_Lexer_AutomataMatchCharFn(ARC_String **tokenData, ARC_String *string, void *automataData){
    //if there is a match the token will be the same as automataData, so we don't need to store it again
    *tokenData = NULL;

    //check to see if there is a match with automataData as a char
-    if((*string)->data[0] == *(char *)automataData){
-        //to keep from erroring instead of stripping from a 1 character string we can just delete it
-        if((*string)->length == 1){
-            ARC_String_Destroy(*string);
-            *string = NULL;
-            return ARC_True;
-        }
-
-        //strip the charater from the front of the string and return that a match was found
-        ARC_String_ReplaceWithSubstring(string, 1, (*string)->length - 1);
-        return ARC_True;
+    if(string->data[0] == *(char *)automataData){
+        //return the token was found of length 1
+        return 1;
    }

    //no match was found
-    return ARC_False;
+    return 0;
 }

-ARC_Bool ARC_Lexer_AutomataMatchStringFn(ARC_String **string, ARC_String **tokenData, void *automataData){
+uint32_t ARC_Lexer_AutomataMatchStringFn(ARC_String **tokenData, ARC_String *string, void *automataData){
    //if there is a match the token will be the same as automataData, so we don't need to store it again
    *tokenData = NULL;

    //check to see if there is a match with automataData as a string
    ARC_String *automataDataString = (ARC_String *)automataData;
-
-    //to keep from erroring instead of stripping from a same length string we can just delete it
-    if(ARC_String_Equals(*string, automataDataString)){
-        if((*string)->length == automataDataString->length){
-            ARC_String_Destroy(*string);
-            *string = NULL;
-        }
-
-        //strip the token string from the front of the string and return that a match was found
-        ARC_String_ReplaceWithSubstring(string, automataDataString->length, (*string)->length - automataDataString->length);
-        return ARC_True;
+    if(ARC_String_SubstringEquals(string, 0, automataDataString)){
+        //return the token was found of the string length
+        return automataDataString->length;
    }

    //no match was found
-    return ARC_False;
+    return 0;
+}
+
+uint32_t ARC_Lexer_AutomataMatchCharInStringFn(ARC_String **tokenData, ARC_String *string, void *automataData){
+    //no token data is stored for this rule (only the matched length is returned), so clear tokenData
+    *tokenData = NULL;
+
+    //check to see if there is a char match in automataData as a string
+    ARC_String *automataDataString = (ARC_String *)automataData;
+    for(uint64_t index = 0; index < automataDataString->length; index++){
+        if(string->data[0] == automataDataString->data[index]){
+            //return the token was found in the string of length 1
+            return 1;
+        }
+    }
+
+    //no match was found
+    return 0;
 }

 //private function to free automataData stored as a char
@ -266,8 +288,30 @@ ARC_LexerTokenRule ARC_LexerTokenRule_CreateAndReturnMatchStringRule(uint32_t id
    ARC_String_Copy(&automataData, string);
    tokenRule.automataData = (void *)automataData;

-    //we can use the ARC_Lexer_AutomataMatchCharFn for this
-    tokenRule.automataFn = ARC_Lexer_AutomataMatchCharFn;
+    //we can use the ARC_Lexer_AutomataMatchStringFn for this
+    tokenRule.automataFn = ARC_Lexer_AutomataMatchStringFn;
+
+    //add the private destroy function
+    tokenRule.destroyAutomataDataFn = ARC_LexerTokenRule_DestroyStringAutomataDataFn;
+
+    //return the created tokenRule
+    return tokenRule;
+}
+
+ARC_LexerTokenRule ARC_LexerTokenRule_CreateAndReturnMatchCharInStringRule(uint32_t id, ARC_String *string){
+    //create the token rule
+    ARC_LexerTokenRule tokenRule;
+
+    //set the id
+    tokenRule.id = id;
+
+    //copy and store the automataData (which is just an ARC_String)
+    ARC_String *automataData;
+    ARC_String_Copy(&automataData, string);
+    tokenRule.automataData = (void *)automataData;
+
+    //we can use the ARC_Lexer_AutomataMatchCharInStringFn for this
+    tokenRule.automataFn = ARC_Lexer_AutomataMatchCharInStringFn;

    //add the private destroy function
    tokenRule.destroyAutomataDataFn = ARC_LexerTokenRule_DestroyStringAutomataDataFn;