mirror of
https://github.com/MeowLynxSea/ceditor.git
synced 2025-07-09 10:54:37 +00:00
增加了Tokenizer类
This commit is contained in:
parent
96e743fc26
commit
598041d6ff
0
components/TextEditor.h
Normal file
0
components/TextEditor.h
Normal file
91
data/highlight/cpp/keywords.txt
Normal file
91
data/highlight/cpp/keywords.txt
Normal file
@ -0,0 +1,91 @@
|
||||
asm
|
||||
auto
|
||||
bool
|
||||
break
|
||||
case
|
||||
catch
|
||||
char
|
||||
class
|
||||
const
|
||||
const_cast
|
||||
continue
|
||||
default
|
||||
delete
|
||||
do
|
||||
double
|
||||
dynamic_cast
|
||||
else
|
||||
enum
|
||||
explicit
|
||||
export
|
||||
extern
|
||||
false
|
||||
float
|
||||
for
|
||||
friend
|
||||
goto
|
||||
if
|
||||
inline
|
||||
int
|
||||
long
|
||||
mutable
|
||||
namespace
|
||||
new
|
||||
operator
|
||||
private
|
||||
protected
|
||||
public
|
||||
register
|
||||
reinterpret_cast
|
||||
return
|
||||
short
|
||||
signed
|
||||
sizeof
|
||||
static
|
||||
static_cast
|
||||
struct
|
||||
switch
|
||||
template
|
||||
this
|
||||
throw
|
||||
true
|
||||
try
|
||||
typedef
|
||||
typeid
|
||||
typename
|
||||
union
|
||||
unsigned
|
||||
using
|
||||
virtual
|
||||
void
|
||||
volatile
|
||||
wchar_t
|
||||
|
||||
//some useful keywords
|
||||
std
|
||||
string
|
||||
vector
|
||||
list
|
||||
map
|
||||
set
|
||||
deque
|
||||
stack
|
||||
queue
|
||||
bitset
|
||||
complex
|
||||
valarray
|
||||
pair
|
||||
tuple
|
||||
array
|
||||
shared_ptr
|
||||
unique_ptr
|
||||
weak_ptr
|
||||
function
|
||||
thread
|
||||
mutex
|
||||
condition_variable
|
||||
atomic
|
||||
future
|
||||
promise
|
||||
packaged_task
|
||||
thread_local
|
@ -88,7 +88,7 @@ public:
|
||||
}
|
||||
}
|
||||
|
||||
void erase(size_t index) override {
|
||||
void erase(size_t index) {
|
||||
if (index >= m_size) {
|
||||
throw std::out_of_range("Index out of range");
|
||||
}
|
||||
|
15
utils/SyntaxHighlighter.h
Normal file
15
utils/SyntaxHighlighter.h
Normal file
@ -0,0 +1,15 @@
|
||||
#ifndef SYNTAX_HIGHLIGHTER_H
|
||||
#define SYNTAX_HIGHLIGHTER_H
|
||||
|
||||
#include "Color.h"
|
||||
#include "RichText.h"
|
||||
#include <string>
|
||||
|
||||
// Placeholder for a rule-based syntax highlighter. So far it only stores
// the name of the highlight rule set; no highlighting logic exists yet.
class SyntaxHighlighter {
private:
    std::string ruleName;   // highlight rule set name, e.g. "cpp"
};
|
||||
|
||||
#endif // SYNTAX_HIGHLIGHTER_H
|
BIN
utils/Tokenizer
Normal file
BIN
utils/Tokenizer
Normal file
Binary file not shown.
289
utils/Tokenizer.h
Normal file
289
utils/Tokenizer.h
Normal file
@ -0,0 +1,289 @@
|
||||
#ifndef TOKENIZER_H
|
||||
#define TOKENIZER_H
|
||||
|
||||
#include <string>
|
||||
#include <stdio.h>
|
||||
#include <fstream>
|
||||
#include "../mystl/my_vector.h"
|
||||
|
||||
// Token categories produced by the tokenizer. Values start at 9999 so
// they can never collide with indices into the keyword / operator tables,
// which the scanner stores in the same `syn` integer.
enum class TokenType {
    TOKEN_TYPE_IDENTIFIER = 9999,              // user-defined name
    TOKEN_TYPE_NUMBER = 10000,                 // numeric literal
    TOKEN_TYPE_STRING = 10001,                 // "..." or '...' literal
    TOKEN_TYPE_RESERVE_WORD = 10002,           // keyword from keywords.txt
    TOKEN_TYPE_OPERATOR_OR_DELIMITER = 10003,  // symbol from the operator table
    TOKEN_TYPE_EOF = 10004,                    // end of input reached
    TOKEN_TYPE_UNDEFINED = 10005,              // unrecognized character
    TOKEN_TYPE_COMMENT = 10006                 // // or /* */ comment
};
|
||||
|
||||
struct Token {
|
||||
std::string value;
|
||||
TokenType type;
|
||||
|
||||
Token() {}
|
||||
Token(std::string value, TokenType type) {
|
||||
this->value = value;
|
||||
this->type = type;
|
||||
}
|
||||
};
|
||||
|
||||
// Operator / delimiter lookup table. The scanner reports a match as
// (table index + keyword count); tokenize() only tests the resulting
// range, so entries may be grouped freely as long as none is removed.
static const std::string operatorOrDelimiter[] = {
    // single-character symbols
    "+", "-", "*", "/", "<", "=", ">", ";", "(", ")", "^", ",", "\'", "#",
    "&", "|", "%", "~", "[", "]", "{", "}", "\\", ".", "\?", ":", "!", "\"",
    // multi-character symbols (longest-match is attempted first by Scan)
    "<=", ">=", "==", "!=", "&&", "<<", ">>", "||", "++", "--", "+=", "-=",
    "*=", "/=", "%=", "|=", "&=", "^=", "<<=", ">>=", "::", "->"};
|
||||
|
||||
class Tokenizer {
|
||||
private:
|
||||
std::string ruleName;
|
||||
MyVector<std::string> reserveWord;
|
||||
std::string rawText, preprocessedText;
|
||||
MyVector<Token> tokens;
|
||||
std::string currentToken = "";
|
||||
int reserveWordCount, operatorAndDelimiterCount;
|
||||
int syn = -1;
|
||||
|
||||
int searchReserveWord(std::string word) {
|
||||
for(int i = 0; i < reserveWord.size(); i++) {
|
||||
if (word == reserveWord[i]) {
|
||||
return i;
|
||||
}
|
||||
}
|
||||
return -1;
|
||||
}
|
||||
|
||||
bool isLetter(char c) {
|
||||
if(c >= 'a' && c <= 'z' || c >= 'A' && c <= 'Z' || c == '_') {
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
bool isDigit(char c) {
|
||||
if(c >= '0' && c <= '9') {
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
int isOperatorOrDelimiter(std::string str) {
|
||||
for(int i = 0; i < operatorAndDelimiterCount; i++) {
|
||||
if(str == operatorOrDelimiter[i]) {
|
||||
return i + reserveWordCount;
|
||||
}
|
||||
}
|
||||
return -1;
|
||||
}
|
||||
|
||||
bool maybeOperatorOrDelimiterWith2Letters(char c) {
|
||||
for(int i = 0; i < operatorAndDelimiterCount; i++) {
|
||||
if(c == operatorOrDelimiter[i][0] && operatorOrDelimiter[i].size() > 1) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
void preprocess() {
|
||||
preprocessedText = "";
|
||||
bool inString = false;
|
||||
for(int i = 0; i < rawText.size(); i++) {
|
||||
if(rawText[i] == '/') {
|
||||
if(i < rawText.size() - 1 && rawText[i + 1] == '/')
|
||||
while(i < rawText.size() && rawText[i] != '\n') {
|
||||
++i;
|
||||
}
|
||||
}
|
||||
if(rawText[i] == '/') {
|
||||
if(i < rawText.size() - 1 && rawText[i + 1] == '*') {
|
||||
i += 2;
|
||||
while(i < rawText.size() - 1 && !(rawText[i] == '*' && rawText[i + 1] == '/')) {
|
||||
++i;
|
||||
}
|
||||
i += 2;
|
||||
}
|
||||
}
|
||||
if(rawText[i] == '\"') {
|
||||
inString = !inString;
|
||||
}
|
||||
if(!inString && rawText[i] == '\n') {
|
||||
preprocessedText += ' ';
|
||||
continue;
|
||||
}
|
||||
if (rawText[i] != '\t' && rawText[i] != '\v' && rawText[i] != '\r') {
|
||||
preprocessedText += rawText[i];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void Scan(int & currentIndex) {
|
||||
currentToken = "";
|
||||
|
||||
while(preprocessedText[currentIndex] == ' ') currentIndex++;
|
||||
|
||||
// printf("current letter: [%d]%c\n", currentIndex, preprocessedText[currentIndex]);
|
||||
|
||||
if(isLetter(preprocessedText[currentIndex])) {
|
||||
while(isLetter(preprocessedText[currentIndex]) || isDigit(preprocessedText[currentIndex])) {
|
||||
currentToken += preprocessedText[currentIndex++];
|
||||
}
|
||||
|
||||
syn = searchReserveWord(currentToken);
|
||||
syn = syn == -1 ? static_cast<int>(TokenType::TOKEN_TYPE_IDENTIFIER) : syn;
|
||||
|
||||
return;
|
||||
} else if(isDigit(preprocessedText[currentIndex])) {
|
||||
while(isDigit(preprocessedText[currentIndex])) {
|
||||
currentToken += preprocessedText[currentIndex++];
|
||||
}
|
||||
|
||||
syn = static_cast<int>(TokenType::TOKEN_TYPE_NUMBER);
|
||||
|
||||
return;
|
||||
} else if((isOperatorOrDelimiter(std::string(1, preprocessedText[currentIndex])) != -1) &&
|
||||
!maybeOperatorOrDelimiterWith2Letters(preprocessedText[currentIndex])) {
|
||||
if(preprocessedText[currentIndex] == '\"') {
|
||||
currentToken += preprocessedText[currentIndex++];
|
||||
while(preprocessedText[currentIndex] != '\"') {
|
||||
currentToken += preprocessedText[currentIndex++];
|
||||
}
|
||||
currentToken += preprocessedText[currentIndex++];
|
||||
syn = static_cast<int>(TokenType::TOKEN_TYPE_STRING);
|
||||
return;
|
||||
}
|
||||
if(preprocessedText[currentIndex] == '\'') {
|
||||
currentToken += preprocessedText[currentIndex++];
|
||||
while(preprocessedText[currentIndex] != '\'') {
|
||||
currentToken += preprocessedText[currentIndex++];
|
||||
}
|
||||
currentToken += preprocessedText[currentIndex++];
|
||||
syn = static_cast<int>(TokenType::TOKEN_TYPE_STRING);
|
||||
return;
|
||||
}
|
||||
currentToken += preprocessedText[currentIndex++];
|
||||
syn = isOperatorOrDelimiter(currentToken);
|
||||
return;
|
||||
} else if(maybeOperatorOrDelimiterWith2Letters(preprocessedText[currentIndex])) {
|
||||
if(currentIndex < preprocessedText.size() - 2) { // 优先匹配三个字母的符号
|
||||
currentToken += preprocessedText[currentIndex];
|
||||
currentToken += preprocessedText[currentIndex + 1];
|
||||
currentToken += preprocessedText[currentIndex + 2];
|
||||
syn = isOperatorOrDelimiter(currentToken);
|
||||
if(syn != -1) {
|
||||
currentIndex += 3;
|
||||
return;
|
||||
}
|
||||
}
|
||||
currentToken = "";
|
||||
if (currentIndex < preprocessedText.size() - 1) { // 其次匹配两个字母的符号
|
||||
currentToken += preprocessedText[currentIndex];
|
||||
currentToken += preprocessedText[currentIndex + 1];
|
||||
if(currentToken == "//") {
|
||||
syn = static_cast<int>(TokenType::TOKEN_TYPE_COMMENT);
|
||||
currentIndex += 2;
|
||||
while(preprocessedText[currentIndex] != '\n' && currentIndex < preprocessedText.size()) {
|
||||
currentToken += preprocessedText[currentIndex++];
|
||||
}
|
||||
return;
|
||||
}
|
||||
if(currentToken == "/*") {
|
||||
syn = static_cast<int>(TokenType::TOKEN_TYPE_COMMENT);
|
||||
currentIndex += 2;
|
||||
while(currentIndex < rawText.size() - 1 && !(preprocessedText[currentIndex] == '*' && preprocessedText[currentIndex + 1] == '/')) {
|
||||
currentToken += preprocessedText[currentIndex++];
|
||||
}
|
||||
currentIndex += 2;
|
||||
return;
|
||||
}
|
||||
syn = isOperatorOrDelimiter(currentToken);
|
||||
if(syn != -1) {
|
||||
currentIndex += 2;
|
||||
return;
|
||||
}
|
||||
}
|
||||
currentToken = "";
|
||||
currentToken += preprocessedText[currentIndex];
|
||||
syn = isOperatorOrDelimiter(currentToken);
|
||||
if(syn != -1) {
|
||||
currentIndex += 1;
|
||||
}
|
||||
return;
|
||||
} else if (preprocessedText[currentIndex] == '\0' || currentIndex >= preprocessedText.size()) {
|
||||
syn = static_cast<int>(TokenType::TOKEN_TYPE_EOF);
|
||||
currentIndex++;
|
||||
return;
|
||||
} else if(preprocessedText[currentIndex] != '\n') {
|
||||
syn = static_cast<int>(TokenType::TOKEN_TYPE_UNDEFINED);
|
||||
currentIndex++;
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
public:
|
||||
Tokenizer(std::string ruleName) {
|
||||
this->ruleName = ruleName;
|
||||
std::ifstream file("../data/highlight/" + ruleName + "/keywords.txt"); // Just for unit test
|
||||
// std::ifstream file("./data/highlight/" + ruleName + "/keywords.txt");
|
||||
if (file.is_open()) {
|
||||
// printf("keywords file open\n");
|
||||
std::string line;
|
||||
while (std::getline(file, line)) {
|
||||
reserveWord.push_back(line);
|
||||
}
|
||||
file.close();
|
||||
}
|
||||
// printf("keywords size: %llu\n", reserveWord.size());
|
||||
// for(int i = 0; i < reserveWord.size(); i++) {
|
||||
// printf("keywords[%d]: %s\n", i, reserveWord[i].c_str());
|
||||
// }
|
||||
reserveWordCount = reserveWord.size();
|
||||
operatorAndDelimiterCount = sizeof(operatorOrDelimiter) / sizeof(operatorOrDelimiter[0]);
|
||||
}
|
||||
|
||||
void setRawText(std::string rawText) {
|
||||
this->rawText = rawText;
|
||||
preprocess();
|
||||
// printf("preprocessed text: %s\n", preprocessedText.c_str());
|
||||
}
|
||||
|
||||
void printProcessedText() {
|
||||
printf("preprocessed text: %s\n", preprocessedText.c_str());
|
||||
}
|
||||
|
||||
MyVector<Token> tokenize() {
|
||||
syn = -1;
|
||||
int currentIndex = 0;
|
||||
tokens.clear();
|
||||
while(syn != static_cast<int>(TokenType::TOKEN_TYPE_EOF) && syn != static_cast<int>(TokenType::TOKEN_TYPE_UNDEFINED)) {
|
||||
Scan(currentIndex);
|
||||
// printf("currentToken: [%s]\n", currentToken.c_str());
|
||||
if(currentToken == "") continue;
|
||||
if(syn == static_cast<int>(TokenType::TOKEN_TYPE_STRING)) {
|
||||
tokens.push_back(Token(currentToken, TokenType::TOKEN_TYPE_STRING));
|
||||
// printf("string: %s\n", currentToken.c_str());
|
||||
} else if(syn == static_cast<int>(TokenType::TOKEN_TYPE_IDENTIFIER)) {
|
||||
tokens.push_back(Token(currentToken, TokenType::TOKEN_TYPE_IDENTIFIER));
|
||||
// printf("identifier: %s\n", currentToken.c_str());
|
||||
} else if(syn == static_cast<int>(TokenType::TOKEN_TYPE_NUMBER)) {
|
||||
tokens.push_back(Token(currentToken, TokenType::TOKEN_TYPE_NUMBER));
|
||||
// printf("number: %s\n", currentToken.c_str());
|
||||
} else if(syn > 0 && syn < reserveWordCount) {
|
||||
tokens.push_back(Token(currentToken, TokenType::TOKEN_TYPE_RESERVE_WORD));
|
||||
// printf("reserve word: %s\n", currentToken.c_str());
|
||||
} else if(syn >= reserveWordCount && syn < reserveWordCount + operatorAndDelimiterCount) {
|
||||
tokens.push_back(Token(currentToken, TokenType::TOKEN_TYPE_OPERATOR_OR_DELIMITER));
|
||||
// printf("operator or delimiter: %s\n", currentToken.c_str());
|
||||
} else if(syn == static_cast<int>(TokenType::TOKEN_TYPE_COMMENT)) {
|
||||
tokens.push_back(Token(currentToken, TokenType::TOKEN_TYPE_COMMENT));
|
||||
// printf("comment: %s\n", currentToken.c_str());
|
||||
} else {
|
||||
tokens.push_back(Token(currentToken, TokenType::TOKEN_TYPE_UNDEFINED));
|
||||
// printf("undefined: %s\n", currentToken.c_str());
|
||||
}
|
||||
if(currentIndex >= preprocessedText.length()) {
|
||||
syn = static_cast<int>(TokenType::TOKEN_TYPE_EOF);
|
||||
}
|
||||
}
|
||||
return tokens;
|
||||
}
|
||||
};
|
||||
|
||||
#endif // TOKENIZER_H 目前仅支持C/C++
|
48
utils/Tokenizer_test.cpp
Normal file
48
utils/Tokenizer_test.cpp
Normal file
@ -0,0 +1,48 @@
|
||||
#include "Tokenizer.h"
|
||||
#include <iostream>
|
||||
#include <string>
|
||||
#include <iomanip>
|
||||
|
||||
int main() {
|
||||
std::string input = "int main()\n\
|
||||
{\n\
|
||||
int a=-5,b=4,j; //this is an inline comment\n\
|
||||
if(a >= b)\n\
|
||||
j++;\n\
|
||||
/*\n\
|
||||
This is a block comment\n\
|
||||
*/\n\
|
||||
string str=\"test str\";\n\
|
||||
j = a - b;\n\
|
||||
else j=b-a;\n\
|
||||
return j;\n\
|
||||
}";
|
||||
std::string ruleName = "cpp";
|
||||
Tokenizer tokenizer = Tokenizer(ruleName);
|
||||
tokenizer.setRawText(input);
|
||||
tokenizer.printProcessedText();
|
||||
MyVector<Token> tokens = tokenizer.tokenize();
|
||||
std::cout << "Tokenized text: " << std::endl;
|
||||
for (int i = 0; i < tokens.size(); i++) {
|
||||
if(tokens[i].type == TokenType::TOKEN_TYPE_IDENTIFIER) {
|
||||
std::cout << std::fixed << std::setw(25) << std::setfill(' ') << std::right << "Identifier: ";
|
||||
} else if(tokens[i].type == TokenType::TOKEN_TYPE_NUMBER) {
|
||||
std::cout << std::fixed << std::setw(25) << std::setfill(' ') << std::right << "Number: ";
|
||||
} else if(tokens[i].type == TokenType::TOKEN_TYPE_OPERATOR_OR_DELIMITER) {
|
||||
std::cout << std::fixed << std::setw(25) << std::setfill(' ') << std::right << "Operator or Delimiter: ";
|
||||
} else if(tokens[i].type == TokenType::TOKEN_TYPE_STRING) {
|
||||
std::cout << std::fixed << std::setw(25) << std::setfill(' ') << std::right << "String: ";
|
||||
} else if(tokens[i].type == TokenType::TOKEN_TYPE_RESERVE_WORD) {
|
||||
std::cout << std::fixed << std::setw(25) << std::setfill(' ') << std::right << "Reserve word: ";
|
||||
} else if(tokens[i].type == TokenType::TOKEN_TYPE_EOF) {
|
||||
std::cout << std::fixed << std::setw(25) << std::setfill(' ') << std::right << "End of file.";
|
||||
} else if(tokens[i].type == TokenType::TOKEN_TYPE_COMMENT) {
|
||||
std::cout << std::fixed << std::setw(25) << std::setfill(' ') << std::right << "Comment: ";
|
||||
} else {
|
||||
std::cout << std::fixed << std::setw(25) << std::setfill(' ') << std::right << "Unknown token: ";
|
||||
}
|
||||
std::cout << " " << tokens[i].value << std::endl;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
Loading…
Reference in New Issue
Block a user