LexicalAnalysis类现在支持注释、空格和换行了

2025-07-09 10:54:37 +00:00 · 2024-12-14 22:45:49 +08:00 · 2024-12-14 22:45:49 +08:00 · fafb31ac7f
commit fafb31ac7f
parent 598041d6ff
3 changed files with 35 additions and 14 deletions
--- a/utils/LexicalAnalysis.cpp
+++ b/utils/LexicalAnalysis.cpp
@ -1,4 +1,4 @@
-#include "Tokenizer.h"
+#include "LexicalAnalysis.h"
 #include <iostream>
 #include <string>
 #include <iomanip>
@ -18,10 +18,10 @@ else  j=b-a;\n\
    return j;\n\
 }";
    std::string ruleName = "cpp";
-    Tokenizer tokenizer = Tokenizer(ruleName);
-    tokenizer.setRawText(input);
-    tokenizer.printProcessedText();
-    MyVector<Token> tokens = tokenizer.tokenize();
+    LexicalAnalysis lexicalAnalysis = LexicalAnalysis(ruleName);
+    lexicalAnalysis.setRawText(input);
+    lexicalAnalysis.printProcessedText();
+    MyVector<Token> tokens = lexicalAnalysis.tokenize();
    std::cout << "Tokenized text: " << std::endl;
    for (int i = 0; i < tokens.size(); i++) {
        if(tokens[i].type == TokenType::TOKEN_TYPE_IDENTIFIER) {
@ -38,11 +38,16 @@ else  j=b-a;\n\
            std::cout << std::fixed << std::setw(25) << std::setfill(' ') << std::right << "End of file.";
        } else if(tokens[i].type == TokenType::TOKEN_TYPE_COMMENT) {
            std::cout << std::fixed << std::setw(25) << std::setfill(' ') << std::right << "Comment: ";
+        } else if(tokens[i].type == TokenType::TOKEN_TYPE_NEWLINE) {
+            std::cout << std::fixed << std::setw(25) << std::setfill(' ') << std::right << "Newline  ";
        } else {
            std::cout << std::fixed << std::setw(25) << std::setfill(' ') << std::right << "Unknown token: ";
        }
        std::cout << " " << tokens[i].value << std::endl;
    }
+    for (int i = 0; i < tokens.size(); i++) {
+        std::cout << tokens[i].value;
+    }

    return 0;
 }
--- a/utils/LexicalAnalysis.h
+++ b/utils/LexicalAnalysis.h
@ -1,5 +1,5 @@
-#ifndef TOKENIZER_H
-#define TOKENIZER_H
+#ifndef LEXICALANALYSIS_H
+#define LEXICALANALYSIS_H

 #include <string>
 #include <stdio.h>
@ -14,7 +14,8 @@ enum class TokenType {
    TOKEN_TYPE_OPERATOR_OR_DELIMITER,
    TOKEN_TYPE_EOF,
    TOKEN_TYPE_UNDEFINED,
-    TOKEN_TYPE_COMMENT
+    TOKEN_TYPE_COMMENT,
+    TOKEN_TYPE_NEWLINE
 };

 struct Token {
@ -31,7 +32,7 @@ struct Token {
 static const std::string operatorOrDelimiter[] = {"+", "-", "*", "/", "<", "=", ">", ";", "(", ")", "^", ",", "\'", "#", "&", "|", "%", "~", "[", "]", "{", "}", "\\", ".", "\?", ":", "!", "\"", 
    "<=", ">=", "==", "!=", "&&", "<<", ">>", "||", "++", "--", "+=", "-=", "*=", "/=", "%=", "|=", "&=", "^=", "<<=", ">>=", "::", "->"};

-class Tokenizer {
+class LexicalAnalysis {
 private:
    std::string ruleName;
    MyVector<std::string> reserveWord;
@ -83,6 +84,8 @@ private:
    }
    
    void preprocess() {
+        preprocessedText = rawText;
+        return;
        preprocessedText = "";
        bool inString = false;
        for(int i = 0; i < rawText.size(); i++) {
@ -117,7 +120,9 @@ private:
    void Scan(int & currentIndex) {
        currentToken = "";

-        while(preprocessedText[currentIndex] == ' ') currentIndex++;
+        while(preprocessedText[currentIndex] == ' ') {
+            currentToken += preprocessedText[currentIndex++];
+        }

        // printf("current letter: [%d]%c\n", currentIndex, preprocessedText[currentIndex]);

@ -190,6 +195,7 @@ private:
                    while(currentIndex < rawText.size() - 1 && !(preprocessedText[currentIndex] == '*' && preprocessedText[currentIndex + 1] == '/')) {
                        currentToken += preprocessedText[currentIndex++];
                    }
+                    currentToken += "/*";
                    currentIndex += 2;
                    return;
                }
@ -210,7 +216,12 @@ private:
            syn = static_cast<int>(TokenType::TOKEN_TYPE_EOF);
            currentIndex++;
            return;
-        } else if(preprocessedText[currentIndex] != '\n') {
+        } else if(preprocessedText[currentIndex] == '\n') {
+            syn = static_cast<int>(TokenType::TOKEN_TYPE_NEWLINE);
+            currentToken = "\n";
+            currentIndex++;
+            return;
+        } else {
            syn = static_cast<int>(TokenType::TOKEN_TYPE_UNDEFINED);
            currentIndex++;
            return;
@ -218,7 +229,7 @@ private:
    }

 public:
-    Tokenizer(std::string ruleName) {
+    LexicalAnalysis(std::string ruleName) {
        this->ruleName = ruleName;
        std::ifstream file("../data/highlight/" + ruleName + "/keywords.txt"); // Just for unit test
        // std::ifstream file("./data/highlight/" + ruleName + "/keywords.txt");
@ -254,8 +265,7 @@ public:
        tokens.clear();
        while(syn != static_cast<int>(TokenType::TOKEN_TYPE_EOF) && syn != static_cast<int>(TokenType::TOKEN_TYPE_UNDEFINED)) {
            Scan(currentIndex);
-            // printf("currentToken: [%s]\n", currentToken.c_str());
-            if(currentToken == "") continue;
+            printf("currentToken: [%s]\n", currentToken.c_str());
            if(syn == static_cast<int>(TokenType::TOKEN_TYPE_STRING)) {
                tokens.push_back(Token(currentToken, TokenType::TOKEN_TYPE_STRING));
                // printf("string: %s\n", currentToken.c_str());
@ -274,6 +284,12 @@ public:
            } else if(syn == static_cast<int>(TokenType::TOKEN_TYPE_COMMENT)) {
                tokens.push_back(Token(currentToken, TokenType::TOKEN_TYPE_COMMENT));
                // printf("comment: %s\n", currentToken.c_str());
+            } else if(syn == static_cast<int>(TokenType::TOKEN_TYPE_EOF)) {
+                tokens.push_back(Token(currentToken, TokenType::TOKEN_TYPE_EOF));
+                // printf("EOF: %s\n", currentToken.c_str());
+            } else if(syn == static_cast<int>(TokenType::TOKEN_TYPE_NEWLINE)) {
+                tokens.push_back(Token(currentToken, TokenType::TOKEN_TYPE_NEWLINE));
+                // printf("newline: %s\n", currentToken.c_str());
            } else {
                tokens.push_back(Token(currentToken, TokenType::TOKEN_TYPE_UNDEFINED));
                // printf("undefined: %s\n", currentToken.c_str());
--- a/utils/Tokenizer
+++ b/utils/Tokenizer