LexicalAnalysis类现在支持注释、空格和换行了

This commit is contained in:
梦凌汐 2024-12-14 22:45:49 +08:00
parent 598041d6ff
commit fafb31ac7f
3 changed files with 35 additions and 14 deletions

View File

@ -1,4 +1,4 @@
#include "Tokenizer.h"
#include "LexicalAnalysis.h"
#include <iostream>
#include <string>
#include <iomanip>
@ -18,10 +18,10 @@ else j=b-a;\n\
return j;\n\
}";
std::string ruleName = "cpp";
Tokenizer tokenizer = Tokenizer(ruleName);
tokenizer.setRawText(input);
tokenizer.printProcessedText();
MyVector<Token> tokens = tokenizer.tokenize();
LexicalAnalysis lexicalAnalysis = LexicalAnalysis(ruleName);
lexicalAnalysis.setRawText(input);
lexicalAnalysis.printProcessedText();
MyVector<Token> tokens = lexicalAnalysis.tokenize();
std::cout << "Tokenized text: " << std::endl;
for (int i = 0; i < tokens.size(); i++) {
if(tokens[i].type == TokenType::TOKEN_TYPE_IDENTIFIER) {
@ -38,11 +38,16 @@ else j=b-a;\n\
std::cout << std::fixed << std::setw(25) << std::setfill(' ') << std::right << "End of file.";
} else if(tokens[i].type == TokenType::TOKEN_TYPE_COMMENT) {
std::cout << std::fixed << std::setw(25) << std::setfill(' ') << std::right << "Comment: ";
} else if(tokens[i].type == TokenType::TOKEN_TYPE_NEWLINE) {
std::cout << std::fixed << std::setw(25) << std::setfill(' ') << std::right << "Newline ";
} else {
std::cout << std::fixed << std::setw(25) << std::setfill(' ') << std::right << "Unknown token: ";
}
std::cout << " " << tokens[i].value << std::endl;
}
for (int i = 0; i < tokens.size(); i++) {
std::cout << tokens[i].value;
}
return 0;
}

View File

@ -1,5 +1,5 @@
#ifndef TOKENIZER_H
#define TOKENIZER_H
#ifndef LEXICALANALYSIS_H
#define LEXICALANALYSIS_H
#include <string>
#include <stdio.h>
@ -14,7 +14,8 @@ enum class TokenType {
TOKEN_TYPE_OPERATOR_OR_DELIMITER,
TOKEN_TYPE_EOF,
TOKEN_TYPE_UNDEFINED,
TOKEN_TYPE_COMMENT
TOKEN_TYPE_COMMENT,
TOKEN_TYPE_NEWLINE
};
struct Token {
@ -31,7 +32,7 @@ struct Token {
// Table of operator and delimiter lexemes. The single-character entries come
// first, followed by the two- and three-character compound operators.
// NOTE(review): presumably consulted by the scanner to classify
// TOKEN_TYPE_OPERATOR_OR_DELIMITER tokens -- the lookup code is not visible
// in this view, so confirm before reordering or extending the list.
static const std::string operatorOrDelimiter[] = {"+", "-", "*", "/", "<", "=", ">", ";", "(", ")", "^", ",", "\'", "#", "&", "|", "%", "~", "[", "]", "{", "}", "\\", ".", "\?", ":", "!", "\"",
"<=", ">=", "==", "!=", "&&", "<<", ">>", "||", "++", "--", "+=", "-=", "*=", "/=", "%=", "|=", "&=", "^=", "<<=", ">>=", "::", "->"};
class Tokenizer {
class LexicalAnalysis {
private:
std::string ruleName;
MyVector<std::string> reserveWord;
@ -83,6 +84,8 @@ private:
}
void preprocess() {
preprocessedText = rawText;
return;
preprocessedText = "";
bool inString = false;
for(int i = 0; i < rawText.size(); i++) {
@ -117,7 +120,9 @@ private:
void Scan(int & currentIndex) {
currentToken = "";
while(preprocessedText[currentIndex] == ' ') currentIndex++;
while(preprocessedText[currentIndex] == ' ') {
currentToken += preprocessedText[currentIndex++];
}
// printf("current letter: [%d]%c\n", currentIndex, preprocessedText[currentIndex]);
@ -190,6 +195,7 @@ private:
// Copy the body of a block comment into the token, stopping just before the
// closing "*/" (or at the last character if the comment is unterminated).
while(currentIndex < rawText.size() - 1 && !(preprocessedText[currentIndex] == '*' && preprocessedText[currentIndex + 1] == '/')) {
    currentToken += preprocessedText[currentIndex++];
}
// Append the CLOSING delimiter. The previous code appended "/*" here, so a
// comment token read "/* ... /*" and concatenating token values no longer
// reproduced the original source text.
currentToken += "*/";
// Step over the two characters of the "*/" that ended the loop.
currentIndex += 2;
return;
}
@ -210,7 +216,12 @@ private:
syn = static_cast<int>(TokenType::TOKEN_TYPE_EOF);
currentIndex++;
return;
} else if(preprocessedText[currentIndex] != '\n') {
} else if(preprocessedText[currentIndex] == '\n') {
syn = static_cast<int>(TokenType::TOKEN_TYPE_NEWLINE);
currentToken = "\n";
currentIndex++;
return;
} else {
syn = static_cast<int>(TokenType::TOKEN_TYPE_UNDEFINED);
currentIndex++;
return;
@ -218,7 +229,7 @@ private:
}
public:
Tokenizer(std::string ruleName) {
LexicalAnalysis(std::string ruleName) {
this->ruleName = ruleName;
std::ifstream file("../data/highlight/" + ruleName + "/keywords.txt"); // Just for unit test
// std::ifstream file("./data/highlight/" + ruleName + "/keywords.txt");
@ -254,8 +265,7 @@ public:
tokens.clear();
while(syn != static_cast<int>(TokenType::TOKEN_TYPE_EOF) && syn != static_cast<int>(TokenType::TOKEN_TYPE_UNDEFINED)) {
Scan(currentIndex);
// printf("currentToken: [%s]\n", currentToken.c_str());
if(currentToken == "") continue;
printf("currentToken: [%s]\n", currentToken.c_str());
if(syn == static_cast<int>(TokenType::TOKEN_TYPE_STRING)) {
tokens.push_back(Token(currentToken, TokenType::TOKEN_TYPE_STRING));
// printf("string: %s\n", currentToken.c_str());
@ -274,6 +284,12 @@ public:
} else if(syn == static_cast<int>(TokenType::TOKEN_TYPE_COMMENT)) {
tokens.push_back(Token(currentToken, TokenType::TOKEN_TYPE_COMMENT));
// printf("comment: %s\n", currentToken.c_str());
} else if(syn == static_cast<int>(TokenType::TOKEN_TYPE_EOF)) {
tokens.push_back(Token(currentToken, TokenType::TOKEN_TYPE_EOF));
// printf("EOF: %s\n", currentToken.c_str());
} else if(syn == static_cast<int>(TokenType::TOKEN_TYPE_NEWLINE)) {
tokens.push_back(Token(currentToken, TokenType::TOKEN_TYPE_NEWLINE));
// printf("newline: %s\n", currentToken.c_str());
} else {
tokens.push_back(Token(currentToken, TokenType::TOKEN_TYPE_UNDEFINED));
// printf("undefined: %s\n", currentToken.c_str());

Binary file not shown.