LexicalAnalysis类现在支持注释、空格和换行了

This commit is contained in:
梦凌汐 2024-12-14 22:45:49 +08:00
parent 598041d6ff
commit fafb31ac7f
3 changed files with 35 additions and 14 deletions

View File

@@ -1,4 +1,4 @@
#include "Tokenizer.h" #include "LexicalAnalysis.h"
#include <iostream> #include <iostream>
#include <string> #include <string>
#include <iomanip> #include <iomanip>
@@ -18,10 +18,10 @@ else j=b-a;\n\
return j;\n\ return j;\n\
}"; }";
std::string ruleName = "cpp"; std::string ruleName = "cpp";
Tokenizer tokenizer = Tokenizer(ruleName); LexicalAnalysis lexicalAnalysis = LexicalAnalysis(ruleName);
tokenizer.setRawText(input); lexicalAnalysis.setRawText(input);
tokenizer.printProcessedText(); lexicalAnalysis.printProcessedText();
MyVector<Token> tokens = tokenizer.tokenize(); MyVector<Token> tokens = lexicalAnalysis.tokenize();
std::cout << "Tokenized text: " << std::endl; std::cout << "Tokenized text: " << std::endl;
for (int i = 0; i < tokens.size(); i++) { for (int i = 0; i < tokens.size(); i++) {
if(tokens[i].type == TokenType::TOKEN_TYPE_IDENTIFIER) { if(tokens[i].type == TokenType::TOKEN_TYPE_IDENTIFIER) {
@@ -38,11 +38,16 @@ else j=b-a;\n\
std::cout << std::fixed << std::setw(25) << std::setfill(' ') << std::right << "End of file."; std::cout << std::fixed << std::setw(25) << std::setfill(' ') << std::right << "End of file.";
} else if(tokens[i].type == TokenType::TOKEN_TYPE_COMMENT) { } else if(tokens[i].type == TokenType::TOKEN_TYPE_COMMENT) {
std::cout << std::fixed << std::setw(25) << std::setfill(' ') << std::right << "Comment: "; std::cout << std::fixed << std::setw(25) << std::setfill(' ') << std::right << "Comment: ";
} else if(tokens[i].type == TokenType::TOKEN_TYPE_NEWLINE) {
std::cout << std::fixed << std::setw(25) << std::setfill(' ') << std::right << "Newline ";
} else { } else {
std::cout << std::fixed << std::setw(25) << std::setfill(' ') << std::right << "Unknown token: "; std::cout << std::fixed << std::setw(25) << std::setfill(' ') << std::right << "Unknown token: ";
} }
std::cout << " " << tokens[i].value << std::endl; std::cout << " " << tokens[i].value << std::endl;
} }
for (int i = 0; i < tokens.size(); i++) {
std::cout << tokens[i].value;
}
return 0; return 0;
} }

View File

@@ -1,5 +1,5 @@
#ifndef TOKENIZER_H #ifndef LEXICALANALYSIS_H
#define TOKENIZER_H #define LEXICALANALYSIS_H
#include <string> #include <string>
#include <stdio.h> #include <stdio.h>
@@ -14,7 +14,8 @@ enum class TokenType {
TOKEN_TYPE_OPERATOR_OR_DELIMITER, TOKEN_TYPE_OPERATOR_OR_DELIMITER,
TOKEN_TYPE_EOF, TOKEN_TYPE_EOF,
TOKEN_TYPE_UNDEFINED, TOKEN_TYPE_UNDEFINED,
TOKEN_TYPE_COMMENT TOKEN_TYPE_COMMENT,
TOKEN_TYPE_NEWLINE
}; };
struct Token { struct Token {
@@ -31,7 +32,7 @@ struct Token {
static const std::string operatorOrDelimiter[] = {"+", "-", "*", "/", "<", "=", ">", ";", "(", ")", "^", ",", "\'", "#", "&", "|", "%", "~", "[", "]", "{", "}", "\\", ".", "\?", ":", "!", "\"", static const std::string operatorOrDelimiter[] = {"+", "-", "*", "/", "<", "=", ">", ";", "(", ")", "^", ",", "\'", "#", "&", "|", "%", "~", "[", "]", "{", "}", "\\", ".", "\?", ":", "!", "\"",
"<=", ">=", "==", "!=", "&&", "<<", ">>", "||", "++", "--", "+=", "-=", "*=", "/=", "%=", "|=", "&=", "^=", "<<=", ">>=", "::", "->"}; "<=", ">=", "==", "!=", "&&", "<<", ">>", "||", "++", "--", "+=", "-=", "*=", "/=", "%=", "|=", "&=", "^=", "<<=", ">>=", "::", "->"};
class Tokenizer { class LexicalAnalysis {
private: private:
std::string ruleName; std::string ruleName;
MyVector<std::string> reserveWord; MyVector<std::string> reserveWord;
@@ -83,6 +84,8 @@ private:
} }
void preprocess() { void preprocess() {
preprocessedText = rawText;
return;
preprocessedText = ""; preprocessedText = "";
bool inString = false; bool inString = false;
for(int i = 0; i < rawText.size(); i++) { for(int i = 0; i < rawText.size(); i++) {
@@ -117,7 +120,9 @@ private:
void Scan(int & currentIndex) { void Scan(int & currentIndex) {
currentToken = ""; currentToken = "";
while(preprocessedText[currentIndex] == ' ') currentIndex++; while(preprocessedText[currentIndex] == ' ') {
currentToken += preprocessedText[currentIndex++];
}
// printf("current letter: [%d]%c\n", currentIndex, preprocessedText[currentIndex]); // printf("current letter: [%d]%c\n", currentIndex, preprocessedText[currentIndex]);
@@ -190,6 +195,7 @@ private:
while(currentIndex < rawText.size() - 1 && !(preprocessedText[currentIndex] == '*' && preprocessedText[currentIndex + 1] == '/')) { while(currentIndex < rawText.size() - 1 && !(preprocessedText[currentIndex] == '*' && preprocessedText[currentIndex + 1] == '/')) {
currentToken += preprocessedText[currentIndex++]; currentToken += preprocessedText[currentIndex++];
} }
currentToken += "/*";
currentIndex += 2; currentIndex += 2;
return; return;
} }
@@ -210,7 +216,12 @@ private:
syn = static_cast<int>(TokenType::TOKEN_TYPE_EOF); syn = static_cast<int>(TokenType::TOKEN_TYPE_EOF);
currentIndex++; currentIndex++;
return; return;
} else if(preprocessedText[currentIndex] != '\n') { } else if(preprocessedText[currentIndex] == '\n') {
syn = static_cast<int>(TokenType::TOKEN_TYPE_NEWLINE);
currentToken = "\n";
currentIndex++;
return;
} else {
syn = static_cast<int>(TokenType::TOKEN_TYPE_UNDEFINED); syn = static_cast<int>(TokenType::TOKEN_TYPE_UNDEFINED);
currentIndex++; currentIndex++;
return; return;
@@ -218,7 +229,7 @@ private:
} }
public: public:
Tokenizer(std::string ruleName) { LexicalAnalysis(std::string ruleName) {
this->ruleName = ruleName; this->ruleName = ruleName;
std::ifstream file("../data/highlight/" + ruleName + "/keywords.txt"); // Just for unit test std::ifstream file("../data/highlight/" + ruleName + "/keywords.txt"); // Just for unit test
// std::ifstream file("./data/highlight/" + ruleName + "/keywords.txt"); // std::ifstream file("./data/highlight/" + ruleName + "/keywords.txt");
@@ -254,8 +265,7 @@ public:
tokens.clear(); tokens.clear();
while(syn != static_cast<int>(TokenType::TOKEN_TYPE_EOF) && syn != static_cast<int>(TokenType::TOKEN_TYPE_UNDEFINED)) { while(syn != static_cast<int>(TokenType::TOKEN_TYPE_EOF) && syn != static_cast<int>(TokenType::TOKEN_TYPE_UNDEFINED)) {
Scan(currentIndex); Scan(currentIndex);
// printf("currentToken: [%s]\n", currentToken.c_str()); printf("currentToken: [%s]\n", currentToken.c_str());
if(currentToken == "") continue;
if(syn == static_cast<int>(TokenType::TOKEN_TYPE_STRING)) { if(syn == static_cast<int>(TokenType::TOKEN_TYPE_STRING)) {
tokens.push_back(Token(currentToken, TokenType::TOKEN_TYPE_STRING)); tokens.push_back(Token(currentToken, TokenType::TOKEN_TYPE_STRING));
// printf("string: %s\n", currentToken.c_str()); // printf("string: %s\n", currentToken.c_str());
@@ -274,6 +284,12 @@ public:
} else if(syn == static_cast<int>(TokenType::TOKEN_TYPE_COMMENT)) { } else if(syn == static_cast<int>(TokenType::TOKEN_TYPE_COMMENT)) {
tokens.push_back(Token(currentToken, TokenType::TOKEN_TYPE_COMMENT)); tokens.push_back(Token(currentToken, TokenType::TOKEN_TYPE_COMMENT));
// printf("comment: %s\n", currentToken.c_str()); // printf("comment: %s\n", currentToken.c_str());
} else if(syn == static_cast<int>(TokenType::TOKEN_TYPE_EOF)) {
tokens.push_back(Token(currentToken, TokenType::TOKEN_TYPE_EOF));
// printf("EOF: %s\n", currentToken.c_str());
} else if(syn == static_cast<int>(TokenType::TOKEN_TYPE_NEWLINE)) {
tokens.push_back(Token(currentToken, TokenType::TOKEN_TYPE_NEWLINE));
// printf("newline: %s\n", currentToken.c_str());
} else { } else {
tokens.push_back(Token(currentToken, TokenType::TOKEN_TYPE_UNDEFINED)); tokens.push_back(Token(currentToken, TokenType::TOKEN_TYPE_UNDEFINED));
// printf("undefined: %s\n", currentToken.c_str()); // printf("undefined: %s\n", currentToken.c_str());

Binary file not shown.