mirror of
https://github.com/MeowLynxSea/ceditor.git
synced 2025-07-09 10:54:37 +00:00
LexicalAnalysis类现在支持注释、空格和换行了
This commit is contained in:
parent
598041d6ff
commit
fafb31ac7f
@ -1,4 +1,4 @@
|
|||||||
#include "Tokenizer.h"
|
#include "LexicalAnalysis.h"
|
||||||
#include <iostream>
|
#include <iostream>
|
||||||
#include <string>
|
#include <string>
|
||||||
#include <iomanip>
|
#include <iomanip>
|
||||||
@ -18,10 +18,10 @@ else j=b-a;\n\
|
|||||||
return j;\n\
|
return j;\n\
|
||||||
}";
|
}";
|
||||||
std::string ruleName = "cpp";
|
std::string ruleName = "cpp";
|
||||||
Tokenizer tokenizer = Tokenizer(ruleName);
|
LexicalAnalysis lexicalAnalysis = LexicalAnalysis(ruleName);
|
||||||
tokenizer.setRawText(input);
|
lexicalAnalysis.setRawText(input);
|
||||||
tokenizer.printProcessedText();
|
lexicalAnalysis.printProcessedText();
|
||||||
MyVector<Token> tokens = tokenizer.tokenize();
|
MyVector<Token> tokens = lexicalAnalysis.tokenize();
|
||||||
std::cout << "Tokenized text: " << std::endl;
|
std::cout << "Tokenized text: " << std::endl;
|
||||||
for (int i = 0; i < tokens.size(); i++) {
|
for (int i = 0; i < tokens.size(); i++) {
|
||||||
if(tokens[i].type == TokenType::TOKEN_TYPE_IDENTIFIER) {
|
if(tokens[i].type == TokenType::TOKEN_TYPE_IDENTIFIER) {
|
||||||
@ -38,11 +38,16 @@ else j=b-a;\n\
|
|||||||
std::cout << std::fixed << std::setw(25) << std::setfill(' ') << std::right << "End of file.";
|
std::cout << std::fixed << std::setw(25) << std::setfill(' ') << std::right << "End of file.";
|
||||||
} else if(tokens[i].type == TokenType::TOKEN_TYPE_COMMENT) {
|
} else if(tokens[i].type == TokenType::TOKEN_TYPE_COMMENT) {
|
||||||
std::cout << std::fixed << std::setw(25) << std::setfill(' ') << std::right << "Comment: ";
|
std::cout << std::fixed << std::setw(25) << std::setfill(' ') << std::right << "Comment: ";
|
||||||
|
} else if(tokens[i].type == TokenType::TOKEN_TYPE_NEWLINE) {
|
||||||
|
std::cout << std::fixed << std::setw(25) << std::setfill(' ') << std::right << "Newline ";
|
||||||
} else {
|
} else {
|
||||||
std::cout << std::fixed << std::setw(25) << std::setfill(' ') << std::right << "Unknown token: ";
|
std::cout << std::fixed << std::setw(25) << std::setfill(' ') << std::right << "Unknown token: ";
|
||||||
}
|
}
|
||||||
std::cout << " " << tokens[i].value << std::endl;
|
std::cout << " " << tokens[i].value << std::endl;
|
||||||
}
|
}
|
||||||
|
for (int i = 0; i < tokens.size(); i++) {
|
||||||
|
std::cout << tokens[i].value;
|
||||||
|
}
|
||||||
|
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
@ -1,5 +1,5 @@
|
|||||||
#ifndef TOKENIZER_H
|
#ifndef LEXICALANALYSIS_H
|
||||||
#define TOKENIZER_H
|
#define LEXICALANALYSIS_H
|
||||||
|
|
||||||
#include <string>
|
#include <string>
|
||||||
#include <stdio.h>
|
#include <stdio.h>
|
||||||
@ -14,7 +14,8 @@ enum class TokenType {
|
|||||||
TOKEN_TYPE_OPERATOR_OR_DELIMITER,
|
TOKEN_TYPE_OPERATOR_OR_DELIMITER,
|
||||||
TOKEN_TYPE_EOF,
|
TOKEN_TYPE_EOF,
|
||||||
TOKEN_TYPE_UNDEFINED,
|
TOKEN_TYPE_UNDEFINED,
|
||||||
TOKEN_TYPE_COMMENT
|
TOKEN_TYPE_COMMENT,
|
||||||
|
TOKEN_TYPE_NEWLINE
|
||||||
};
|
};
|
||||||
|
|
||||||
struct Token {
|
struct Token {
|
||||||
@ -31,7 +32,7 @@ struct Token {
|
|||||||
static const std::string operatorOrDelimiter[] = {"+", "-", "*", "/", "<", "=", ">", ";", "(", ")", "^", ",", "\'", "#", "&", "|", "%", "~", "[", "]", "{", "}", "\\", ".", "\?", ":", "!", "\"",
|
static const std::string operatorOrDelimiter[] = {"+", "-", "*", "/", "<", "=", ">", ";", "(", ")", "^", ",", "\'", "#", "&", "|", "%", "~", "[", "]", "{", "}", "\\", ".", "\?", ":", "!", "\"",
|
||||||
"<=", ">=", "==", "!=", "&&", "<<", ">>", "||", "++", "--", "+=", "-=", "*=", "/=", "%=", "|=", "&=", "^=", "<<=", ">>=", "::", "->"};
|
"<=", ">=", "==", "!=", "&&", "<<", ">>", "||", "++", "--", "+=", "-=", "*=", "/=", "%=", "|=", "&=", "^=", "<<=", ">>=", "::", "->"};
|
||||||
|
|
||||||
class Tokenizer {
|
class LexicalAnalysis {
|
||||||
private:
|
private:
|
||||||
std::string ruleName;
|
std::string ruleName;
|
||||||
MyVector<std::string> reserveWord;
|
MyVector<std::string> reserveWord;
|
||||||
@ -83,6 +84,8 @@ private:
|
|||||||
}
|
}
|
||||||
|
|
||||||
void preprocess() {
|
void preprocess() {
|
||||||
|
preprocessedText = rawText;
|
||||||
|
return;
|
||||||
preprocessedText = "";
|
preprocessedText = "";
|
||||||
bool inString = false;
|
bool inString = false;
|
||||||
for(int i = 0; i < rawText.size(); i++) {
|
for(int i = 0; i < rawText.size(); i++) {
|
||||||
@ -117,7 +120,9 @@ private:
|
|||||||
void Scan(int & currentIndex) {
|
void Scan(int & currentIndex) {
|
||||||
currentToken = "";
|
currentToken = "";
|
||||||
|
|
||||||
while(preprocessedText[currentIndex] == ' ') currentIndex++;
|
while(preprocessedText[currentIndex] == ' ') {
|
||||||
|
currentToken += preprocessedText[currentIndex++];
|
||||||
|
}
|
||||||
|
|
||||||
// printf("current letter: [%d]%c\n", currentIndex, preprocessedText[currentIndex]);
|
// printf("current letter: [%d]%c\n", currentIndex, preprocessedText[currentIndex]);
|
||||||
|
|
||||||
@ -190,6 +195,7 @@ private:
|
|||||||
while(currentIndex < rawText.size() - 1 && !(preprocessedText[currentIndex] == '*' && preprocessedText[currentIndex + 1] == '/')) {
|
while(currentIndex < rawText.size() - 1 && !(preprocessedText[currentIndex] == '*' && preprocessedText[currentIndex + 1] == '/')) {
|
||||||
currentToken += preprocessedText[currentIndex++];
|
currentToken += preprocessedText[currentIndex++];
|
||||||
}
|
}
|
||||||
|
currentToken += "/*";
|
||||||
currentIndex += 2;
|
currentIndex += 2;
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
@ -210,7 +216,12 @@ private:
|
|||||||
syn = static_cast<int>(TokenType::TOKEN_TYPE_EOF);
|
syn = static_cast<int>(TokenType::TOKEN_TYPE_EOF);
|
||||||
currentIndex++;
|
currentIndex++;
|
||||||
return;
|
return;
|
||||||
} else if(preprocessedText[currentIndex] != '\n') {
|
} else if(preprocessedText[currentIndex] == '\n') {
|
||||||
|
syn = static_cast<int>(TokenType::TOKEN_TYPE_NEWLINE);
|
||||||
|
currentToken = "\n";
|
||||||
|
currentIndex++;
|
||||||
|
return;
|
||||||
|
} else {
|
||||||
syn = static_cast<int>(TokenType::TOKEN_TYPE_UNDEFINED);
|
syn = static_cast<int>(TokenType::TOKEN_TYPE_UNDEFINED);
|
||||||
currentIndex++;
|
currentIndex++;
|
||||||
return;
|
return;
|
||||||
@ -218,7 +229,7 @@ private:
|
|||||||
}
|
}
|
||||||
|
|
||||||
public:
|
public:
|
||||||
Tokenizer(std::string ruleName) {
|
LexicalAnalysis(std::string ruleName) {
|
||||||
this->ruleName = ruleName;
|
this->ruleName = ruleName;
|
||||||
std::ifstream file("../data/highlight/" + ruleName + "/keywords.txt"); // Just for unit test
|
std::ifstream file("../data/highlight/" + ruleName + "/keywords.txt"); // Just for unit test
|
||||||
// std::ifstream file("./data/highlight/" + ruleName + "/keywords.txt");
|
// std::ifstream file("./data/highlight/" + ruleName + "/keywords.txt");
|
||||||
@ -254,8 +265,7 @@ public:
|
|||||||
tokens.clear();
|
tokens.clear();
|
||||||
while(syn != static_cast<int>(TokenType::TOKEN_TYPE_EOF) && syn != static_cast<int>(TokenType::TOKEN_TYPE_UNDEFINED)) {
|
while(syn != static_cast<int>(TokenType::TOKEN_TYPE_EOF) && syn != static_cast<int>(TokenType::TOKEN_TYPE_UNDEFINED)) {
|
||||||
Scan(currentIndex);
|
Scan(currentIndex);
|
||||||
// printf("currentToken: [%s]\n", currentToken.c_str());
|
printf("currentToken: [%s]\n", currentToken.c_str());
|
||||||
if(currentToken == "") continue;
|
|
||||||
if(syn == static_cast<int>(TokenType::TOKEN_TYPE_STRING)) {
|
if(syn == static_cast<int>(TokenType::TOKEN_TYPE_STRING)) {
|
||||||
tokens.push_back(Token(currentToken, TokenType::TOKEN_TYPE_STRING));
|
tokens.push_back(Token(currentToken, TokenType::TOKEN_TYPE_STRING));
|
||||||
// printf("string: %s\n", currentToken.c_str());
|
// printf("string: %s\n", currentToken.c_str());
|
||||||
@ -274,6 +284,12 @@ public:
|
|||||||
} else if(syn == static_cast<int>(TokenType::TOKEN_TYPE_COMMENT)) {
|
} else if(syn == static_cast<int>(TokenType::TOKEN_TYPE_COMMENT)) {
|
||||||
tokens.push_back(Token(currentToken, TokenType::TOKEN_TYPE_COMMENT));
|
tokens.push_back(Token(currentToken, TokenType::TOKEN_TYPE_COMMENT));
|
||||||
// printf("comment: %s\n", currentToken.c_str());
|
// printf("comment: %s\n", currentToken.c_str());
|
||||||
|
} else if(syn == static_cast<int>(TokenType::TOKEN_TYPE_EOF)) {
|
||||||
|
tokens.push_back(Token(currentToken, TokenType::TOKEN_TYPE_EOF));
|
||||||
|
// printf("EOF: %s\n", currentToken.c_str());
|
||||||
|
} else if(syn == static_cast<int>(TokenType::TOKEN_TYPE_NEWLINE)) {
|
||||||
|
tokens.push_back(Token(currentToken, TokenType::TOKEN_TYPE_NEWLINE));
|
||||||
|
// printf("newline: %s\n", currentToken.c_str());
|
||||||
} else {
|
} else {
|
||||||
tokens.push_back(Token(currentToken, TokenType::TOKEN_TYPE_UNDEFINED));
|
tokens.push_back(Token(currentToken, TokenType::TOKEN_TYPE_UNDEFINED));
|
||||||
// printf("undefined: %s\n", currentToken.c_str());
|
// printf("undefined: %s\n", currentToken.c_str());
|
BIN
utils/Tokenizer
BIN
utils/Tokenizer
Binary file not shown.
Loading…
Reference in New Issue
Block a user