Added a Tokenizer class

This commit is contained in:
梦凌汐 2024-12-12 21:12:06 +08:00
parent 96e743fc26
commit 598041d6ff
7 changed files with 444 additions and 1 deletions
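At a glance, the new class is used by constructing it with a rule name, feeding it raw source text, and collecting the token list. A minimal sketch (assuming the "cpp" keyword rules exist under data/highlight/, mirroring Tokenizer_test.cpp below):

#include "Tokenizer.h"
#include <cstdio>

int main() {
    // "cpp" selects data/highlight/cpp/keywords.txt (path resolved relative
    // to the working directory, see the Tokenizer constructor).
    Tokenizer tokenizer("cpp");
    tokenizer.setRawText("int x = 42;");
    MyVector<Token> tokens = tokenizer.tokenize();
    for (int i = 0; i < tokens.size(); i++) {
        printf("%d: %s\n", static_cast<int>(tokens[i].type), tokens[i].value.c_str());
    }
    return 0;
}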

0
components/TextEditor.h Normal file
View File

@ -0,0 +1,91 @@
asm
auto
bool
break
case
catch
char
class
const
const_cast
continue
default
delete
do
double
dynamic_cast
else
enum
explicit
export
extern
false
float
for
friend
goto
if
inline
int
long
mutable
namespace
new
operator
private
protected
public
register
reinterpret_cast
return
short
signed
sizeof
static
static_cast
struct
switch
template
this
throw
true
try
typedef
typeid
typename
union
unsigned
using
virtual
void
volatile
wchar_t
//some useful keywords
std
string
vector
list
map
set
deque
stack
queue
bitset
complex
valarray
pair
tuple
array
shared_ptr
unique_ptr
weak_ptr
function
thread
mutex
condition_variable
atomic
future
promise
packaged_task
thread_local

View File

@ -88,7 +88,7 @@ public:
}
}
- void erase(size_t index) override {
+ void erase(size_t index) {
if (index >= m_size) {
throw std::out_of_range("Index out of range");
}

15
utils/SyntaxHighlighter.h Normal file
View File

@ -0,0 +1,15 @@
#ifndef SYNTAX_HIGHLIGHTER_H
#define SYNTAX_HIGHLIGHTER_H
#include "Color.h"
#include "RichText.h"
#include <string>
class SyntaxHighlighter {
private:
std::string ruleName;
};
#endif // SYNTAX_HIGHLIGHTER_H
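SyntaxHighlighter is still a stub that only stores its rule name. One plausible direction is to feed Tokenizer output into a per-TokenType style lookup; the sketch below is purely hypothetical (the colorFor helper and the ANSI escape codes are assumptions, not part of this commit, and a real implementation would presumably use the Color/RichText types included above instead):

#include "Tokenizer.h"
#include <string>

// Hypothetical token-to-style mapping using ANSI escape codes.
static std::string colorFor(TokenType type) {
    switch (type) {
        case TokenType::TOKEN_TYPE_RESERVE_WORD: return "\033[34m"; // blue
        case TokenType::TOKEN_TYPE_STRING:       return "\033[32m"; // green
        case TokenType::TOKEN_TYPE_NUMBER:       return "\033[33m"; // yellow
        case TokenType::TOKEN_TYPE_COMMENT:      return "\033[90m"; // grey
        default:                                 return "\033[0m";  // plain
    }
}

// Hypothetical driver: tokenize the input and wrap each token in its style.
std::string highlight(Tokenizer& tokenizer, const std::string& code) {
    tokenizer.setRawText(code);
    MyVector<Token> tokens = tokenizer.tokenize();
    std::string out;
    for (int i = 0; i < tokens.size(); i++) {
        out += colorFor(tokens[i].type) + tokens[i].value + "\033[0m ";
    }
    return out;
}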

BIN
utils/Tokenizer Normal file

Binary file not shown.

289
utils/Tokenizer.h Normal file
View File

@ -0,0 +1,289 @@
#ifndef TOKENIZER_H
#define TOKENIZER_H
#include <string>
#include <stdio.h>
#include <fstream>
#include "../mystl/my_vector.h"
enum class TokenType {
TOKEN_TYPE_IDENTIFIER = 9999,
TOKEN_TYPE_NUMBER = 10000,
TOKEN_TYPE_STRING = 10001,
TOKEN_TYPE_RESERVE_WORD,
TOKEN_TYPE_OPERATOR_OR_DELIMITER,
TOKEN_TYPE_EOF,
TOKEN_TYPE_UNDEFINED,
TOKEN_TYPE_COMMENT
};
struct Token {
std::string value;
TokenType type;
Token() {}
Token(std::string value, TokenType type) {
this->value = value;
this->type = type;
}
};
static const std::string operatorOrDelimiter[] = {"+", "-", "*", "/", "<", "=", ">", ";", "(", ")", "^", ",", "\'", "#", "&", "|", "%", "~", "[", "]", "{", "}", "\\", ".", "\?", ":", "!", "\"",
"<=", ">=", "==", "!=", "&&", "<<", ">>", "||", "++", "--", "+=", "-=", "*=", "/=", "%=", "|=", "&=", "^=", "<<=", ">>=", "::", "->"};
class Tokenizer {
private:
std::string ruleName;
MyVector<std::string> reserveWord;
std::string rawText, preprocessedText;
MyVector<Token> tokens;
std::string currentToken = "";
int reserveWordCount, operatorAndDelimiterCount;
int syn = -1;
int searchReserveWord(std::string word) {
for(int i = 0; i < reserveWord.size(); i++) {
if (word == reserveWord[i]) {
return i;
}
}
return -1;
}
bool isLetter(char c) {
if((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || c == '_') {
return true;
}
return false;
}
bool isDigit(char c) {
if(c >= '0' && c <= '9') {
return true;
}
return false;
}
int isOperatorOrDelimiter(std::string str) {
for(int i = 0; i < operatorAndDelimiterCount; i++) {
if(str == operatorOrDelimiter[i]) {
return i + reserveWordCount;
}
}
return -1;
}
bool maybeOperatorOrDelimiterWith2Letters(char c) {
for(int i = 0; i < operatorAndDelimiterCount; i++) {
if(c == operatorOrDelimiter[i][0] && operatorOrDelimiter[i].size() > 1) {
return true;
}
}
return false;
}
void preprocess() {
preprocessedText = "";
bool inString = false;
for(int i = 0; i < rawText.size(); i++) {
// strip // line comments (only outside string literals)
if(!inString && rawText[i] == '/' && i + 1 < rawText.size() && rawText[i + 1] == '/') {
while(i < rawText.size() && rawText[i] != '\n') {
++i;
}
}
// strip /* ... */ block comments (only outside string literals)
if(!inString && i + 1 < rawText.size() && rawText[i] == '/' && rawText[i + 1] == '*') {
i += 2;
while(i + 1 < rawText.size() && !(rawText[i] == '*' && rawText[i + 1] == '/')) {
++i;
}
i += 2;
}
if(i >= rawText.size()) {
break;
}
if(rawText[i] == '\"') {
inString = !inString;
}
// newlines outside string literals collapse to a single space
if(!inString && rawText[i] == '\n') {
preprocessedText += ' ';
continue;
}
if (rawText[i] != '\t' && rawText[i] != '\v' && rawText[i] != '\r') {
preprocessedText += rawText[i];
}
}
}
void Scan(int & currentIndex) {
currentToken = "";
while(preprocessedText[currentIndex] == ' ') currentIndex++;
// printf("current letter: [%d]%c\n", currentIndex, preprocessedText[currentIndex]);
if(isLetter(preprocessedText[currentIndex])) {
while(isLetter(preprocessedText[currentIndex]) || isDigit(preprocessedText[currentIndex])) {
currentToken += preprocessedText[currentIndex++];
}
syn = searchReserveWord(currentToken);
syn = syn == -1 ? static_cast<int>(TokenType::TOKEN_TYPE_IDENTIFIER) : syn;
return;
} else if(isDigit(preprocessedText[currentIndex])) {
while(isDigit(preprocessedText[currentIndex])) {
currentToken += preprocessedText[currentIndex++];
}
syn = static_cast<int>(TokenType::TOKEN_TYPE_NUMBER);
return;
} else if((isOperatorOrDelimiter(std::string(1, preprocessedText[currentIndex])) != -1) &&
!maybeOperatorOrDelimiterWith2Letters(preprocessedText[currentIndex])) {
if(preprocessedText[currentIndex] == '\"') {
currentToken += preprocessedText[currentIndex++];
while(currentIndex < preprocessedText.size() && preprocessedText[currentIndex] != '\"') {
currentToken += preprocessedText[currentIndex++];
}
currentToken += preprocessedText[currentIndex++];
syn = static_cast<int>(TokenType::TOKEN_TYPE_STRING);
return;
}
if(preprocessedText[currentIndex] == '\'') {
currentToken += preprocessedText[currentIndex++];
while(currentIndex < preprocessedText.size() && preprocessedText[currentIndex] != '\'') {
currentToken += preprocessedText[currentIndex++];
}
currentToken += preprocessedText[currentIndex++];
syn = static_cast<int>(TokenType::TOKEN_TYPE_STRING);
return;
}
currentToken += preprocessedText[currentIndex++];
syn = isOperatorOrDelimiter(currentToken);
return;
} else if(maybeOperatorOrDelimiterWith2Letters(preprocessedText[currentIndex])) {
if(currentIndex + 2 < preprocessedText.size()) { // try three-character operators/delimiters first
currentToken += preprocessedText[currentIndex];
currentToken += preprocessedText[currentIndex + 1];
currentToken += preprocessedText[currentIndex + 2];
syn = isOperatorOrDelimiter(currentToken);
if(syn != -1) {
currentIndex += 3;
return;
}
}
currentToken = "";
if (currentIndex + 1 < preprocessedText.size()) { // then try two-character operators/delimiters
currentToken += preprocessedText[currentIndex];
currentToken += preprocessedText[currentIndex + 1];
if(currentToken == "//") {
syn = static_cast<int>(TokenType::TOKEN_TYPE_COMMENT);
currentIndex += 2;
while(currentIndex < preprocessedText.size() && preprocessedText[currentIndex] != '\n') {
currentToken += preprocessedText[currentIndex++];
}
return;
}
if(currentToken == "/*") {
syn = static_cast<int>(TokenType::TOKEN_TYPE_COMMENT);
currentIndex += 2;
while(currentIndex + 1 < preprocessedText.size() && !(preprocessedText[currentIndex] == '*' && preprocessedText[currentIndex + 1] == '/')) {
currentToken += preprocessedText[currentIndex++];
}
currentIndex += 2;
return;
}
syn = isOperatorOrDelimiter(currentToken);
if(syn != -1) {
currentIndex += 2;
return;
}
}
currentToken = "";
currentToken += preprocessedText[currentIndex];
syn = isOperatorOrDelimiter(currentToken);
if(syn != -1) {
currentIndex += 1;
}
return;
} else if (currentIndex >= preprocessedText.size() || preprocessedText[currentIndex] == '\0') {
syn = static_cast<int>(TokenType::TOKEN_TYPE_EOF);
currentIndex++;
return;
} else if(preprocessedText[currentIndex] == '\n') {
currentIndex++; // stray newline: skip it so Scan always makes progress
return;
} else {
syn = static_cast<int>(TokenType::TOKEN_TYPE_UNDEFINED);
currentIndex++;
return;
}
}
public:
Tokenizer(std::string ruleName) {
this->ruleName = ruleName;
std::ifstream file("../data/highlight/" + ruleName + "/keywords.txt"); // Just for unit test
// std::ifstream file("./data/highlight/" + ruleName + "/keywords.txt");
if (file.is_open()) {
// printf("keywords file open\n");
std::string line;
while (std::getline(file, line)) {
reserveWord.push_back(line);
}
file.close();
}
// printf("keywords size: %llu\n", reserveWord.size());
// for(int i = 0; i < reserveWord.size(); i++) {
// printf("keywords[%d]: %s\n", i, reserveWord[i].c_str());
// }
reserveWordCount = reserveWord.size();
operatorAndDelimiterCount = sizeof(operatorOrDelimiter) / sizeof(operatorOrDelimiter[0]);
}
void setRawText(std::string rawText) {
this->rawText = rawText;
preprocess();
// printf("preprocessed text: %s\n", preprocessedText.c_str());
}
void printProcessedText() {
printf("preprocessed text: %s\n", preprocessedText.c_str());
}
MyVector<Token> tokenize() {
syn = -1;
int currentIndex = 0;
tokens.clear();
while(syn != static_cast<int>(TokenType::TOKEN_TYPE_EOF) && syn != static_cast<int>(TokenType::TOKEN_TYPE_UNDEFINED)) {
Scan(currentIndex);
// printf("currentToken: [%s]\n", currentToken.c_str());
if(currentToken == "") continue;
if(syn == static_cast<int>(TokenType::TOKEN_TYPE_STRING)) {
tokens.push_back(Token(currentToken, TokenType::TOKEN_TYPE_STRING));
// printf("string: %s\n", currentToken.c_str());
} else if(syn == static_cast<int>(TokenType::TOKEN_TYPE_IDENTIFIER)) {
tokens.push_back(Token(currentToken, TokenType::TOKEN_TYPE_IDENTIFIER));
// printf("identifier: %s\n", currentToken.c_str());
} else if(syn == static_cast<int>(TokenType::TOKEN_TYPE_NUMBER)) {
tokens.push_back(Token(currentToken, TokenType::TOKEN_TYPE_NUMBER));
// printf("number: %s\n", currentToken.c_str());
} else if(syn >= 0 && syn < reserveWordCount) {
tokens.push_back(Token(currentToken, TokenType::TOKEN_TYPE_RESERVE_WORD));
// printf("reserve word: %s\n", currentToken.c_str());
} else if(syn >= reserveWordCount && syn < reserveWordCount + operatorAndDelimiterCount) {
tokens.push_back(Token(currentToken, TokenType::TOKEN_TYPE_OPERATOR_OR_DELIMITER));
// printf("operator or delimiter: %s\n", currentToken.c_str());
} else if(syn == static_cast<int>(TokenType::TOKEN_TYPE_COMMENT)) {
tokens.push_back(Token(currentToken, TokenType::TOKEN_TYPE_COMMENT));
// printf("comment: %s\n", currentToken.c_str());
} else {
tokens.push_back(Token(currentToken, TokenType::TOKEN_TYPE_UNDEFINED));
// printf("undefined: %s\n", currentToken.c_str());
}
if(currentIndex >= preprocessedText.length()) {
syn = static_cast<int>(TokenType::TOKEN_TYPE_EOF);
}
}
return tokens;
}
};
#endif // TOKENIZER_H  (currently only C/C++ rule sets are supported)
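The raw syn indices produced by Scan are partitioned: keyword indices come first, the operatorOrDelimiter table follows, and the TokenType enum starts at 9999, presumably so its values cannot collide with either table. A small helper spelling this out (hypothetical, for illustration only; tokenize() above already performs this classification inline):

// Hypothetical helper mirroring how tokenize() interprets syn values.
TokenType classifySyn(int syn, int reserveWordCount, int operatorAndDelimiterCount) {
    if (syn >= 0 && syn < reserveWordCount)
        return TokenType::TOKEN_TYPE_RESERVE_WORD;          // index into keywords.txt
    if (syn >= reserveWordCount && syn < reserveWordCount + operatorAndDelimiterCount)
        return TokenType::TOKEN_TYPE_OPERATOR_OR_DELIMITER; // index into operatorOrDelimiter[]
    if (syn >= static_cast<int>(TokenType::TOKEN_TYPE_IDENTIFIER))
        return static_cast<TokenType>(syn);                 // already a TokenType value (>= 9999)
    return TokenType::TOKEN_TYPE_UNDEFINED;
}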

48
utils/Tokenizer_test.cpp Normal file
View File

@ -0,0 +1,48 @@
#include "Tokenizer.h"
#include <iostream>
#include <string>
#include <iomanip>
int main() {
std::string input = "int main()\n\
{\n\
int a=-5,b=4,j; //this is an inline comment\n\
if(a >= b)\n\
j++;\n\
/*\n\
This is a block comment\n\
*/\n\
string str=\"test str\";\n\
j = a - b;\n\
else j=b-a;\n\
return j;\n\
}";
std::string ruleName = "cpp";
Tokenizer tokenizer = Tokenizer(ruleName);
tokenizer.setRawText(input);
tokenizer.printProcessedText();
MyVector<Token> tokens = tokenizer.tokenize();
std::cout << "Tokenized text: " << std::endl;
for (int i = 0; i < tokens.size(); i++) {
if(tokens[i].type == TokenType::TOKEN_TYPE_IDENTIFIER) {
std::cout << std::fixed << std::setw(25) << std::setfill(' ') << std::right << "Identifier: ";
} else if(tokens[i].type == TokenType::TOKEN_TYPE_NUMBER) {
std::cout << std::fixed << std::setw(25) << std::setfill(' ') << std::right << "Number: ";
} else if(tokens[i].type == TokenType::TOKEN_TYPE_OPERATOR_OR_DELIMITER) {
std::cout << std::fixed << std::setw(25) << std::setfill(' ') << std::right << "Operator or Delimiter: ";
} else if(tokens[i].type == TokenType::TOKEN_TYPE_STRING) {
std::cout << std::fixed << std::setw(25) << std::setfill(' ') << std::right << "String: ";
} else if(tokens[i].type == TokenType::TOKEN_TYPE_RESERVE_WORD) {
std::cout << std::fixed << std::setw(25) << std::setfill(' ') << std::right << "Reserve word: ";
} else if(tokens[i].type == TokenType::TOKEN_TYPE_EOF) {
std::cout << std::fixed << std::setw(25) << std::setfill(' ') << std::right << "End of file.";
} else if(tokens[i].type == TokenType::TOKEN_TYPE_COMMENT) {
std::cout << std::fixed << std::setw(25) << std::setfill(' ') << std::right << "Comment: ";
} else {
std::cout << std::fixed << std::setw(25) << std::setfill(' ') << std::right << "Unknown token: ";
}
std::cout << " " << tokens[i].value << std::endl;
}
return 0;
}