mirror of
https://github.com/MeowLynxSea/ceditor.git
synced 2025-07-09 10:54:37 +00:00
增加了Tokenizer类
This commit is contained in:
parent
96e743fc26
commit
598041d6ff
0
components/TextEditor.h
Normal file
0
components/TextEditor.h
Normal file
91
data/highlight/cpp/keywords.txt
Normal file
91
data/highlight/cpp/keywords.txt
Normal file
@ -0,0 +1,91 @@
|
|||||||
|
asm
|
||||||
|
auto
|
||||||
|
bool
|
||||||
|
break
|
||||||
|
case
|
||||||
|
catch
|
||||||
|
char
|
||||||
|
class
|
||||||
|
const
|
||||||
|
const_cast
|
||||||
|
continue
|
||||||
|
default
|
||||||
|
delete
|
||||||
|
do
|
||||||
|
double
|
||||||
|
dynamic_cast
|
||||||
|
else
|
||||||
|
enum
|
||||||
|
explicit
|
||||||
|
export
|
||||||
|
extern
|
||||||
|
false
|
||||||
|
float
|
||||||
|
for
|
||||||
|
friend
|
||||||
|
goto
|
||||||
|
if
|
||||||
|
inline
|
||||||
|
int
|
||||||
|
long
|
||||||
|
mutable
|
||||||
|
namespace
|
||||||
|
new
|
||||||
|
operator
|
||||||
|
private
|
||||||
|
protected
|
||||||
|
public
|
||||||
|
register
|
||||||
|
reinterpret_cast
|
||||||
|
return
|
||||||
|
short
|
||||||
|
signed
|
||||||
|
sizeof
|
||||||
|
static
|
||||||
|
static_cast
|
||||||
|
struct
|
||||||
|
switch
|
||||||
|
template
|
||||||
|
this
|
||||||
|
throw
|
||||||
|
true
|
||||||
|
try
|
||||||
|
typedef
|
||||||
|
typeid
|
||||||
|
typename
|
||||||
|
union
|
||||||
|
unsigned
|
||||||
|
using
|
||||||
|
virtual
|
||||||
|
void
|
||||||
|
volatile
|
||||||
|
wchar_t
|
||||||
|
|
||||||
|
//some useful keywords
|
||||||
|
std
|
||||||
|
string
|
||||||
|
vector
|
||||||
|
list
|
||||||
|
map
|
||||||
|
set
|
||||||
|
deque
|
||||||
|
stack
|
||||||
|
queue
|
||||||
|
bitset
|
||||||
|
complex
|
||||||
|
valarray
|
||||||
|
pair
|
||||||
|
tuple
|
||||||
|
array
|
||||||
|
shared_ptr
|
||||||
|
unique_ptr
|
||||||
|
weak_ptr
|
||||||
|
function
|
||||||
|
thread
|
||||||
|
mutex
|
||||||
|
condition_variable
|
||||||
|
atomic
|
||||||
|
future
|
||||||
|
promise
|
||||||
|
packaged_task
|
||||||
|
thread_local
|
@ -88,7 +88,7 @@ public:
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void erase(size_t index) override {
|
void erase(size_t index) {
|
||||||
if (index >= m_size) {
|
if (index >= m_size) {
|
||||||
throw std::out_of_range("Index out of range");
|
throw std::out_of_range("Index out of range");
|
||||||
}
|
}
|
||||||
|
15
utils/SyntaxHighlighter.h
Normal file
15
utils/SyntaxHighlighter.h
Normal file
@ -0,0 +1,15 @@
|
|||||||
|
#ifndef SYNTAX_HIGHLIGHTER_H
|
||||||
|
#define SYNTAX_HIGHLIGHTER_H
|
||||||
|
|
||||||
|
#include "Color.h"
|
||||||
|
#include "RichText.h"
|
||||||
|
#include <string>
|
||||||
|
|
||||||
|
/// Skeleton of the syntax highlighter. At this point it only stores a rule
/// name and exposes no behaviour.
class SyntaxHighlighter {
    // Members of a `class` are private by default.
    // NOTE(review): presumably the name of the highlight rule set (the same
    // kind of value Tokenizer receives) - confirm once the class is used.
    std::string ruleName;
};
|
||||||
|
|
||||||
|
#endif // SYNTAX_HIGHLIGHTER_H
|
BIN
utils/Tokenizer
Normal file
BIN
utils/Tokenizer
Normal file
Binary file not shown.
289
utils/Tokenizer.h
Normal file
289
utils/Tokenizer.h
Normal file
@ -0,0 +1,289 @@
|
|||||||
|
#ifndef TOKENIZER_H
|
||||||
|
#define TOKENIZER_H
|
||||||
|
|
||||||
|
#include <string>
|
||||||
|
#include <stdio.h>
|
||||||
|
#include <fstream>
|
||||||
|
#include "../mystl/my_vector.h"
|
||||||
|
|
||||||
|
/// Category assigned to every token produced by the tokenizer.
///
/// The numeric values start at 9999 so that they stay clear of the small
/// non-negative codes the tokenizer uses for reserved-word and operator
/// table indices.
enum class TokenType {
    TOKEN_TYPE_IDENTIFIER            = 9999,
    TOKEN_TYPE_NUMBER                = 10000,
    TOKEN_TYPE_STRING                = 10001,
    TOKEN_TYPE_RESERVE_WORD          = 10002,
    TOKEN_TYPE_OPERATOR_OR_DELIMITER = 10003,
    TOKEN_TYPE_EOF                   = 10004,
    TOKEN_TYPE_UNDEFINED             = 10005,
    TOKEN_TYPE_COMMENT               = 10006
};
|
||||||
|
|
||||||
|
struct Token {
|
||||||
|
std::string value;
|
||||||
|
TokenType type;
|
||||||
|
|
||||||
|
Token() {}
|
||||||
|
Token(std::string value, TokenType type) {
|
||||||
|
this->value = value;
|
||||||
|
this->type = type;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
// Table of operator and delimiter lexemes recognised by the tokenizer.
// The order is significant: Tokenizer::isOperatorOrDelimiter() derives the
// token code from the position of the matching entry, so new entries must
// only be appended deliberately.
static const std::string operatorOrDelimiter[] = {
    // Single-character lexemes.
    "+", "-", "*", "/", "<", "=", ">",
    ";", "(", ")", "^", ",", "'", "#",
    "&", "|", "%", "~", "[", "]", "{",
    "}", "\\", ".", "?", ":", "!", "\"",
    // Two- and three-character lexemes.
    "<=", ">=", "==", "!=", "&&", "<<", ">>", "||",
    "++", "--", "+=", "-=", "*=", "/=", "%=", "|=",
    "&=", "^=", "<<=", ">>=", "::", "->"
};
|
||||||
|
|
||||||
|
class Tokenizer {
|
||||||
|
private:
|
||||||
|
std::string ruleName;
|
||||||
|
MyVector<std::string> reserveWord;
|
||||||
|
std::string rawText, preprocessedText;
|
||||||
|
MyVector<Token> tokens;
|
||||||
|
std::string currentToken = "";
|
||||||
|
int reserveWordCount, operatorAndDelimiterCount;
|
||||||
|
int syn = -1;
|
||||||
|
|
||||||
|
int searchReserveWord(std::string word) {
|
||||||
|
for(int i = 0; i < reserveWord.size(); i++) {
|
||||||
|
if (word == reserveWord[i]) {
|
||||||
|
return i;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
|
|
||||||
|
bool isLetter(char c) {
|
||||||
|
if(c >= 'a' && c <= 'z' || c >= 'A' && c <= 'Z' || c == '_') {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
bool isDigit(char c) {
|
||||||
|
if(c >= '0' && c <= '9') {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
int isOperatorOrDelimiter(std::string str) {
|
||||||
|
for(int i = 0; i < operatorAndDelimiterCount; i++) {
|
||||||
|
if(str == operatorOrDelimiter[i]) {
|
||||||
|
return i + reserveWordCount;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
|
|
||||||
|
bool maybeOperatorOrDelimiterWith2Letters(char c) {
|
||||||
|
for(int i = 0; i < operatorAndDelimiterCount; i++) {
|
||||||
|
if(c == operatorOrDelimiter[i][0] && operatorOrDelimiter[i].size() > 1) {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
void preprocess() {
|
||||||
|
preprocessedText = "";
|
||||||
|
bool inString = false;
|
||||||
|
for(int i = 0; i < rawText.size(); i++) {
|
||||||
|
if(rawText[i] == '/') {
|
||||||
|
if(i < rawText.size() - 1 && rawText[i + 1] == '/')
|
||||||
|
while(i < rawText.size() && rawText[i] != '\n') {
|
||||||
|
++i;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if(rawText[i] == '/') {
|
||||||
|
if(i < rawText.size() - 1 && rawText[i + 1] == '*') {
|
||||||
|
i += 2;
|
||||||
|
while(i < rawText.size() - 1 && !(rawText[i] == '*' && rawText[i + 1] == '/')) {
|
||||||
|
++i;
|
||||||
|
}
|
||||||
|
i += 2;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if(rawText[i] == '\"') {
|
||||||
|
inString = !inString;
|
||||||
|
}
|
||||||
|
if(!inString && rawText[i] == '\n') {
|
||||||
|
preprocessedText += ' ';
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
if (rawText[i] != '\t' && rawText[i] != '\v' && rawText[i] != '\r') {
|
||||||
|
preprocessedText += rawText[i];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
void Scan(int & currentIndex) {
|
||||||
|
currentToken = "";
|
||||||
|
|
||||||
|
while(preprocessedText[currentIndex] == ' ') currentIndex++;
|
||||||
|
|
||||||
|
// printf("current letter: [%d]%c\n", currentIndex, preprocessedText[currentIndex]);
|
||||||
|
|
||||||
|
if(isLetter(preprocessedText[currentIndex])) {
|
||||||
|
while(isLetter(preprocessedText[currentIndex]) || isDigit(preprocessedText[currentIndex])) {
|
||||||
|
currentToken += preprocessedText[currentIndex++];
|
||||||
|
}
|
||||||
|
|
||||||
|
syn = searchReserveWord(currentToken);
|
||||||
|
syn = syn == -1 ? static_cast<int>(TokenType::TOKEN_TYPE_IDENTIFIER) : syn;
|
||||||
|
|
||||||
|
return;
|
||||||
|
} else if(isDigit(preprocessedText[currentIndex])) {
|
||||||
|
while(isDigit(preprocessedText[currentIndex])) {
|
||||||
|
currentToken += preprocessedText[currentIndex++];
|
||||||
|
}
|
||||||
|
|
||||||
|
syn = static_cast<int>(TokenType::TOKEN_TYPE_NUMBER);
|
||||||
|
|
||||||
|
return;
|
||||||
|
} else if((isOperatorOrDelimiter(std::string(1, preprocessedText[currentIndex])) != -1) &&
|
||||||
|
!maybeOperatorOrDelimiterWith2Letters(preprocessedText[currentIndex])) {
|
||||||
|
if(preprocessedText[currentIndex] == '\"') {
|
||||||
|
currentToken += preprocessedText[currentIndex++];
|
||||||
|
while(preprocessedText[currentIndex] != '\"') {
|
||||||
|
currentToken += preprocessedText[currentIndex++];
|
||||||
|
}
|
||||||
|
currentToken += preprocessedText[currentIndex++];
|
||||||
|
syn = static_cast<int>(TokenType::TOKEN_TYPE_STRING);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
if(preprocessedText[currentIndex] == '\'') {
|
||||||
|
currentToken += preprocessedText[currentIndex++];
|
||||||
|
while(preprocessedText[currentIndex] != '\'') {
|
||||||
|
currentToken += preprocessedText[currentIndex++];
|
||||||
|
}
|
||||||
|
currentToken += preprocessedText[currentIndex++];
|
||||||
|
syn = static_cast<int>(TokenType::TOKEN_TYPE_STRING);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
currentToken += preprocessedText[currentIndex++];
|
||||||
|
syn = isOperatorOrDelimiter(currentToken);
|
||||||
|
return;
|
||||||
|
} else if(maybeOperatorOrDelimiterWith2Letters(preprocessedText[currentIndex])) {
|
||||||
|
if(currentIndex < preprocessedText.size() - 2) { // 优先匹配三个字母的符号
|
||||||
|
currentToken += preprocessedText[currentIndex];
|
||||||
|
currentToken += preprocessedText[currentIndex + 1];
|
||||||
|
currentToken += preprocessedText[currentIndex + 2];
|
||||||
|
syn = isOperatorOrDelimiter(currentToken);
|
||||||
|
if(syn != -1) {
|
||||||
|
currentIndex += 3;
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
currentToken = "";
|
||||||
|
if (currentIndex < preprocessedText.size() - 1) { // 其次匹配两个字母的符号
|
||||||
|
currentToken += preprocessedText[currentIndex];
|
||||||
|
currentToken += preprocessedText[currentIndex + 1];
|
||||||
|
if(currentToken == "//") {
|
||||||
|
syn = static_cast<int>(TokenType::TOKEN_TYPE_COMMENT);
|
||||||
|
currentIndex += 2;
|
||||||
|
while(preprocessedText[currentIndex] != '\n' && currentIndex < preprocessedText.size()) {
|
||||||
|
currentToken += preprocessedText[currentIndex++];
|
||||||
|
}
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
if(currentToken == "/*") {
|
||||||
|
syn = static_cast<int>(TokenType::TOKEN_TYPE_COMMENT);
|
||||||
|
currentIndex += 2;
|
||||||
|
while(currentIndex < rawText.size() - 1 && !(preprocessedText[currentIndex] == '*' && preprocessedText[currentIndex + 1] == '/')) {
|
||||||
|
currentToken += preprocessedText[currentIndex++];
|
||||||
|
}
|
||||||
|
currentIndex += 2;
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
syn = isOperatorOrDelimiter(currentToken);
|
||||||
|
if(syn != -1) {
|
||||||
|
currentIndex += 2;
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
currentToken = "";
|
||||||
|
currentToken += preprocessedText[currentIndex];
|
||||||
|
syn = isOperatorOrDelimiter(currentToken);
|
||||||
|
if(syn != -1) {
|
||||||
|
currentIndex += 1;
|
||||||
|
}
|
||||||
|
return;
|
||||||
|
} else if (preprocessedText[currentIndex] == '\0' || currentIndex >= preprocessedText.size()) {
|
||||||
|
syn = static_cast<int>(TokenType::TOKEN_TYPE_EOF);
|
||||||
|
currentIndex++;
|
||||||
|
return;
|
||||||
|
} else if(preprocessedText[currentIndex] != '\n') {
|
||||||
|
syn = static_cast<int>(TokenType::TOKEN_TYPE_UNDEFINED);
|
||||||
|
currentIndex++;
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
public:
|
||||||
|
Tokenizer(std::string ruleName) {
|
||||||
|
this->ruleName = ruleName;
|
||||||
|
std::ifstream file("../data/highlight/" + ruleName + "/keywords.txt"); // Just for unit test
|
||||||
|
// std::ifstream file("./data/highlight/" + ruleName + "/keywords.txt");
|
||||||
|
if (file.is_open()) {
|
||||||
|
// printf("keywords file open\n");
|
||||||
|
std::string line;
|
||||||
|
while (std::getline(file, line)) {
|
||||||
|
reserveWord.push_back(line);
|
||||||
|
}
|
||||||
|
file.close();
|
||||||
|
}
|
||||||
|
// printf("keywords size: %llu\n", reserveWord.size());
|
||||||
|
// for(int i = 0; i < reserveWord.size(); i++) {
|
||||||
|
// printf("keywords[%d]: %s\n", i, reserveWord[i].c_str());
|
||||||
|
// }
|
||||||
|
reserveWordCount = reserveWord.size();
|
||||||
|
operatorAndDelimiterCount = sizeof(operatorOrDelimiter) / sizeof(operatorOrDelimiter[0]);
|
||||||
|
}
|
||||||
|
|
||||||
|
void setRawText(std::string rawText) {
|
||||||
|
this->rawText = rawText;
|
||||||
|
preprocess();
|
||||||
|
// printf("preprocessed text: %s\n", preprocessedText.c_str());
|
||||||
|
}
|
||||||
|
|
||||||
|
void printProcessedText() {
|
||||||
|
printf("preprocessed text: %s\n", preprocessedText.c_str());
|
||||||
|
}
|
||||||
|
|
||||||
|
MyVector<Token> tokenize() {
|
||||||
|
syn = -1;
|
||||||
|
int currentIndex = 0;
|
||||||
|
tokens.clear();
|
||||||
|
while(syn != static_cast<int>(TokenType::TOKEN_TYPE_EOF) && syn != static_cast<int>(TokenType::TOKEN_TYPE_UNDEFINED)) {
|
||||||
|
Scan(currentIndex);
|
||||||
|
// printf("currentToken: [%s]\n", currentToken.c_str());
|
||||||
|
if(currentToken == "") continue;
|
||||||
|
if(syn == static_cast<int>(TokenType::TOKEN_TYPE_STRING)) {
|
||||||
|
tokens.push_back(Token(currentToken, TokenType::TOKEN_TYPE_STRING));
|
||||||
|
// printf("string: %s\n", currentToken.c_str());
|
||||||
|
} else if(syn == static_cast<int>(TokenType::TOKEN_TYPE_IDENTIFIER)) {
|
||||||
|
tokens.push_back(Token(currentToken, TokenType::TOKEN_TYPE_IDENTIFIER));
|
||||||
|
// printf("identifier: %s\n", currentToken.c_str());
|
||||||
|
} else if(syn == static_cast<int>(TokenType::TOKEN_TYPE_NUMBER)) {
|
||||||
|
tokens.push_back(Token(currentToken, TokenType::TOKEN_TYPE_NUMBER));
|
||||||
|
// printf("number: %s\n", currentToken.c_str());
|
||||||
|
} else if(syn > 0 && syn < reserveWordCount) {
|
||||||
|
tokens.push_back(Token(currentToken, TokenType::TOKEN_TYPE_RESERVE_WORD));
|
||||||
|
// printf("reserve word: %s\n", currentToken.c_str());
|
||||||
|
} else if(syn >= reserveWordCount && syn < reserveWordCount + operatorAndDelimiterCount) {
|
||||||
|
tokens.push_back(Token(currentToken, TokenType::TOKEN_TYPE_OPERATOR_OR_DELIMITER));
|
||||||
|
// printf("operator or delimiter: %s\n", currentToken.c_str());
|
||||||
|
} else if(syn == static_cast<int>(TokenType::TOKEN_TYPE_COMMENT)) {
|
||||||
|
tokens.push_back(Token(currentToken, TokenType::TOKEN_TYPE_COMMENT));
|
||||||
|
// printf("comment: %s\n", currentToken.c_str());
|
||||||
|
} else {
|
||||||
|
tokens.push_back(Token(currentToken, TokenType::TOKEN_TYPE_UNDEFINED));
|
||||||
|
// printf("undefined: %s\n", currentToken.c_str());
|
||||||
|
}
|
||||||
|
if(currentIndex >= preprocessedText.length()) {
|
||||||
|
syn = static_cast<int>(TokenType::TOKEN_TYPE_EOF);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return tokens;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
#endif // TOKENIZER_H (currently only C/C++ is supported)
|
48
utils/Tokenizer_test.cpp
Normal file
48
utils/Tokenizer_test.cpp
Normal file
@ -0,0 +1,48 @@
|
|||||||
|
#include "Tokenizer.h"
|
||||||
|
#include <iostream>
|
||||||
|
#include <string>
|
||||||
|
#include <iomanip>
|
||||||
|
|
||||||
|
int main() {
|
||||||
|
std::string input = "int main()\n\
|
||||||
|
{\n\
|
||||||
|
int a=-5,b=4,j; //this is an inline comment\n\
|
||||||
|
if(a >= b)\n\
|
||||||
|
j++;\n\
|
||||||
|
/*\n\
|
||||||
|
This is a block comment\n\
|
||||||
|
*/\n\
|
||||||
|
string str=\"test str\";\n\
|
||||||
|
j = a - b;\n\
|
||||||
|
else j=b-a;\n\
|
||||||
|
return j;\n\
|
||||||
|
}";
|
||||||
|
std::string ruleName = "cpp";
|
||||||
|
Tokenizer tokenizer = Tokenizer(ruleName);
|
||||||
|
tokenizer.setRawText(input);
|
||||||
|
tokenizer.printProcessedText();
|
||||||
|
MyVector<Token> tokens = tokenizer.tokenize();
|
||||||
|
std::cout << "Tokenized text: " << std::endl;
|
||||||
|
for (int i = 0; i < tokens.size(); i++) {
|
||||||
|
if(tokens[i].type == TokenType::TOKEN_TYPE_IDENTIFIER) {
|
||||||
|
std::cout << std::fixed << std::setw(25) << std::setfill(' ') << std::right << "Identifier: ";
|
||||||
|
} else if(tokens[i].type == TokenType::TOKEN_TYPE_NUMBER) {
|
||||||
|
std::cout << std::fixed << std::setw(25) << std::setfill(' ') << std::right << "Number: ";
|
||||||
|
} else if(tokens[i].type == TokenType::TOKEN_TYPE_OPERATOR_OR_DELIMITER) {
|
||||||
|
std::cout << std::fixed << std::setw(25) << std::setfill(' ') << std::right << "Operator or Delimiter: ";
|
||||||
|
} else if(tokens[i].type == TokenType::TOKEN_TYPE_STRING) {
|
||||||
|
std::cout << std::fixed << std::setw(25) << std::setfill(' ') << std::right << "String: ";
|
||||||
|
} else if(tokens[i].type == TokenType::TOKEN_TYPE_RESERVE_WORD) {
|
||||||
|
std::cout << std::fixed << std::setw(25) << std::setfill(' ') << std::right << "Reserve word: ";
|
||||||
|
} else if(tokens[i].type == TokenType::TOKEN_TYPE_EOF) {
|
||||||
|
std::cout << std::fixed << std::setw(25) << std::setfill(' ') << std::right << "End of file.";
|
||||||
|
} else if(tokens[i].type == TokenType::TOKEN_TYPE_COMMENT) {
|
||||||
|
std::cout << std::fixed << std::setw(25) << std::setfill(' ') << std::right << "Comment: ";
|
||||||
|
} else {
|
||||||
|
std::cout << std::fixed << std::setw(25) << std::setfill(' ') << std::right << "Unknown token: ";
|
||||||
|
}
|
||||||
|
std::cout << " " << tokens[i].value << std::endl;
|
||||||
|
}
|
||||||
|
|
||||||
|
return 0;
|
||||||
|
}
|
Loading…
Reference in New Issue
Block a user