diff --git a/components/TextEditor.h b/components/TextEditor.h new file mode 100644 index 0000000..e69de29 diff --git a/data/highlight/cpp/keywords.txt b/data/highlight/cpp/keywords.txt new file mode 100644 index 0000000..8294cbe --- /dev/null +++ b/data/highlight/cpp/keywords.txt @@ -0,0 +1,91 @@ +asm +auto +bool +break +case +catch +char +class +const +const_cast +continue +default +delete +do +double +dynamic_cast +else +enum +explicit +export +extern +false +float +for +friend +goto +if +inline +int +long +mutable +namespace +new +operator +private +protected +public +register +reinterpret_cast +return +short +signed +sizeof +static +static_cast +struct +switch +template +this +throw +true +try +typedef +typeid +typename +union +unsigned +using +virtual +void +volatile +wchar_t + +//some useful keywords +std +string +vector +list +map +set +deque +stack +queue +bitset +complex +valarray +pair +tuple +array +shared_ptr +unique_ptr +weak_ptr +function +thread +mutex +condition_variable +atomic +future +promise +packaged_task +thread_local \ No newline at end of file diff --git a/mystl/my_vector.h b/mystl/my_vector.h index ee81e59..0acdd13 100644 --- a/mystl/my_vector.h +++ b/mystl/my_vector.h @@ -88,7 +88,7 @@ public: } } - void erase(size_t index) override { + void erase(size_t index) { if (index >= m_size) { throw std::out_of_range("Index out of range"); } diff --git a/utils/SyntaxHighlighter.h b/utils/SyntaxHighlighter.h new file mode 100644 index 0000000..cd306a4 --- /dev/null +++ b/utils/SyntaxHighlighter.h @@ -0,0 +1,15 @@ +#ifndef SYNTAX_HIGHLIGHTER_H +#define SYNTAX_HIGHLIGHTER_H + +#include "Color.h" +#include "RichText.h" +#include <string> + +class SyntaxHighlighter { +private: + std::string ruleName; + + +}; + +#endif // SYNTAX_HIGHLIGHTER_H \ No newline at end of file diff --git a/utils/Tokenizer b/utils/Tokenizer new file mode 100644 index 0000000..228c6c6 Binary files /dev/null and b/utils/Tokenizer differ diff --git a/utils/Tokenizer.h b/utils/Tokenizer.h
new file mode 100644 index 0000000..20af9db --- /dev/null +++ b/utils/Tokenizer.h @@ -0,0 +1,289 @@ +#ifndef TOKENIZER_H +#define TOKENIZER_H + +#include +#include +#include +#include "../mystl/my_vector.h" + +enum class TokenType { + TOKEN_TYPE_IDENTIFIER = 9999, + TOKEN_TYPE_NUMBER = 10000, + TOKEN_TYPE_STRING = 10001, + TOKEN_TYPE_RESERVE_WORD, + TOKEN_TYPE_OPERATOR_OR_DELIMITER, + TOKEN_TYPE_EOF, + TOKEN_TYPE_UNDEFINED, + TOKEN_TYPE_COMMENT +}; + +struct Token { + std::string value; + TokenType type; + + Token() {} + Token(std::string value, TokenType type) { + this->value = value; + this->type = type; + } +}; + +static const std::string operatorOrDelimiter[] = {"+", "-", "*", "/", "<", "=", ">", ";", "(", ")", "^", ",", "\'", "#", "&", "|", "%", "~", "[", "]", "{", "}", "\\", ".", "\?", ":", "!", "\"", + "<=", ">=", "==", "!=", "&&", "<<", ">>", "||", "++", "--", "+=", "-=", "*=", "/=", "%=", "|=", "&=", "^=", "<<=", ">>=", "::", "->"}; + +class Tokenizer { +private: + std::string ruleName; + MyVector reserveWord; + std::string rawText, preprocessedText; + MyVector tokens; + std::string currentToken = ""; + int reserveWordCount, operatorAndDelimiterCount; + int syn = -1; + + int searchReserveWord(std::string word) { + for(int i = 0; i < reserveWord.size(); i++) { + if (word == reserveWord[i]) { + return i; + } + } + return -1; + } + + bool isLetter(char c) { + if(c >= 'a' && c <= 'z' || c >= 'A' && c <= 'Z' || c == '_') { + return true; + } + return false; + } + + bool isDigit(char c) { + if(c >= '0' && c <= '9') { + return true; + } + return false; + } + + int isOperatorOrDelimiter(std::string str) { + for(int i = 0; i < operatorAndDelimiterCount; i++) { + if(str == operatorOrDelimiter[i]) { + return i + reserveWordCount; + } + } + return -1; + } + + bool maybeOperatorOrDelimiterWith2Letters(char c) { + for(int i = 0; i < operatorAndDelimiterCount; i++) { + if(c == operatorOrDelimiter[i][0] && operatorOrDelimiter[i].size() > 1) { + return true; + } + } + 
return false; + } + + void preprocess() { + preprocessedText = ""; + bool inString = false; + for(int i = 0; i < rawText.size(); i++) { + if(rawText[i] == '/') { + if(i < rawText.size() - 1 && rawText[i + 1] == '/') + while(i < rawText.size() && rawText[i] != '\n') { + ++i; + } + } + if(rawText[i] == '/') { + if(i < rawText.size() - 1 && rawText[i + 1] == '*') { + i += 2; + while(i < rawText.size() - 1 && !(rawText[i] == '*' && rawText[i + 1] == '/')) { + ++i; + } + i += 2; + } + } + if(rawText[i] == '\"') { + inString = !inString; + } + if(!inString && rawText[i] == '\n') { + preprocessedText += ' '; + continue; + } + if (rawText[i] != '\t' && rawText[i] != '\v' && rawText[i] != '\r') { + preprocessedText += rawText[i]; + } + } + } + + void Scan(int & currentIndex) { + currentToken = ""; + + while(preprocessedText[currentIndex] == ' ') currentIndex++; + + // printf("current letter: [%d]%c\n", currentIndex, preprocessedText[currentIndex]); + + if(isLetter(preprocessedText[currentIndex])) { + while(isLetter(preprocessedText[currentIndex]) || isDigit(preprocessedText[currentIndex])) { + currentToken += preprocessedText[currentIndex++]; + } + + syn = searchReserveWord(currentToken); + syn = syn == -1 ? 
static_cast(TokenType::TOKEN_TYPE_IDENTIFIER) : syn; + + return; + } else if(isDigit(preprocessedText[currentIndex])) { + while(isDigit(preprocessedText[currentIndex])) { + currentToken += preprocessedText[currentIndex++]; + } + + syn = static_cast(TokenType::TOKEN_TYPE_NUMBER); + + return; + } else if((isOperatorOrDelimiter(std::string(1, preprocessedText[currentIndex])) != -1) && + !maybeOperatorOrDelimiterWith2Letters(preprocessedText[currentIndex])) { + if(preprocessedText[currentIndex] == '\"') { + currentToken += preprocessedText[currentIndex++]; + while(preprocessedText[currentIndex] != '\"') { + currentToken += preprocessedText[currentIndex++]; + } + currentToken += preprocessedText[currentIndex++]; + syn = static_cast(TokenType::TOKEN_TYPE_STRING); + return; + } + if(preprocessedText[currentIndex] == '\'') { + currentToken += preprocessedText[currentIndex++]; + while(preprocessedText[currentIndex] != '\'') { + currentToken += preprocessedText[currentIndex++]; + } + currentToken += preprocessedText[currentIndex++]; + syn = static_cast(TokenType::TOKEN_TYPE_STRING); + return; + } + currentToken += preprocessedText[currentIndex++]; + syn = isOperatorOrDelimiter(currentToken); + return; + } else if(maybeOperatorOrDelimiterWith2Letters(preprocessedText[currentIndex])) { + if(currentIndex < preprocessedText.size() - 2) { // 优先匹配三个字母的符号 + currentToken += preprocessedText[currentIndex]; + currentToken += preprocessedText[currentIndex + 1]; + currentToken += preprocessedText[currentIndex + 2]; + syn = isOperatorOrDelimiter(currentToken); + if(syn != -1) { + currentIndex += 3; + return; + } + } + currentToken = ""; + if (currentIndex < preprocessedText.size() - 1) { // 其次匹配两个字母的符号 + currentToken += preprocessedText[currentIndex]; + currentToken += preprocessedText[currentIndex + 1]; + if(currentToken == "//") { + syn = static_cast(TokenType::TOKEN_TYPE_COMMENT); + currentIndex += 2; + while(preprocessedText[currentIndex] != '\n' && currentIndex < 
preprocessedText.size()) { + currentToken += preprocessedText[currentIndex++]; + } + return; + } + if(currentToken == "/*") { + syn = static_cast(TokenType::TOKEN_TYPE_COMMENT); + currentIndex += 2; + while(currentIndex < rawText.size() - 1 && !(preprocessedText[currentIndex] == '*' && preprocessedText[currentIndex + 1] == '/')) { + currentToken += preprocessedText[currentIndex++]; + } + currentIndex += 2; + return; + } + syn = isOperatorOrDelimiter(currentToken); + if(syn != -1) { + currentIndex += 2; + return; + } + } + currentToken = ""; + currentToken += preprocessedText[currentIndex]; + syn = isOperatorOrDelimiter(currentToken); + if(syn != -1) { + currentIndex += 1; + } + return; + } else if (preprocessedText[currentIndex] == '\0' || currentIndex >= preprocessedText.size()) { + syn = static_cast(TokenType::TOKEN_TYPE_EOF); + currentIndex++; + return; + } else if(preprocessedText[currentIndex] != '\n') { + syn = static_cast(TokenType::TOKEN_TYPE_UNDEFINED); + currentIndex++; + return; + } + } + +public: + Tokenizer(std::string ruleName) { + this->ruleName = ruleName; + std::ifstream file("../data/highlight/" + ruleName + "/keywords.txt"); // Just for unit test + // std::ifstream file("./data/highlight/" + ruleName + "/keywords.txt"); + if (file.is_open()) { + // printf("keywords file open\n"); + std::string line; + while (std::getline(file, line)) { + reserveWord.push_back(line); + } + file.close(); + } + // printf("keywords size: %llu\n", reserveWord.size()); + // for(int i = 0; i < reserveWord.size(); i++) { + // printf("keywords[%d]: %s\n", i, reserveWord[i].c_str()); + // } + reserveWordCount = reserveWord.size(); + operatorAndDelimiterCount = sizeof(operatorOrDelimiter) / sizeof(operatorOrDelimiter[0]); + } + + void setRawText(std::string rawText) { + this->rawText = rawText; + preprocess(); + // printf("preprocessed text: %s\n", preprocessedText.c_str()); + } + + void printProcessedText() { + printf("preprocessed text: %s\n", preprocessedText.c_str()); + 
} + + MyVector tokenize() { + syn = -1; + int currentIndex = 0; + tokens.clear(); + while(syn != static_cast(TokenType::TOKEN_TYPE_EOF) && syn != static_cast(TokenType::TOKEN_TYPE_UNDEFINED)) { + Scan(currentIndex); + // printf("currentToken: [%s]\n", currentToken.c_str()); + if(currentToken == "") continue; + if(syn == static_cast(TokenType::TOKEN_TYPE_STRING)) { + tokens.push_back(Token(currentToken, TokenType::TOKEN_TYPE_STRING)); + // printf("string: %s\n", currentToken.c_str()); + } else if(syn == static_cast(TokenType::TOKEN_TYPE_IDENTIFIER)) { + tokens.push_back(Token(currentToken, TokenType::TOKEN_TYPE_IDENTIFIER)); + // printf("identifier: %s\n", currentToken.c_str()); + } else if(syn == static_cast(TokenType::TOKEN_TYPE_NUMBER)) { + tokens.push_back(Token(currentToken, TokenType::TOKEN_TYPE_NUMBER)); + // printf("number: %s\n", currentToken.c_str()); + } else if(syn > 0 && syn < reserveWordCount) { + tokens.push_back(Token(currentToken, TokenType::TOKEN_TYPE_RESERVE_WORD)); + // printf("reserve word: %s\n", currentToken.c_str()); + } else if(syn >= reserveWordCount && syn < reserveWordCount + operatorAndDelimiterCount) { + tokens.push_back(Token(currentToken, TokenType::TOKEN_TYPE_OPERATOR_OR_DELIMITER)); + // printf("operator or delimiter: %s\n", currentToken.c_str()); + } else if(syn == static_cast(TokenType::TOKEN_TYPE_COMMENT)) { + tokens.push_back(Token(currentToken, TokenType::TOKEN_TYPE_COMMENT)); + // printf("comment: %s\n", currentToken.c_str()); + } else { + tokens.push_back(Token(currentToken, TokenType::TOKEN_TYPE_UNDEFINED)); + // printf("undefined: %s\n", currentToken.c_str()); + } + if(currentIndex >= preprocessedText.length()) { + syn = static_cast(TokenType::TOKEN_TYPE_EOF); + } + } + return tokens; + } +}; + +#endif // TOKENIZER_H 目前仅支持C/C++ \ No newline at end of file diff --git a/utils/Tokenizer_test.cpp b/utils/Tokenizer_test.cpp new file mode 100644 index 0000000..9b465bf --- /dev/null +++ b/utils/Tokenizer_test.cpp @@ -0,0 +1,48 @@ 
+#include "Tokenizer.h" +#include +#include +#include + +int main() { + std::string input = "int main()\n\ +{\n\ + int a=-5,b=4,j; //this is an inline comment\n\ +if(a >= b)\n\ +j++;\n\ +/*\n\ +This is a block comment\n\ +*/\n\ +string str=\"test str\";\n\ +j = a - b;\n\ +else j=b-a;\n\ + return j;\n\ +}"; + std::string ruleName = "cpp"; + Tokenizer tokenizer = Tokenizer(ruleName); + tokenizer.setRawText(input); + tokenizer.printProcessedText(); + MyVector tokens = tokenizer.tokenize(); + std::cout << "Tokenized text: " << std::endl; + for (int i = 0; i < tokens.size(); i++) { + if(tokens[i].type == TokenType::TOKEN_TYPE_IDENTIFIER) { + std::cout << std::fixed << std::setw(25) << std::setfill(' ') << std::right << "Identifier: "; + } else if(tokens[i].type == TokenType::TOKEN_TYPE_NUMBER) { + std::cout << std::fixed << std::setw(25) << std::setfill(' ') << std::right << "Number: "; + } else if(tokens[i].type == TokenType::TOKEN_TYPE_OPERATOR_OR_DELIMITER) { + std::cout << std::fixed << std::setw(25) << std::setfill(' ') << std::right << "Operator or Delimiter: "; + } else if(tokens[i].type == TokenType::TOKEN_TYPE_STRING) { + std::cout << std::fixed << std::setw(25) << std::setfill(' ') << std::right << "String: "; + } else if(tokens[i].type == TokenType::TOKEN_TYPE_RESERVE_WORD) { + std::cout << std::fixed << std::setw(25) << std::setfill(' ') << std::right << "Reserve word: "; + } else if(tokens[i].type == TokenType::TOKEN_TYPE_EOF) { + std::cout << std::fixed << std::setw(25) << std::setfill(' ') << std::right << "End of file."; + } else if(tokens[i].type == TokenType::TOKEN_TYPE_COMMENT) { + std::cout << std::fixed << std::setw(25) << std::setfill(' ') << std::right << "Comment: "; + } else { + std::cout << std::fixed << std::setw(25) << std::setfill(' ') << std::right << "Unknown token: "; + } + std::cout << " " << tokens[i].value << std::endl; + } + + return 0; +} \ No newline at end of file