diff --git a/utils/Tokenizer_test.cpp b/utils/LexicalAnalysis.cpp
similarity index 79%
rename from utils/Tokenizer_test.cpp
rename to utils/LexicalAnalysis.cpp
index 9b465bf..8858739 100644
--- a/utils/Tokenizer_test.cpp
+++ b/utils/LexicalAnalysis.cpp
@@ -1,4 +1,4 @@
-#include "Tokenizer.h"
+#include "LexicalAnalysis.h"
 #include
 #include
 #include
@@ -18,10 +18,10 @@ else j=b-a;\n\
 return j;\n\
 }";
     std::string ruleName = "cpp";
-    Tokenizer tokenizer = Tokenizer(ruleName);
-    tokenizer.setRawText(input);
-    tokenizer.printProcessedText();
-    MyVector tokens = tokenizer.tokenize();
+    LexicalAnalysis lexicalAnalysis = LexicalAnalysis(ruleName);
+    lexicalAnalysis.setRawText(input);
+    lexicalAnalysis.printProcessedText();
+    MyVector tokens = lexicalAnalysis.tokenize();
     std::cout << "Tokenized text: " << std::endl;
     for (int i = 0; i < tokens.size(); i++) {
         if(tokens[i].type == TokenType::TOKEN_TYPE_IDENTIFIER) {
@@ -38,11 +38,16 @@ else j=b-a;\n\
             std::cout << std::fixed << std::setw(25) << std::setfill(' ') << std::right << "End of file.";
         } else if(tokens[i].type == TokenType::TOKEN_TYPE_COMMENT) {
             std::cout << std::fixed << std::setw(25) << std::setfill(' ') << std::right << "Comment: ";
+        } else if(tokens[i].type == TokenType::TOKEN_TYPE_NEWLINE) {
+            std::cout << std::fixed << std::setw(25) << std::setfill(' ') << std::right << "Newline ";
         } else {
             std::cout << std::fixed << std::setw(25) << std::setfill(' ') << std::right << "Unknown token: ";
         }
         std::cout << " " << tokens[i].value << std::endl;
     }
+    for (int i = 0; i < tokens.size(); i++) {
+        std::cout << tokens[i].value;
+    }
     return 0;
 }
\ No newline at end of file
diff --git a/utils/Tokenizer.h b/utils/LexicalAnalysis.h
similarity index 90%
rename from utils/Tokenizer.h
rename to utils/LexicalAnalysis.h
index 20af9db..4359d64 100644
--- a/utils/Tokenizer.h
+++ b/utils/LexicalAnalysis.h
@@ -1,5 +1,5 @@
-#ifndef TOKENIZER_H
-#define TOKENIZER_H
+#ifndef LEXICALANALYSIS_H
+#define LEXICALANALYSIS_H

 #include
 #include
@@ -14,7 +14,8 @@ enum class TokenType {
     TOKEN_TYPE_OPERATOR_OR_DELIMITER,
     TOKEN_TYPE_EOF,
     TOKEN_TYPE_UNDEFINED,
-    TOKEN_TYPE_COMMENT
+    TOKEN_TYPE_COMMENT,
+    TOKEN_TYPE_NEWLINE
 };

 struct Token {
@@ -31,7 +32,7 @@ struct Token {
 static const std::string operatorOrDelimiter[] = {"+", "-", "*", "/", "<", "=", ">", ";", "(", ")", "^", ",", "\'", "#", "&", "|", "%", "~", "[", "]", "{", "}", "\\", ".", "\?", ":", "!", "\"", "<=", ">=", "==", "!=", "&&", "<<", ">>", "||", "++", "--", "+=", "-=", "*=", "/=", "%=", "|=", "&=", "^=", "<<=", ">>=", "::", "->"};

-class Tokenizer {
+class LexicalAnalysis {
 private:
     std::string ruleName;
     MyVector reserveWord;
@@ -83,6 +84,8 @@ private:
     }

     void preprocess() {
+        preprocessedText = rawText;
+        return;
         preprocessedText = "";
         bool inString = false;
         for(int i = 0; i < rawText.size(); i++) {
@@ -117,7 +120,9 @@ private:
     void Scan(int & currentIndex) {
         currentToken = "";
-        while(preprocessedText[currentIndex] == ' ') currentIndex++;
+        while(preprocessedText[currentIndex] == ' ') {
+            currentToken += preprocessedText[currentIndex++];
+        }

         // printf("current letter: [%d]%c\n", currentIndex, preprocessedText[currentIndex]);
@@ -190,6 +195,7 @@ private:
             while(currentIndex < rawText.size() - 1 && !(preprocessedText[currentIndex] == '*' && preprocessedText[currentIndex + 1] == '/')) {
                 currentToken += preprocessedText[currentIndex++];
             }
+            currentToken += "/*";
             currentIndex += 2;
             return;
         }
@@ -210,7 +216,12 @@ private:
             syn = static_cast<int>(TokenType::TOKEN_TYPE_EOF);
             currentIndex++;
             return;
-        } else if(preprocessedText[currentIndex] != '\n') {
+        } else if(preprocessedText[currentIndex] == '\n') {
+            syn = static_cast<int>(TokenType::TOKEN_TYPE_NEWLINE);
+            currentToken = "\n";
+            currentIndex++;
+            return;
+        } else {
             syn = static_cast<int>(TokenType::TOKEN_TYPE_UNDEFINED);
             currentIndex++;
             return;
@@ -218,7 +229,7 @@ private:
     }

 public:
-    Tokenizer(std::string ruleName) {
+    LexicalAnalysis(std::string ruleName) {
         this->ruleName = ruleName;
         std::ifstream file("../data/highlight/" + ruleName + "/keywords.txt"); // Just for unit test
         // std::ifstream file("./data/highlight/" + ruleName + "/keywords.txt");
@@ -254,8 +265,7 @@ public:
         tokens.clear();
         while(syn != static_cast<int>(TokenType::TOKEN_TYPE_EOF) && syn != static_cast<int>(TokenType::TOKEN_TYPE_UNDEFINED)) {
             Scan(currentIndex);
-            // printf("currentToken: [%s]\n", currentToken.c_str());
-            if(currentToken == "") continue;
+            printf("currentToken: [%s]\n", currentToken.c_str());
             if(syn == static_cast<int>(TokenType::TOKEN_TYPE_STRING)) {
                 tokens.push_back(Token(currentToken, TokenType::TOKEN_TYPE_STRING));
                 // printf("string: %s\n", currentToken.c_str());
@@ -274,6 +284,12 @@ public:
             } else if(syn == static_cast<int>(TokenType::TOKEN_TYPE_COMMENT)) {
                 tokens.push_back(Token(currentToken, TokenType::TOKEN_TYPE_COMMENT));
                 // printf("comment: %s\n", currentToken.c_str());
+            } else if(syn == static_cast<int>(TokenType::TOKEN_TYPE_EOF)) {
+                tokens.push_back(Token(currentToken, TokenType::TOKEN_TYPE_EOF));
+                // printf("EOF: %s\n", currentToken.c_str());
+            } else if(syn == static_cast<int>(TokenType::TOKEN_TYPE_NEWLINE)) {
+                tokens.push_back(Token(currentToken, TokenType::TOKEN_TYPE_NEWLINE));
+                // printf("newline: %s\n", currentToken.c_str());
             } else {
                 tokens.push_back(Token(currentToken, TokenType::TOKEN_TYPE_UNDEFINED));
                 // printf("undefined: %s\n", currentToken.c_str());
diff --git a/utils/Tokenizer b/utils/Tokenizer
deleted file mode 100644
index 228c6c6..0000000
Binary files a/utils/Tokenizer and /dev/null differ