saqut-compiler/core/Tokenizer.cpp

384 lines
8.4 KiB
C++
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#include <stdlib.h>

#include <iostream>
#include <string>
#include <string_view>
#include <vector>

#include "./Lexer.cpp"
#ifndef TOKENIZER
#define TOKENIZER
/// Base class of every token the tokenizer produces.
///
/// Derived token classes assign `type` in their constructors; a plain
/// `Token` keeps the empty type string.
class Token {
protected:
    /// Category name of the token ("string", "number", ...); empty for a plain Token.
    std::string type = "";

public:
    /// Lexer offset recorded when reading of the token began.
    int start = 0;
    /// Lexer offset recorded after the token was read.
    int end = 0;
    /// Source text of the token.
    std::string token;

    /// Returns a copy of the token's category name.
    /// Declared `const` so it can be called through `const Token&` / `const Token*`.
    std::string gettype() const {
        return this->type;
    }

    /// Virtual so derived tokens can be deleted through a `Token*`.
    virtual ~Token() = default;
};
class StringToken : public Token {
public:
StringToken(){
this->type = "string";
};
std::string context;
int size = 0;
};
class NumberToken : public Token {
public:
NumberToken(){
this->type = "number";
}
bool isFloat = false;
bool hasEpsilon = false;
int base = 10;
};
class OperatorToken : public Token {
public:
OperatorToken(){
this->type = "operator";
}
};
class DelimiterToken : public Token {
public:
DelimiterToken(){
this->type = "delimiter";
}
};
class KeywordToken : public Token {
public:
KeywordToken(){
this->type = "keyword";
}
};
class IdentifierToken : public Token {
public:
IdentifierToken(){
this->type = "identifier";
}
std::string context;
int size = 0;
};
/// Operator spellings recognised by the tokenizer.
///
/// Every two-character operator is listed before the single-character
/// operators, so a first-match scan prefers e.g. "<=" over "<".
constexpr std::string_view operators[] = {
    // Logical / comparison
    "==",
    "!=",
    "<=",
    ">=",
    "&&",
    "||",
    // Arithmetic (two characters)
    "++",
    "--",
    "<<",
    ">>",
    // Compound assignment
    "+=",
    "-=",
    "*=",
    "/=",
    "%=",
    "&=",
    "|=",
    "^=",
    // Arithmetic (single character)
    "+",
    "-",
    "*",
    "/",
    "%",
    "<",
    ">",
    // Bitwise and logical (single character)
    "^",
    "!",
    "~",
    "&",
    "|",
    // Plain assignment
    "=",
};
/// Delimiter spellings recognised by the tokenizer.
///
/// The two-character connectors come first, so a first-match scan
/// prefers "::" over ":".
constexpr std::string_view delimiters[] = {
    // Connectors (two characters)
    "->", "::",
    // Brackets
    "[", "]", "(", ")", "{", "}",
    // Separators
    ";", ",", ":",
    // Connector (single character)
    ".",
};
/// Reserved words of the language.
///
/// Entries are sorted by descending length, so a keyword that is a prefix
/// of a longer one ("throw" / "throws") always appears after it.
constexpr std::string_view keywords[] = {
    // 10 and 9 characters
    "implements", "protected", "interface",
    // 8 and 7 characters
    "continue", "private", "finally", "extends", "default",
    // 6 characters
    "throws", "switch", "return", "public", "assert",
    // 5 characters
    "false", "while", "throw", "class", "catch", "break",
    // 4 characters
    "null", "true", "enum", "else", "case",
    // 3 characters
    "new", "try", "for",
    // 2 characters
    "if", "do",
};
/// Converts source text into Token objects, using a Lexer as the character cursor.
///
/// Tokens are returned as raw pointers allocated with `new`; callers are
/// presumably responsible for deleting them — confirm against call sites.
class Tokenizer {
public:
    /// Lexer that holds the text being scanned and the current read position.
    Lexer hmx;

    /// Loads `input` into the lexer and returns the tokens read from it.
    std::vector<Token*> scan(std::string input);

    /// Reads the next token; yields a plain Token whose text is "EOL" at end of input.
    Token* scope();

    /// Reads a run of identifier characters at the current position.
    IdentifierToken* readIndetifier();

    /// Reads a double-quoted string literal at the current position.
    StringToken* readString();

    /// Advances past the remainder of a `//` comment.
    void skipOneLineComment();

    /// Advances past the remainder of a `/* ... */` comment.
    void skipMultiLineComment();
};
std::vector<Token *> Tokenizer::scan(std::string input)
{
std::vector<Token *> tokens;
this->hmx.setText(input);
while(1)
{
Token * token = this->scope();
if(token->token == "EOL")
{
break;
}
tokens.push_back(token);
if(this->hmx.isEnd())
{
break;
}
}
return tokens;
}
/// Reads and returns the next token at the lexer's current position.
///
/// Order of recognition: whitespace and comments are skipped, then
/// end of input, string literal, number, word (keyword or identifier),
/// delimiter, operator. Anything else falls through to readIndetifier().
///
/// @return A token allocated with `new`. At end of input this is a plain
///         `Token` whose text is "EOL".
///
/// Fixes over the previous version:
///  - Comments are skipped in a loop, so consecutive comments
///    (`// a` followed by `// b`, or `/* a */ // b`) are all discarded
///    instead of the second one being tokenized as `/` operators.
///  - A keyword is recognised only when it is the whole word. Previously
///    keywords were matched as prefixes, so `format` became `for` + `mat`.
Token * Tokenizer::scope()
{
    this->hmx.skipWhiteSpace();

    // Comments. include(text, true) is presumed to consume `text` when it
    // matches at the current position — confirm in Lexer.
    for (;;)
    {
        if (this->hmx.include("//", true))
        {
            this->skipOneLineComment();
            continue;
        }
        if (this->hmx.include("/*", true))
        {
            this->skipMultiLineComment();
            continue;
        }
        break;
    }

    // End of input: signalled by a plain Token with the text "EOL".
    if (this->hmx.isEnd())
    {
        Token * token = new Token();
        token->token = "EOL";
        return token;
    }

    // String literals
    if (this->hmx.getchar() == '"')
    {
        return this->readString();
    }

    // Numbers
    if (this->hmx.isNumeric())
    {
        INumber lem = this->hmx.readNumeric();
        NumberToken * numberToken = new NumberToken();
        numberToken->base = lem.base;
        numberToken->start = lem.start;
        numberToken->end = lem.end;
        numberToken->hasEpsilon = lem.hasEpsilon;
        numberToken->isFloat = lem.isFloat;
        numberToken->token = lem.token;
        return numberToken;
    }

    // Words: read the whole identifier first, then decide whether it is a
    // keyword. This gives keywords a proper word boundary.
    const char first = this->hmx.getchar();
    const bool startsWord = (first >= 'a' && first <= 'z')
                         || (first >= 'A' && first <= 'Z')
                         || first == '_'
                         || first == '$';
    if (startsWord)
    {
        IdentifierToken * word = this->readIndetifier();
        for (const std::string_view& key : keywords)
        {
            if (std::string_view(word->token) == key)
            {
                KeywordToken * keytoken = new KeywordToken();
                keytoken->start = word->start;
                keytoken->end = word->end;
                keytoken->token = word->token;
                delete word;
                return keytoken;
            }
        }
        return word;
    }

    // Delimiters. include(text, false) is presumed to only test for a match;
    // the position is then advanced explicitly with toChar().
    for (const std::string_view& del : delimiters)
    {
        if (this->hmx.include(std::string(del), false))
        {
            DelimiterToken * dtoken = new DelimiterToken();
            dtoken->start = this->hmx.getOffset();
            this->hmx.toChar(+del.size());
            dtoken->end = this->hmx.getOffset();
            dtoken->token = del;
            return dtoken;
        }
    }

    // Operators
    for (const std::string_view& op : operators)
    {
        if (this->hmx.include(std::string(op), false))
        {
            OperatorToken * optoken = new OperatorToken();
            optoken->start = this->hmx.getOffset();
            this->hmx.toChar(+op.size());
            optoken->end = this->hmx.getOffset();
            optoken->token = op;
            return optoken;
        }
    }

    // Fallback: yields an identifier token with empty text when the current
    // character is not an identifier character.
    return this->readIndetifier();
}
/// Reads a run of identifier characters (`a-z`, `A-Z`, `0-9`, `_`, `$`)
/// starting at the lexer's current position.
///
/// @return A token allocated with `new`. `token` and `context` hold the
///         characters read, `size` their count, and `start` / `end` the lexer
///         offsets before and after reading. If the current character is not
///         an identifier character, the text is empty and the lexer position
///         does not move.
///
/// Fix: `context` was never filled, so `size` was always 0. It now mirrors
/// the identifier text.
IdentifierToken * Tokenizer::readIndetifier()
{
    this->hmx.beginPosition();
    IdentifierToken * identifier = new IdentifierToken();
    identifier->start = this->hmx.getOffset();

    while (!this->hmx.isEnd())
    {
        const char c = this->hmx.getchar();
        const bool isWordChar = (c >= 'a' && c <= 'z')
                             || (c >= 'A' && c <= 'Z')
                             || (c >= '0' && c <= '9')
                             || c == '_'
                             || c == '$';
        if (!isWordChar)
        {
            break;
        }
        identifier->token.push_back(c);
        this->hmx.nextChar();
    }

    identifier->end = this->hmx.getOffset();
    identifier->context = identifier->token;
    identifier->size = static_cast<int>(identifier->context.size());
    this->hmx.acceptPosition();
    return identifier;
}
/// Reads a double-quoted string literal. The lexer is expected to be
/// positioned on the opening quote.
///
/// @return A token allocated with `new`:
///         - `token`   — the raw text, including quotes and backslashes;
///         - `context` — the characters between the quotes, with each
///                       backslash dropped and the character after it kept
///                       verbatim;
///         - `size`    — length of `context`;
///         - `start` / `end` — lexer offsets before and after reading.
///         If the input ends before a closing quote, the token holds
///         whatever was read up to that point.
///
/// Fix: after a backslash the next character was read without checking for
/// end of input. A trailing backslash now simply ends the literal.
///
/// NOTE(review): escape sequences are not decoded (`\n` contributes the
/// letter `n` to `context`). Confirm whether a later stage decodes them
/// from `token`.
StringToken * Tokenizer::readString()
{
    this->hmx.beginPosition();
    StringToken * stringToken = new StringToken();
    stringToken->start = this->hmx.getOffset();

    bool opened = false;
    while (!this->hmx.isEnd())
    {
        char c = this->hmx.getchar();
        stringToken->token.push_back(c);
        this->hmx.nextChar();

        if (c == '"')
        {
            if (opened)
            {
                break; // closing quote
            }
            opened = true; // opening quote
            continue;
        }

        if (c == '\\')
        {
            if (this->hmx.isEnd())
            {
                break; // input ends right after the backslash
            }
            // Keep the escaped character as-is in both the raw text and the content.
            c = this->hmx.getchar();
            stringToken->token.push_back(c);
            this->hmx.nextChar();
        }

        stringToken->context.push_back(c);
    }

    stringToken->end = this->hmx.getOffset();
    stringToken->size = static_cast<int>(stringToken->context.size());
    this->hmx.acceptPosition();
    return stringToken;
}
/// Advances the lexer past the rest of a single-line comment.
///
/// Characters are consumed up to and including the next '\n', after which
/// whitespace is skipped. If the input ends first, the lexer is left at the end.
void Tokenizer::skipOneLineComment()
{
    for (; !this->hmx.isEnd(); this->hmx.nextChar())
    {
        if (this->hmx.getchar() != '\n')
        {
            continue;
        }
        // Step over the line break, then over whatever whitespace follows it.
        this->hmx.nextChar();
        this->hmx.skipWhiteSpace();
        return;
    }
}
/// Advances the lexer past the rest of a block comment.
///
/// Characters are consumed one at a time until the lexer reports "*/" at the
/// current position; whitespace after it is then skipped. If the input ends
/// first, the lexer is left at the end.
void Tokenizer::skipMultiLineComment()
{
    while (!this->hmx.isEnd())
    {
        // Passing `true` presumably makes the lexer consume the terminator
        // when it matches — confirm in Lexer.
        const bool closed = this->hmx.include("*/", true);
        if (!closed)
        {
            this->hmx.nextChar();
            continue;
        }
        this->hmx.skipWhiteSpace();
        return;
    }
}
#endif