// saqut-compiler/core/Tokenizer.cpp

#include <iostream>
#include <string>
#include <stdlib.h>
#include <vector>
#include "./Lexer.cpp"

#ifndef TOKENIZER
#define TOKENIZER

/**
 * Base class of every token produced by the tokenizer.
 *
 * Subclasses set `type` in their constructor; a plain Token keeps the empty
 * type string. The destructor is virtual so tokens can be deleted through a
 * `Token *`.
 */
class Token {
    protected:
        /// Category name reported by gettype(); empty for a plain Token.
        std::string type = "";
    public:
        /// Lexer offset at which the token begins.
        int start = 0;
        /// Lexer offset recorded once the token has been read.
        int end = 0;
        /// Text of the token.
        std::string token;

        /// Returns a copy of the category name.
        std::string gettype(){
            return type;
        }

        virtual ~Token() = default;
};

class StringToken : public Token {
    public:
        StringToken(){
            this->type = "string";
        };
        std::string context;
        int size = 0;
};
class NumberToken : public Token {
    public:
        NumberToken(){
            this->type = "number";
        }
        bool isFloat = false;
        bool hasEpsilon = false;
        int base = 10;
};
class OperatorToken : public Token {
    public:
        OperatorToken(){
            this->type = "operator";
        }
};
class DelimiterToken : public Token {
    public:
        DelimiterToken(){
            this->type = "delimiter";
        }
};
class KeywordToken : public Token {
    public:
        KeywordToken(){
            this->type = "keyword";
        }
};
class IdentifierToken : public Token {
    public:
        IdentifierToken(){
            this->type = "identifier";
        }
        std::string context;
        int size = 0;
};


// Operator spellings. The tokenizer tries them in array order and takes the
// first one that matches, so every two-character operator is listed before
// the one-character operators it starts with.
constexpr std::string_view operators[] = {
    // Comparison and logical
    "==", "!=", "<=", ">=",
    "&&", "||",

    // Increment / decrement and shifts
    "++", "--",
    "<<", ">>",

    // Compound assignment
    "+=", "-=", "*=", "/=", "%=",
    "&=", "|=", "^=",

    // Single-character arithmetic and relational
    "+", "-", "*", "/", "%",
    "<", ">",

    // Single-character bitwise and logical
    "^", "!", "~", "&", "|",

    // Plain assignment
    "="
};

// Delimiter spellings. The tokenizer tries them in array order and takes the
// first match, so "::" has to come before ":".
constexpr std::string_view delimiters[] = {
    // Two-character connectors
    "->", "::",
    // Brackets
    "[", "]",
    "(", ")",
    "{", "}",
    // Separators
    ";", ",", ":",
    // Member access
    "."
};

// Reserved words of the language, listed from the longest spelling to the
// shortest.
constexpr std::string_view keywords[] = {
    // 10 and 9 characters
    "implements", "protected", "interface",
    // 8 and 7 characters
    "continue", "private", "finally", "extends", "default",
    // 6 characters
    "throws", "switch", "return", "public", "assert",
    // 5 characters
    "false", "while", "throw", "class", "catch", "break",
    // 4 characters
    "null", "true", "enum", "else", "case",
    // 3 characters
    "new", "try", "for",
    // 2 characters
    "if", "do"
};


/**
 * Splits source text into tokens on top of a Lexer.
 *
 * Every token handed out is allocated with `new` and returned as a raw
 * pointer; the tokenizer never frees the tokens it returns, so the caller is
 * responsible for deleting them.
 */
class Tokenizer {
public:
    /// Character-level reader that all methods below pull their input from.
    Lexer hmx;

    // --- Driving the scan ---

    /// Loads `input` into the lexer and collects tokens until the input ends.
    std::vector<Token *> scan(std::string input);
    /// Reads one token at the current lexer position.
    Token * scope();

    // --- Token readers ---

    /// Reads a double-quoted string literal.
    StringToken * readString();
    /// Reads a run of identifier characters.
    IdentifierToken * readIndetifier();

    // --- Comment skipping ---

    /// Skips the rest of a `/* ... */` comment whose opener was already consumed.
    void skipMultiLineComment();
    /// Skips the rest of a `//` comment whose opener was already consumed.
    void skipOneLineComment();
};


std::vector<Token *> Tokenizer::scan(std::string input)
{
    std::vector<Token *> tokens;
    this->hmx.setText(input);
    while(1)
    {
        Token * token = this->scope();
        if(token->token == "EOL")
        {
            break;
        }
        tokens.push_back(token);
        if(this->hmx.isEnd())
        {
            break;
        }
    }
    return tokens;
}
/**
 * Reads the next token at the current lexer position.
 *
 * Order of recognition: whitespace and comments are skipped, then string
 * literals, numbers, words (keywords / identifiers), delimiters and operators
 * are tried in that order.
 *
 * @return A newly allocated token owned by the caller. At end of input a plain
 *         Token (empty type) whose text is "EOL" is returned.
 */
Token * Tokenizer::scope()
{
    // Skip every run of whitespace and comments. This has to loop: a comment
    // can be followed directly by another comment, and a single pass would
    // leave the second one to be tokenized as '/' operators.
    for(;;)
    {
        this->hmx.skipWhiteSpace();
        if(this->hmx.include("//", true))
        {
            this->skipOneLineComment();
            continue;
        }
        if(this->hmx.include("/*", true))
        {
            this->skipMultiLineComment();
            continue;
        }
        break;
    }

    // End of input: hand back the sentinel token.
    if(this->hmx.isEnd())
    {
        Token * token = new Token();
        token->token = "EOL";
        return token;
    }

    // String literals
    if(this->hmx.getchar() == '"')
    {
        return this->readString();
    }

    // Numbers
    if(this->hmx.isNumeric())
    {
        INumber lem = this->hmx.readNumeric();
        NumberToken * numberToken = new NumberToken();
        numberToken->base = lem.base;
        numberToken->start = lem.start;
        numberToken->end = lem.end;
        numberToken->hasEpsilon = lem.hasEpsilon;
        numberToken->isFloat = lem.isFloat;
        numberToken->token = lem.token;
        return numberToken;
    }

    // Words. The whole word is read first and only then compared with the
    // keyword list, so a keyword is recognised only when it is the complete
    // word: "double" stays one identifier instead of becoming the keyword
    // "do" followed by the identifier "uble".
    const char first = this->hmx.getchar();
    const bool startsWord =
        (first >= 'a' && first <= 'z') ||
        (first >= 'A' && first <= 'Z') ||
        first == '_' || first == '$';
    if(startsWord)
    {
        IdentifierToken * word = this->readIndetifier();
        for (const std::string_view& key : keywords) {
            if(word->token == key)
            {
                KeywordToken * keytoken = new KeywordToken();
                keytoken->start = word->start;
                keytoken->end = word->end;
                keytoken->token = word->token;
                delete word;
                return keytoken;
            }
        }
        return word;
    }

    // Delimiters: first spelling in the table that matches wins.
    for (const std::string_view& del : delimiters) {
        if(this->hmx.include(std::string(del), false))
        {
            DelimiterToken * dtoken = new DelimiterToken();
            dtoken->start = this->hmx.getOffset();
            this->hmx.toChar(del.size());
            dtoken->end = this->hmx.getOffset();
            dtoken->token = del;
            return dtoken;
        }
    }

    // Operators: first spelling in the table that matches wins.
    for (const std::string_view& op : operators) {
        if(this->hmx.include(std::string(op), false))
        {
            OperatorToken * optoken = new OperatorToken();
            optoken->start = this->hmx.getOffset();
            this->hmx.toChar(op.size());
            optoken->end = this->hmx.getOffset();
            optoken->token = op;
            return optoken;
        }
    }

    // Nothing matched. The identifier reader consumes nothing when the current
    // character is not an identifier character (for example '@'), which would
    // make the caller see the same position forever. In that case the
    // offending character is placed into the token and skipped so that every
    // call makes progress.
    IdentifierToken * fallback = this->readIndetifier();
    if(fallback->token.empty() && !this->hmx.isEnd())
    {
        fallback->token.push_back(this->hmx.getchar());
        this->hmx.nextChar();
        fallback->end = this->hmx.getOffset();
        fallback->context = fallback->token;
        fallback->size = static_cast<int>(fallback->token.size());
    }
    return fallback;
}
/**
 * Reads an identifier starting at the current lexer position.
 *
 * Consumes the longest run of ASCII letters, digits, '_' and '$'. The run may
 * be empty, in which case nothing is consumed and the token text is empty.
 *
 * @return A newly allocated IdentifierToken owned by the caller. `token` and
 *         `context` both hold the identifier text, `size` its length, and
 *         `start` / `end` the lexer offsets before and after the run.
 */
IdentifierToken * Tokenizer::readIndetifier()
{
    this->hmx.beginPosition();
    IdentifierToken * idenditifierToken = new IdentifierToken();
    idenditifierToken->start = this->hmx.getOffset();

    while(!this->hmx.isEnd())
    {
        const char c = this->hmx.getchar();
        const bool isIdentifierChar =
            (c >= 'a' && c <= 'z') ||
            (c >= 'A' && c <= 'Z') ||
            (c >= '0' && c <= '9') ||
            c == '_' || c == '$';
        if(!isIdentifierChar)
        {
            break;
        }
        idenditifierToken->token.push_back(c);
        this->hmx.nextChar();
    }

    idenditifierToken->end = this->hmx.getOffset();
    // `context` used to be left empty, which made `size` always 0. Mirror the
    // identifier text into it so that both fields describe what was read.
    idenditifierToken->context = idenditifierToken->token;
    idenditifierToken->size = static_cast<int>(idenditifierToken->context.size());
    this->hmx.acceptPosition();
    return idenditifierToken;
}
/**
 * Reads a double-quoted string literal starting at the current lexer position.
 *
 * `token` receives the raw text exactly as written, including both quotes and
 * any backslashes. `context` receives the content only: the quotes are left
 * out and, for a backslash escape, the backslash is dropped and the character
 * after it is stored unchanged (escapes are not translated). Reading stops
 * after the closing quote or at end of input, whichever comes first.
 *
 * @return A newly allocated StringToken owned by the caller, with `size` set
 *         to the length of `context`.
 */
StringToken * Tokenizer::readString()
{
    this->hmx.beginPosition();
    StringToken * stringToken = new StringToken();
    bool started = false;   // opening quote seen
    bool isended = false;   // closing quote seen
    stringToken->start = this->hmx.getOffset();

    while(!this->hmx.isEnd())
    {
        char c = this->hmx.getchar();
        stringToken->token.push_back(c);

        if(c == '"')
        {
            // The first quote opens the literal, the next unescaped one closes it.
            if(!started)
            {
                started = true;
            }else{
                isended = true;
            }
        }
        else if(c == '\\')
        {
            this->hmx.nextChar();
            // A backslash that is the very last character of the input has
            // nothing to escape; stop here instead of reading past the end.
            if(this->hmx.isEnd())
            {
                break;
            }
            c = this->hmx.getchar();
            stringToken->token.push_back(c);
            stringToken->context.push_back(c);
        }
        else
        {
            stringToken->context.push_back(c);
        }

        this->hmx.nextChar();
        if(isended)
        {
            break;
        }
    }

    stringToken->end = this->hmx.getOffset();
    stringToken->size = static_cast<int>(stringToken->context.size());
    this->hmx.acceptPosition();
    return stringToken;
}
/**
 * Skips the remainder of a single-line comment.
 *
 * Consumes characters up to and including the next newline, then skips the
 * whitespace that follows it. If the input ends first, it simply stops there.
 */
void Tokenizer::skipOneLineComment()
{
    while(!this->hmx.isEnd())
    {
        const bool atNewline = (this->hmx.getchar() == '\n');
        this->hmx.nextChar();
        if(atNewline)
        {
            this->hmx.skipWhiteSpace();
            return;
        }
    }
}
/**
 * Skips the remainder of a block comment.
 *
 * Advances one character at a time until the lexer consumes the closing
 * marker, then skips the whitespace after it. An unterminated comment runs to
 * the end of the input.
 */
void Tokenizer::skipMultiLineComment()
{
    while(!this->hmx.isEnd())
    {
        if(!this->hmx.include("*/", true))
        {
            this->hmx.nextChar();
            continue;
        }
        this->hmx.skipWhiteSpace();
        return;
    }
}

#endif