Files
mash/src/lex.cpp
haxala1r f93b2deda2
All checks were successful
ci/woodpecker/push/workflow Pipeline was successful
Added lexing logic for doubles
2025-09-30 20:30:05 +03:00

158 lines
3.9 KiB
C++

#include <exception>
#include <iostream>
#include <lex.hpp>
#include <string>
#include <cctype>
using namespace std;
/// Streams a human-readable rendering of a token, e.g. `Token(Int, 42)`.
///
/// Tokens that carry a payload (Symbol, String, Int, Double) print it after
/// the type name; string payloads are wrapped in double quotes. Any token
/// type without a dedicated case prints as `Token()`.
///
/// @param os Stream to write to.
/// @param t  Token to render.
/// @return `os`, so insertions can be chained.
std::ostream &operator<<(std::ostream &os, Token const &t) {
    os << "Token(";

    switch (t.type) {
    // Payload-free tokens: just the type name.
    case TokenType::OpenParen:
        os << "OpenParen)";
        break;
    case TokenType::CloseParen:
        os << "CloseParen)";
        break;
    case TokenType::Dollar:
        os << "Dollar)";
        break;

    // Tokens with a payload stored in the value variant.
    case TokenType::Symbol:
        os << "Symbol, " << std::get<std::string>(t.value) << ")";
        break;
    case TokenType::String:
        os << "String, \"" << std::get<std::string>(t.value) << "\")";
        break;
    case TokenType::Int:
        os << "Int, " << std::get<int64_t>(t.value) << ")";
        break;
    case TokenType::Double:
        os << "Double, " << std::get<double>(t.value) << ")";
        break;

    case TokenType::End:
        os << "END)";
        break;

    default:
        // Unknown type: still close the parenthesis opened above.
        os << ")";
        break;
    }

    return os;
}
/// Reports whether `c` is one of the 32 ASCII punctuation characters.
///
/// The NUL character is never punctuation. The set is walked up to, but not
/// including, the literal's terminating '\0': a range-for over a string
/// literal also visits that terminator, which would make NUL match.
///
/// @param c Character to classify.
/// @return true if `c` is ASCII punctuation, false otherwise.
bool ispunct(char c) {
    static const char kPunctuation[] = "!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~";
    for (const char *p = kPunctuation; *p != '\0'; ++p) {
        if (*p == c)
            return true;
    }
    return false;
}
/// Reports whether `c` may appear inside a symbol or number token.
///
/// A symbol character is any character with a graphical representation
/// (per `isgraph`) other than the two parentheses.
///
/// @param c Character to classify.
/// @return true if `c` can be part of a symbol/number, false otherwise.
bool isSymbolChar(char c) {
    if (c == '(' || c == ')')
        return false;
    // The <cctype> classifiers require a value representable as unsigned char
    // (or EOF); passing a negative plain char is undefined behaviour.
    return std::isgraph(static_cast<unsigned char>(c)) != 0;
}
/// Creates a lexer whose input buffer starts out holding `s`.
///
/// The stream is opened with `ate` so that its write position starts after
/// the initial text. Without it, the write position of a string stream built
/// from a string is at offset 0, and a later feed() would overwrite the
/// beginning of `s` instead of appending to it.
/// NOTE(review): assumes `ss` is a std::stringstream (declared in lex.hpp).
///
/// @param s Initial text to lex.
Lexer::Lexer(string s)
    : ss(s, ios_base::in | ios_base::out | ios_base::ate) {}

/// Creates a lexer with an empty input buffer; supply text through feed().
Lexer::Lexer() : ss("") {}
/// Appends more text to the lexer's input buffer.
///
/// @param s Text to make available to subsequent next()/collect() calls.
void Lexer::feed(string s) {
    // Once next() has run off the end of the buffer the stream carries
    // eofbit/failbit, and inserting into a failed stream is a silent no-op.
    // Reset the state first so text fed after an End token is accepted.
    ss.clear();
    ss << s;
}
/// Lexes one number or symbol token starting at the current stream position.
///
/// Consumes the maximal run of symbol characters, then classifies it:
///  - optional leading '-', digits only               -> Int
///  - optional leading '-', digits with exactly one '.' -> Double
///  - anything else (including "-", "." and "-.")     -> Symbol
///
/// @return The Int, Double or Symbol token for the consumed text.
/// @throws std::out_of_range (from stoll/stod) if a numeric literal does not
///         fit in the target type.
Token Lexer::lexNumOrSym() {
    // first we take the part that is either a number or symbol, then
    // we'll determine which it is.
    string s;
    while (true) {
        char c = ss.get();
        if (ss.eof())
            break;
        if (!isSymbolChar(c)) {
            // Not ours: leave it in the stream for the next token.
            ss.unget();
            break;
        }
        s += c;
    }

    // TODO: bigint. also reader base.
    // ... this will almost certainly change, won't it?

    // A single leading minus sign is allowed on numbers; skip it while
    // scanning so that "-5" and "-1.5" are recognised as numeric.
    const bool negative = !s.empty() && s.front() == '-';

    bool is_number = true;
    bool dot_seen = false;
    bool digit_seen = false;
    for (auto it = s.begin() + (negative ? 1 : 0); it != s.end(); ++it) {
        const char c = *it;
        if (c == '.') {
            if (dot_seen) {
                // A second dot can never be part of a number.
                is_number = false;
                break;
            }
            dot_seen = true;
        } else if (std::isdigit(static_cast<unsigned char>(c))) {
            digit_seen = true;
        } else {
            is_number = false;
            break;
        }
    }

    // Without at least one digit ("", "-", ".", "-.") the text is a symbol;
    // handing it to stoll/stod would throw std::invalid_argument.
    if (!is_number || !digit_seen)
        return {TokenType::Symbol, s};

    if (dot_seen)
        return {TokenType::Double, stod(s)};
    return {TokenType::Int, stoll(s)};
}
/// Lexes a double-quoted string literal.
///
/// Expects the stream to be positioned on the opening quote. Every character
/// up to the next '"' is taken verbatim (no escape sequences are handled).
///
/// @return A String token holding the text between the quotes.
/// @throws std::exception if the input ends before the closing quote.
Token Lexer::lexString() {
    // Drop the opening quote.
    ss.get();

    string text;
    for (;;) {
        const char ch = ss.get();

        if (ss.eof()) {
            cerr << "EOF while reading string.";
            throw std::exception();
        }

        // The closing quote ends the literal and is not part of it.
        if (ch == '"')
            return {TokenType::String, text};

        text += ch;
    }
}
/// Lexes a token that is not one of the single-character specials.
///
/// Precondition: the caller has already established that at least one
/// character is available in the stream. That character is only peeked here;
/// the dispatched-to lexer consumes it.
///
/// @return A String token if the next character is '"', otherwise the
///         number/symbol token produced by lexNumOrSym().
/// @throws std::exception if the next character is neither a quote nor a
///         symbol character.
Token Lexer::lexNonSpecial() {
    const char head = ss.peek();

    if (head == '"')
        return lexString();

    if (isSymbolChar(head))
        return lexNumOrSym();

    cerr << "Non-printable character found." << endl;
    throw std::exception();
}
/// Produces the next token from the input buffer.
///
/// Whitespace between tokens is skipped. '(' , ')' and '$' become their own
/// tokens; anything else is handed to lexNonSpecial().
///
/// @return The next token, or an End token once the input is exhausted.
/// @throws std::exception propagated from lexNonSpecial() on malformed input.
Token Lexer::next() {
    while (true) {
        // we MUST check for eof AFTER trying to get a character.
        // ss.eof() doesn't return true until we try to get another
        // character while at EOF, even if we have exhausted the stream.
        char c = ss.get();
        if (ss.eof())
            return {TokenType::End};

        // <cctype> classifiers need an unsigned-char value; a negative plain
        // char (any non-ASCII byte where char is signed) is undefined
        // behaviour.
        if (std::isspace(static_cast<unsigned char>(c)))
            continue;

        switch (c) {
        case '(': return {TokenType::OpenParen};
        case ')': return {TokenType::CloseParen};
        case '$': return {TokenType::Dollar};
        default:
            // Put the character back so the sub-lexer sees the whole token.
            ss.unget();
            return lexNonSpecial();
        }
    }
}
/// Drains the lexer, gathering every remaining token.
///
/// @return All tokens up to, but not including, the End token.
vector<Token> Lexer::collect() {
    vector<Token> tokens;
    for (;;) {
        Token tok = next();
        // End only marks exhaustion of the input; it is not stored.
        if (tok.type == TokenType::End)
            return tokens;
        tokens.push_back(tok);
    }
}
/// Convenience wrapper: lexes the whole of `s` in one call.
///
/// @param s Source text to tokenize.
/// @return Every token in `s`, without the trailing End token.
std::vector<Token> lex(std::string s) {
    return Lexer(s).collect();
}