158 lines
3.9 KiB
C++
158 lines
3.9 KiB
C++
#include <exception>
|
|
#include <iostream>
|
|
#include <lex.hpp>
|
|
#include <string>
|
|
#include <cctype>
|
|
|
|
using namespace std;
|
|
|
|
std::ostream &operator<<(std::ostream &os, Token const &t) {
|
|
os << "Token(";
|
|
switch (t.type) {
|
|
case TokenType::OpenParen: os << "OpenParen)"; break;
|
|
case TokenType::CloseParen: os << "CloseParen)"; break;
|
|
case TokenType::Dollar: os << "Dollar)"; break;
|
|
case TokenType::Symbol: os << "Symbol, " << get<string>(t.value) << ")"; break;
|
|
case TokenType::String: os << "String, \"" << get<string>(t.value) << "\")"; break;
|
|
case TokenType::Int: os << "Int, " << get<int64_t>(t.value) << ")"; break;
|
|
case TokenType::Double: os << "Double, " << get<double>(t.value) << ")"; break;
|
|
case TokenType::End: os << "END)"; break;
|
|
default:
|
|
os << ")";
|
|
}
|
|
return os;
|
|
}
|
|
|
|
// Reports whether `c` is one of the ASCII punctuation characters in the
// table below.
//
// The table is scanned up to, but not including, its terminating NUL. A
// range-for over the string literal would also visit that terminator and so
// report '\0' as punctuation; walking with a pointer avoids that.
//
// Returns true if `c` appears in the table, false otherwise.
bool ispunct(char c) {
    static constexpr char kPunctuation[] = "!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~";
    for (const char *p = kPunctuation; *p != '\0'; ++p) {
        if (*p == c)
            return true;
    }
    return false;
}
|
|
|
|
// Reports whether `c` may appear inside a symbol or number token: any
// character that isgraph() accepts, except the two parentheses.
//
// The argument is converted to unsigned char before the <cctype> call.
// Passing a negative value (other than EOF) to isgraph is undefined
// behaviour, and plain char may be signed, so bytes above 0x7F would
// otherwise be unsafe to classify.
bool isSymbolChar(char c) {
    if (c == '(' || c == ')')
        return false;
    return std::isgraph(static_cast<unsigned char>(c)) != 0;
}
|
|
|
|
// Builds a lexer whose input buffer initially holds `s`.
//
// The text is inserted with operator<< instead of being handed to the stream
// constructor. NOTE(review): this assumes `ss` is a std::stringstream (its
// declaration is not in this file). A string stream constructed from a string
// without ios::ate keeps its write position at the start of the buffer, so a
// later feed() would overwrite the initial text rather than append to it.
Lexer::Lexer(string s) : ss("") {
    ss << s;
}

// Builds a lexer with an empty input buffer; text is supplied through feed().
Lexer::Lexer() : ss("") {}
|
|
|
|
// Appends `s` to the text waiting to be lexed.
//
// The stream's error flags are cleared first. A get() that runs off the end
// of the buffer (as next() and the token readers do) sets eofbit and failbit,
// and a stream in a failed state ignores insertions. Without the clear(),
// text fed after the lexer had reported End would be silently dropped.
void Lexer::feed(string s) {
    ss.clear();
    ss << s;
}
|
|
|
|
// Reads one run of symbol characters from the input and classifies it.
//
// Characters are consumed until end of input or the first character for
// which isSymbolChar() is false; that character is pushed back so the next
// token starts with it. The run is then classified:
//   - optional leading '-', then digits only             -> Int    (stoll)
//   - optional leading '-', then digits with a single '.' -> Double (stod)
//   - anything else, including "-", "." and "-."           -> Symbol
//
// Precondition: the next unread character is a symbol character, so the run
// is never empty.
//
// stoll/stod throw std::out_of_range when the literal does not fit the
// target type.
//
// TODO: bigint. also reader base.
// ... this will almost certainly change, won't it?
Token Lexer::lexNumOrSym() {
    // first we take the part that is either a number or symbol, then
    // we'll determine which it is.
    string text;
    while (true) {
        char c = ss.get();
        if (ss.eof())
            break;
        if (!isSymbolChar(c)) {
            ss.unget();
            break;
        }
        text.push_back(c);
    }

    // Skip a single leading '-' so that negative literals are examined as
    // numbers. at() (not operator[]) keeps a violated precondition visible as
    // std::out_of_range instead of undefined behaviour.
    const string::size_type start = (text.at(0) == '-') ? 1 : 0;

    bool is_number = true;
    bool dot_seen = false;
    bool digit_seen = false;
    for (string::size_type i = start; i < text.size(); ++i) {
        const char c = text[i];
        if (c == '.') {
            if (dot_seen) {
                is_number = false;
                break;
            }
            dot_seen = true;
            continue;
        }
        // Cast first: <cctype> functions are undefined for negative values.
        if (!std::isdigit(static_cast<unsigned char>(c))) {
            is_number = false;
            break;
        }
        digit_seen = true;
    }

    // A numeric literal needs at least one digit; stoll/stod would throw
    // std::invalid_argument on "-", "." or "-.", so those stay symbols.
    if (!is_number || !digit_seen)
        return {TokenType::Symbol, text};
    if (dot_seen)
        return {TokenType::Double, stod(text)};
    return {TokenType::Int, stoll(text)};
}
|
|
|
|
// Lexes a double-quoted string literal.
//
// The stream must be positioned on the opening quote, which is discarded.
// Every character up to the next '"' is stored verbatim (no escape
// processing); the closing quote is consumed but not stored. If the input
// ends before a closing quote is found, a message is written to stderr and
// std::exception is thrown.
Token Lexer::lexString() {
    ss.get(); // discard the opening quote

    string contents;
    for (;;) {
        const char ch = ss.get();
        if (ss.eof()) {
            cerr << "EOF while reading string.";
            throw std::exception();
        }
        if (ch == '"')
            return {TokenType::String, contents};
        contents.push_back(ch);
    }
}
|
|
|
|
// Lexes a token that does not start with one of the single-character
// specials handled by next().
//
// The next character is inspected without being consumed: a double quote
// hands off to lexString(), any other symbol character hands off to
// lexNumOrSym(). Anything else is reported on stderr and std::exception is
// thrown.
Token Lexer::lexNonSpecial() {
    // Callers only get here after a character has been received.
    const char lookahead = ss.peek();

    if (lookahead == '"')
        return lexString();

    if (!isSymbolChar(lookahead)) {
        cerr << "Non-printable character found." << endl;
        throw std::exception();
    }

    return lexNumOrSym();
}
|
|
|
|
// Returns the next token from the input.
//
// Leading whitespace is skipped. '(', ')' and '$' each produce their own
// payload-free token; any other character is pushed back and handed to
// lexNonSpecial(). When the input is exhausted an End token is returned.
Token Lexer::next() {
    while (true) {
        // we MUST check for eof AFTER trying to get a character.
        // ss.eof() doesn't return true until we try to get another
        // character while at EOF, even if we have exhausted the stream.
        char c = ss.get();
        if (ss.eof())
            return {TokenType::End};

        // Cast before classifying: isspace is undefined for negative values,
        // and plain char may be signed.
        if (std::isspace(static_cast<unsigned char>(c)))
            continue;

        switch (c) {
        case '(':
            return {TokenType::OpenParen};
        case ')':
            return {TokenType::CloseParen};
        case '$':
            return {TokenType::Dollar};
        default:
            // Not a single-character token: let the sub-lexer see it again.
            ss.unget();
            return lexNonSpecial();
        }
    }
}
|
|
|
|
// Drains the lexer: calls next() repeatedly and gathers every token it
// returns, in order, stopping at the first End token. The End token itself
// is not included in the result.
vector<Token> Lexer::collect() {
    vector<Token> tokens;
    for (;;) {
        Token tok = next();
        if (tok.type == TokenType::End)
            return tokens;
        tokens.push_back(tok);
    }
}
|
|
|
|
// Convenience wrapper: builds a temporary Lexer over `s` and returns all of
// its tokens as produced by Lexer::collect().
std::vector<Token> lex(std::string s) {
    return Lexer(s).collect();
}
|