Untested additions to lexer - almost complete now
This commit is contained in:
@@ -7,6 +7,7 @@
|
||||
enum TokenType {
|
||||
OpenParen,
|
||||
CloseParen,
|
||||
Dollar,
|
||||
Symbol,
|
||||
String,
|
||||
Int,
|
||||
@@ -16,7 +17,7 @@ enum TokenType {
|
||||
// Plain Old Data
|
||||
struct Token {
|
||||
enum TokenType type;
|
||||
std::variant<int, std::string> value;
|
||||
std::variant<int64_t, std::string> value;
|
||||
};
|
||||
|
||||
class Lexer {
|
||||
@@ -24,6 +25,10 @@ private:
|
||||
// we use a stringstream for lexing purposes
|
||||
std::stringstream ss;
|
||||
|
||||
Token lexNumOrSym();
|
||||
Token lexString();
|
||||
Token lexNonSpecial();
|
||||
|
||||
public:
|
||||
Lexer(std::string);
|
||||
Lexer();
|
||||
|
113
src/lex.cpp
113
src/lex.cpp
@@ -1,15 +1,122 @@
|
||||
#include <cstdlib>
|
||||
#include <exception>
|
||||
#include <iostream>
|
||||
#include <lex.hpp>
|
||||
#include <string>
|
||||
#include <cctype>
|
||||
#include <cstring>
|
||||
|
||||
// Builds a lexer whose stream initially contains `s`.
// The stream is opened with `ate` so that the put position starts at the end
// of `s`; with the default mode it starts at offset 0 and any later
// `ss << more` would overwrite the beginning of the initial input instead of
// appending to it. The get position still starts at the beginning.
Lexer::Lexer(std::string s)
    : ss(s, std::ios_base::in | std::ios_base::out | std::ios_base::ate) {}
|
||||
using namespace std;
|
||||
|
||||
// Returns true when `c` is one of the ASCII punctuation characters listed
// below, false otherwise.
bool ispunct(char c) {
    static const char kPunctuation[] = "!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~";
    // strchr also matches the terminating NUL of the table, so '\0' has to be
    // rejected explicitly; it is not a punctuation character.
    return c != '\0' && std::strchr(kPunctuation, c) != nullptr;
}
|
||||
|
||||
// Returns true when `c` may appear inside a symbol/number token: any
// character with a graphical representation except the two parentheses.
bool isSymbolChar(char c) {
    if (c == '(' || c == ')') {
        return false;
    }
    // The <cctype> classifiers require a value representable as unsigned char
    // (or EOF); passing a negative plain char is undefined behaviour.
    return std::isgraph(static_cast<unsigned char>(c)) != 0;
}
|
||||
|
||||
// Builds a lexer whose stream initially contains `s`.
// The stream is opened with `ate` so that the put position starts at the end
// of `s`; with the default mode it starts at offset 0 and any later
// `ss << more` would overwrite the beginning of the initial input instead of
// appending to it. The get position still starts at the beginning.
Lexer::Lexer(string s)
    : ss(s, std::ios_base::in | std::ios_base::out | std::ios_base::ate) {}
|
||||
// Builds a lexer with no initial input; the stream starts out empty.
Lexer::Lexer() : ss(std::string()) {}
|
||||
|
||||
void Lexer::feed(std::string s) {
|
||||
void Lexer::feed(string s) {
|
||||
ss << s;
|
||||
}
|
||||
|
||||
// Lexes one maximal run of symbol characters and classifies it.
//
// Characters are consumed from the stream until EOF or the first character
// rejected by isSymbolChar (which is pushed back). The collected text becomes
// an Int token when it is an optional leading '-' followed by at least one
// decimal digit; anything else (including a lone "-" and text containing '.')
// becomes a Symbol token carrying the raw text.
Token Lexer::lexNumOrSym() {
    // first we take the part that is either a number or symbol, then
    // we'll determine which it is.
    std::string text;
    while (true) {
        char c = ss.get();
        if (ss.eof())
            break;
        if (!isSymbolChar(c)) {
            ss.unget();
            break;
        }
        text.push_back(c);
    }

    // TODO: bigint. also reader base.
    // ... this will almost certainly change, won't it?
    // Skip an optional sign, then require every remaining character to be a
    // digit. Requiring at least one digit keeps "" and "-" out of the Int case.
    const std::string::size_type first_digit =
        (!text.empty() && text[0] == '-') ? 1 : 0;
    bool is_number = text.size() > first_digit;
    for (std::string::size_type i = first_digit; is_number && i < text.size(); ++i) {
        // Cast: <cctype> classifiers are undefined for negative plain chars.
        is_number = std::isdigit(static_cast<unsigned char>(text[i])) != 0;
    }

    if (is_number) {
        // strtoll saturates at LLONG_MIN/LLONG_MAX on overflow, whereas atoll
        // has undefined behaviour for out-of-range input.
        return {TokenType::Int,
                static_cast<int64_t>(std::strtoll(text.c_str(), nullptr, 10))};
    }
    return {TokenType::Symbol, text};
}
|
||||
|
||||
// Lexes a double-quoted string literal.
//
// The stream must be positioned on the opening quote. Every character up to
// (but not including) the next '"' becomes the token's text; no escape
// sequences are interpreted. If the stream ends before a closing quote is
// found, a message is written to stderr and std::exception is thrown.
Token Lexer::lexString() {
    ss.get(); // consume the opening quote
    std::string contents;
    for (;;) {
        const char ch = ss.get();
        if (ss.eof()) {
            std::cerr << "EOF while reading string.";
            throw std::exception();
        }
        if (ch == '"') {
            return {TokenType::String, contents};
        }
        contents.push_back(ch);
    }
}
|
||||
|
||||
// Lexes a token that does not start with one of the single-character
// specials: a string literal when the next character is '"', otherwise a
// number or symbol. Callers are expected to invoke this only when at least
// one character is available in the stream. A leading character that is
// neither a quote nor a symbol character is reported on stderr and raises
// std::exception.
Token Lexer::lexNonSpecial() {
    const char lookahead = ss.peek();
    if (lookahead == '"') {
        return lexString();
    }
    if (!isSymbolChar(lookahead)) {
        std::cerr << "Non-printable character found." << std::endl;
        throw std::exception();
    }
    return lexNumOrSym();
}
|
||||
|
||||
// Returns the next token from the stream.
//
// Whitespace is skipped. '(', ')' and '$' each produce their own token; any
// other character is pushed back and handed to lexNonSpecial. When the stream
// is exhausted an End token is returned.
Token Lexer::next() {
    // (The placeholder `return {TokenType::CloseParen};` that used to sit here
    // made the loop below unreachable and has been removed.)
    while (true) {
        // we MUST check for eof AFTER trying to get a character.
        // ss.eof() doesn't return true until we try to get another
        // character while at EOF, even if we have exhausted the stream.
        char c = ss.get();
        if (ss.eof())
            return {TokenType::End};

        // Cast: <cctype> classifiers are undefined for negative plain chars.
        if (std::isspace(static_cast<unsigned char>(c)))
            continue;
        switch (c) {
        case '(': return {TokenType::OpenParen};
        case ')': return {TokenType::CloseParen};
        case '$': return {TokenType::Dollar};
        default:
            ss.unget();
            return lexNonSpecial();
        }
    }
}
|
||||
|
||||
|
||||
|
Reference in New Issue
Block a user