mutable
A Database System for Research and Fast Prototyping
Loading...
Searching...
No Matches
Lexer.cpp
Go to the documentation of this file.
1#include "lex/Lexer.hpp"
2
3#include <cctype>
4
5
6#define UNDO(CHR) { in.putback(c_); c_ = CHR; pos_.column--; } // un-read one character: put the current c_ back into `in`, make CHR the current character again, and move the column back by one
7
8
9using namespace m;
10using namespace m::ast;
11
12
14{ // body of initialize_keywords(): fills keywords_ with one entry per keyword (the signature line is not part of this listing)
15#define M_KEYWORD(tok, text) keywords_.emplace(pool(#text), TK_##tok); // maps the pooled spelling of `text` to the token type TK_<tok>
16#include <mutable/tables/Keywords.tbl> // presumably a list of M_KEYWORD(...) invocations -- table contents are not visible here, confirm
17#undef M_KEYWORD
18}
19
21{ // body of next(): skips whitespace and "--" comments, then dispatches on c_ to produce one Token (the signature line is not part of this listing)
22 /* skip whitespace and comments */
23 for (;;) {
24 switch (c_) {
25 case EOF: return Token(pos_, pool("EOF"), TK_EOF); // end of input: TK_EOF token at the current position
26 case ' ': case '\t': case '\v': case '\f': case '\n': case '\r': step(); continue; // consume one whitespace character and keep skipping
27
28 case '-': {
29 step();
30 if (c_ == '-') {
31 /* "--" starts a comment; skip up to (not including) the next newline or EOF */
32 do step(); while (c_ != EOF and c_ != '\n');
33 continue;
34 } else {
35 /* a lone '-' is TK_MINUS: restore it so the punctuator switch below lexes it */
36 UNDO('-');
37 goto after;
38 }
39 }
40
41 default: goto after;
42 }
43 }
44after:
45
46 start_ = pos_; // the token starts at the first non-skipped character
47 buf_.clear(); // start collecting the token text from scratch
48
49 switch (c_) {
50 case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9':
51 return read_number();
52
53 case '"':
54 return read_string_literal();
55
56 case 'd': {
57 step();
58 if (c_ == '\'') {
59 buf_.push_back('d'); // d'...' is a date/datetime literal; keep the prefix 'd' in the token text
60 return read_date_or_datetime();
61 } else {
62 UNDO('d'); // plain 'd': restore it. NOTE(review): listing line 63 is absent from this view (presumably a `break;` so control leaves the switch) -- confirm against the real file
64 }
65 }
66
67 /* Punctuators */
68#define LEX(chr, text, tt, SUB) case chr: step(); switch (c_) { SUB } return Token(start_, pool(text), tt); // consume chr; SUB may return a longer token based on the next character, otherwise return the token `text`
69#define GUESS(first, SUB) case first: step(); switch (c_) { SUB } UNDO(first); break; // consume first; if SUB does not return a token, restore first and leave the switch
70 LEX('(', "(", TK_LPAR, );
71 LEX(')', ")", TK_RPAR, );
72 LEX('~', "~", TK_TILDE, );
73 LEX('+', "+", TK_PLUS, );
74 LEX('-', "-", TK_MINUS, );
75 LEX('*', "*", TK_ASTERISK, );
76 LEX('/', "/", TK_SLASH, );
77 LEX('%', "%", TK_PERCENT, );
78 LEX('=', "=", TK_EQUAL, );
79 GUESS('!',
80 LEX('=', "!=", TK_BANG_EQUAL, ) );
81 LEX('<', "<", TK_LESS,
82 LEX('=', "<=", TK_LESS_EQUAL, ) );
83 LEX('>', ">", TK_GREATER,
84 LEX('=', ">=", TK_GREATER_EQUAL, ) );
85 LEX(',', ",", TK_COMMA, );
86 LEX(';', ";", TK_SEMICOL, );
87 LEX('.', ".", TK_DOT,
88 LEX('.', "..", TK_DOTDOT, )
89 case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9':
90 UNDO('.');
91 return read_number(););
92 case '\\':
93 return read_instruction(); // a backslash starts an instruction token
94
95#undef LEX
96#undef GUESS
97
98 default: /* fallthrough */;
99 }
100
101 if ('_' == c_ or is_alpha(c_)) return read_keyword_or_identifier(); // identifiers and keywords start with '_' or a letter
102
103 push(); // nothing matched: take the offending character (presumably push() appends c_ to buf_ and advances -- defined in Lexer.hpp)
104 const auto str = internalize();
105 diag.e(start_) << "illegal character '" << str << "'\n";
106 return Token(start_, std::move(str), TK_ERROR);
107}
108
109
110/*====================================================================================================================*/
111//
112// Lexer functions
113//
114/*====================================================================================================================*/
115
117{ // body of read_keyword_or_identifier(): reads a run of '_' / alphanumeric characters and classifies it (the signature line is not part of this listing)
118 while ('_' == c_ or is_alnum(c_))
119 push();
120 const auto str = internalize();
121 auto it = keywords_.find(str); // a spelling registered in keywords_ is a keyword, anything else an identifier
122 if (it == keywords_.end()) return Token(start_, std::move(str), TK_IDENTIFIER);
123 else return Token(start_, std::move(str), it->second);
124}
125
127{ // body of read_number(): lexes an octal/decimal/hexadecimal integer or a decimal/hexadecimal float; returns TK_ERROR on malformed input (the signature line is not part of this listing)
128 bool is_float = false; // set once a '.' or an exponent has been consumed
129 bool empty = true; // true while a required digit sequence has not yet contributed a digit
130 enum { Oct, Dec, Hex } is, has; // `is`: base announced by the prefix; `has`: base implied by the digits actually read
131
132 /*-- Prefix ----------------------*/
133 is = Dec;
134 if ('0' == c_) { is = Oct; empty = false; push(); } // a leading '0' announces octal and already counts as a digit
135 if ('x' == c_ || 'X' == c_) { is = Hex; empty = true; push(); } // hex prefix: digits are required again after the 'x'
136 has = is;
137
138 /*-- sequence before dot ---------*/
139 for (;;) {
140 if (is == Oct && is_oct(c_)) /* OK */;
141 else if (is == Oct && is_dec(c_)) has = Dec; // digit 8 or 9 after an octal prefix: remember the mismatch
142 else if (is == Dec && is_dec(c_)) /* OK */;
143 else if (is == Hex && is_hex(c_)) /* OK */;
144 else break;
145 empty = false;
146 push();
147 }
148
149 /*-- the dot ---------------------*/
150 if ('.' == c_) {
151 push();
152 is_float = true;
153 if (is == Oct) is = Dec; // there are no octal floating point constants
154 if (has == Oct) has = Dec;
155
156 /*-- sequence after dot ------*/
157 if (is == Dec) { if (is_dec(c_)) empty = false; while (is_dec(c_)) push(); }
158 else { M_insist(is == Hex); if (is_hex(c_)) empty = false; while (is_hex(c_)) push(); }
159 }
160
161 /*-- exponent part ---------------*/
162 if ((is == Oct && ('e' == c_ || 'E' == c_)) ||
163 (is == Dec && ('e' == c_ || 'E' == c_)) ||
164 (is == Hex && ('p' == c_ || 'P' == c_))) {
165 push();
166 is_float = true;
167 if (is == Oct) is = Dec; // there are no octal floating point constants
168 if (has == Oct) has = Dec;
169 if ('-' == c_ || '+' == c_) push(); // optional exponent sign
170 empty = true; // the exponent needs at least one decimal digit of its own
171 while (is_dec(c_)) { empty = false; push(); }
172 }
173
174 if (empty or is != has) { // a required digit sequence is missing, or the digits do not fit the announced base
175 const auto str = internalize();
176 diag.e(start_) << "invalid number '" << str << "'\n";
177 return Token(start_, std::move(str), TK_ERROR);
178 }
179 TokenType tt;
180 switch (is) { // every enumerator is handled, so tt is assigned on all paths
181 case Oct: tt = TK_OCT_INT; break;
182 case Dec: tt = is_float ? TK_DEC_FLOAT : TK_DEC_INT; break;
183 case Hex: tt = is_float ? TK_HEX_FLOAT : TK_HEX_INT; break;
184 }
185 return Token(start_, internalize(), tt);
186}
187
189{ // body of read_string_literal(): reads a double-quoted string with backslash escapes; returns TK_ERROR when unterminated or when an escape is invalid (the signature line is not part of this listing)
190 push(); // initial '"'
191 bool invalid = false; // set when an unsupported escape sequence is seen; reported only after the closing quote
192 while (EOF != c_ and '"' != c_) {
193 if (c_ == '\\') { // escape character
194 push();
195 switch (c_) {
196 default:
197 /* invalid escape sequence */
198 invalid = true;
199 /* fallthrough */
200 case '"':
201 case '\\':
202 case 'n':
203 case 't':
204 /* valid escape sequence; the escaped character is consumed in either case */
205 push(); // NOTE(review): also reached when c_ is EOF right after a backslash -- confirm push() tolerates that
206 }
207 } else {
208 push();
209 }
210 }
211
212 if ('"' != c_) { // the loop stopped at EOF, so the closing quote is missing
213 const auto str = internalize();
214 diag.e(start_) << "unterminated string literal '" << str << "'\n";
215 return Token(start_, std::move(str), TK_ERROR);
216 }
217
218 push(); // terminal '"'
219 const auto str = internalize();
220
221 if (invalid) {
222 diag.e(start_) << "invalid escape sequence in string literal '" << str << "'\n";
223 return Token(start_, std::move(str), TK_ERROR);
224 }
225
226 return Token(start_, std::move(str), TK_STRING_LITERAL);
227}
228
230{ // body of read_date_or_datetime(): reads '[-]YYYY-MM-DD' or '[-]YYYY-MM-DD HH:MM:SS' (next() has already stored the prefix 'd'); returns TK_DATE / TK_DATE_TIME, or TK_ERROR if malformed or unterminated (the signature line is not part of this listing)
231 push(); // initial '''
232 bool invalid = false; // set on any missing digit or separator; reported only after the closing quote
233 bool datetime = false; // set once the ' ' introducing the time part has been accepted
234
235#define DIGITS(num) for (auto i = 0; i < num; ++i) if (is_dec(c_)) push(); else invalid = true; // expect exactly num decimal digits; a non-digit is not consumed
236 accept('-'); // optional sign, for years BC
237 DIGITS(4); // year
238 invalid |= not accept('-'); // a missing separator makes the literal invalid (`&=` could never set, and could even clear, the flag)
239 DIGITS(2); // month
240 invalid |= not accept('-');
241 DIGITS(2); // day
242
243 if (accept(' ')) {
244 datetime = true;
245 DIGITS(2) // hours
246 invalid |= not accept(':');
247 DIGITS(2); // minutes
248 invalid |= not accept(':');
249 DIGITS(2); // seconds
250 }
251#undef DIGITS
252
253 if ('\'' != c_) { // closing quote missing: report with whatever was collected so far
254 const auto str = internalize();
255 diag.e(start_) << "unterminated " << (datetime ? "datetime" : "date") << " '" << str << "'\n";
256 return Token(start_, std::move(str), TK_ERROR);
257 }
258
259 push(); // terminal '''
260 const auto str = internalize();
261
262 if (invalid) {
263 diag.e(start_) << "invalid symbol in " << (datetime ? "datetime" : "date") << " '" << str << "'\n";
264 return Token(start_, std::move(str), TK_ERROR);
265 }
266
267 return Token(start_, std::move(str), datetime ? TK_DATE_TIME : TK_DATE);
268}
269
271{ // body of read_instruction(): collects a backslash-introduced instruction up to, but not including, the next ';' or EOF (the signature line is not part of this listing)
272 push(); // initial '\'
273 while (';' != c_ and EOF != c_)
274 push();
275 return Token(start_, internalize(), TK_INSTRUCTION); // returned as TK_INSTRUCTION even when EOF was reached before a ';'
276}
#define DIGITS(num)
#define GUESS(first, SUB)
#define UNDO(CHR)
Definition: Lexer.cpp:6
#define LEX(chr, text, tt, SUB)
#define M_insist(...)
Definition: macro.hpp:129
mutable namespace
Definition: Backend.hpp:10
bool is_oct(int c)
Definition: fn.hpp:588
bool is_hex(int c)
Definition: fn.hpp:590
bool is_dec(int c)
Definition: fn.hpp:589
bool is_alnum(int c)
Definition: fn.hpp:594
and
Definition: enum_ops.hpp:12
bool is_alpha(int c)
Definition: fn.hpp:593
TokenType
Definition: TokenType.hpp:10
std::ostream & e(const Position pos)
Definition: Diagnostic.hpp:41
Token read_string_literal()
Definition: Lexer.cpp:188
Position start_
Definition: Lexer.hpp:29
Token next()
Obtains the next token from the input stream.
Definition: Lexer.cpp:20
Token read_keyword_or_identifier()
Definition: Lexer.cpp:116
void initialize_keywords()
Initializes the set of all keywords.
Definition: Lexer.cpp:13
Token read_number()
Definition: Lexer.cpp:126
ThreadSafePooledString internalize()
Definition: Lexer.hpp:81
int step()
Reads the next character from in to c_, and updates pos_ accordingly.
Definition: Lexer.hpp:54
Position pos_
Definition: Lexer.hpp:29
Diagnostic & diag
Definition: Lexer.hpp:19
ThreadSafeStringPool & pool
Definition: Lexer.hpp:20
bool accept(const int c)
Definition: Lexer.hpp:73
Token read_instruction()
Definition: Lexer.cpp:270
Token read_date_or_datetime()
Definition: Lexer.cpp:229
buf_t buf_
Definition: Lexer.hpp:30
Keywords_t keywords_
Definition: Lexer.hpp:27
void push()
Definition: Lexer.hpp:68