|
35 | 35 | #if MICROPY_ENABLE_COMPILER |
36 | 36 |
|
37 | 37 | #define TAB_SIZE (8) |
38 | | - |
39 | | -// TODO seems that CPython allows NULL byte in the input stream |
40 | | -// don't know if that's intentional or not, but we don't allow it |
41 | | - |
42 | | -#define MP_LEXER_EOF ((unichar)MP_READER_EOF) |
| 38 | +#define MP_LEXER_EOF ('\0') |
| 39 | +#define MP_LEXER_INVALID_BYTE ('\1') |
43 | 40 | #define CUR_CHAR(lex) ((lex)->chr0) |
44 | 41 |
|
45 | 42 | static bool is_end(mp_lexer_t *lex) { |
@@ -149,44 +146,60 @@ static void next_char(mp_lexer_t *lex) { |
149 | 146 | lex->chr1 = lex->chr2; |
150 | 147 |
|
151 | 148 | // and add the next byte from either the fstring args or the reader |
| 149 | + mp_uint_t chr2; |
| 150 | +fetch_next_byte: |
152 | 151 | #if MICROPY_PY_FSTRINGS |
153 | 152 | if (lex->fstring_args_idx) { |
154 | 153 | // if there are saved chars, then we're currently injecting fstring args |
155 | 154 | if (lex->fstring_args_idx < lex->fstring_args.len) { |
156 | | - lex->chr2 = lex->fstring_args.buf[lex->fstring_args_idx++]; |
| 155 | + chr2 = lex->fstring_args.buf[lex->fstring_args_idx++]; |
157 | 156 | } else { |
158 | 157 | // no more fstring arg bytes |
159 | | - lex->chr2 = '\0'; |
| 158 | + chr2 = '\0'; |
160 | 159 | } |
161 | 160 |
|
162 | 161 | if (lex->chr0 == '\0') { |
163 | 162 | // consumed all fstring data, restore saved input queue |
164 | 163 | lex->chr0 = lex->chr0_saved; |
165 | 164 | lex->chr1 = lex->chr1_saved; |
166 | | - lex->chr2 = lex->chr2_saved; |
| 165 | + chr2 = lex->chr2_saved; |
167 | 166 | // stop consuming fstring arg data |
168 | 167 | vstr_reset(&lex->fstring_args); |
169 | 168 | lex->fstring_args_idx = 0; |
170 | 169 | } |
171 | 170 | } else |
172 | 171 | #endif |
173 | 172 | { |
174 | | - lex->chr2 = lex->reader.readbyte(lex->reader.data); |
| 173 | + // get next byte from the reader |
| 174 | + chr2 = lex->reader.readbyte(lex->reader.data); |
| 175 | + |
| 176 | + // convert stream mp_uint_t value to lexer uint8_t value: |
| 177 | + // - MP_READER_EOF indicates end-of-stream, for which the lexer uses MP_LEXER_EOF |
| 178 | + // - MP_LEXER_EOF is not allowed in the input stream, and is converted to |
| 179 | + // MP_LEXER_INVALID_BYTE so it's not interpreted as end-of-stream |
| 180 | + // - all other byte values (1 through 255 inclusive) are passed through as-is |
| 181 | + if (chr2 == MP_READER_EOF) { |
| 182 | + chr2 = MP_LEXER_EOF; |
| 183 | + } else if (chr2 == MP_LEXER_EOF) { |
| 184 | + chr2 = MP_LEXER_INVALID_BYTE; |
| 185 | + } |
175 | 186 | } |
176 | 187 |
|
177 | 188 | if (lex->chr1 == '\r') { |
178 | 189 | // CR is a new line, converted to LF |
179 | 190 | lex->chr1 = '\n'; |
180 | | - if (lex->chr2 == '\n') { |
| 191 | + if (chr2 == '\n') { |
181 | 192 | // CR LF is a single new line, throw out the extra LF |
182 | | - lex->chr2 = lex->reader.readbyte(lex->reader.data); |
| 193 | + goto fetch_next_byte; |
183 | 194 | } |
184 | 195 | } |
185 | 196 |
|
186 | 197 | // check if we need to insert a newline at end of file |
187 | | - if (lex->chr2 == MP_LEXER_EOF && lex->chr1 != MP_LEXER_EOF && lex->chr1 != '\n') { |
188 | | - lex->chr2 = '\n'; |
| 198 | + if (chr2 == MP_LEXER_EOF && lex->chr1 != MP_LEXER_EOF && lex->chr1 != '\n') { |
| 199 | + chr2 = '\n'; |
189 | 200 | } |
| 201 | + |
| 202 | + lex->chr2 = chr2; |
190 | 203 | } |
191 | 204 |
|
192 | 205 | static void indent_push(mp_lexer_t *lex, size_t indent) { |
@@ -417,11 +430,9 @@ static void parse_string_literal(mp_lexer_t *lex, bool is_raw, bool is_fstring) |
417 | 430 | vstr_add_char(&lex->vstr, '\\'); |
418 | 431 | } else { |
419 | 432 | switch (c) { |
420 | | - // note: "c" can never be MP_LEXER_EOF because next_char |
421 | | - // always inserts a newline at the end of the input stream |
422 | 433 | case '\n': |
423 | | - c = MP_LEXER_EOF; |
424 | | - break; // backslash escape the newline, just ignore it |
| 434 | + // backslash escape the newline, just ignore it |
| 435 | + goto continue_parsing_string_literal; |
425 | 436 | case '\\': |
426 | 437 | break; |
427 | 438 | case '\'': |
@@ -492,32 +503,31 @@ static void parse_string_literal(mp_lexer_t *lex, bool is_raw, bool is_fstring) |
492 | 503 | break; |
493 | 504 | } |
494 | 505 | } |
495 | | - if (c != MP_LEXER_EOF) { |
496 | | - #if MICROPY_PY_BUILTINS_STR_UNICODE |
497 | | - if (c < 0x110000 && lex->tok_kind == MP_TOKEN_STRING) { |
498 | | - // Valid unicode character in a str object. |
499 | | - vstr_add_char(&lex->vstr, c); |
500 | | - } else if (c < 0x100 && lex->tok_kind == MP_TOKEN_BYTES) { |
501 | | - // Valid byte in a bytes object. |
502 | | - vstr_add_byte(&lex->vstr, c); |
503 | | - } |
504 | | - #else |
505 | | - if (c < 0x100) { |
506 | | - // Without unicode everything is just added as an 8-bit byte. |
507 | | - vstr_add_byte(&lex->vstr, c); |
508 | | - } |
509 | | - #endif |
510 | | - else { |
511 | | - // Character out of range; this raises a generic SyntaxError. |
512 | | - lex->tok_kind = MP_TOKEN_INVALID; |
513 | | - } |
| 506 | + #if MICROPY_PY_BUILTINS_STR_UNICODE |
| 507 | + if (c < 0x110000 && lex->tok_kind == MP_TOKEN_STRING) { |
| 508 | + // Valid unicode character in a str object. |
| 509 | + vstr_add_char(&lex->vstr, c); |
| 510 | + } else if (c < 0x100 && lex->tok_kind == MP_TOKEN_BYTES) { |
| 511 | + // Valid byte in a bytes object. |
| 512 | + vstr_add_byte(&lex->vstr, c); |
| 513 | + } |
| 514 | + #else |
| 515 | + if (c < 0x100) { |
| 516 | + // Without unicode everything is just added as an 8-bit byte. |
| 517 | + vstr_add_byte(&lex->vstr, c); |
| 518 | + } |
| 519 | + #endif |
| 520 | + else { |
| 521 | + // Character out of range; this raises a generic SyntaxError. |
| 522 | + lex->tok_kind = MP_TOKEN_INVALID; |
514 | 523 | } |
515 | 524 | } else { |
516 | 525 | // Add the "character" as a byte so that we remain 8-bit clean. |
517 | 526 | // This way, strings are parsed correctly whether or not they contain utf-8 chars. |
518 | 527 | vstr_add_byte(&lex->vstr, CUR_CHAR(lex)); |
519 | 528 | } |
520 | 529 | } |
| 530 | + continue_parsing_string_literal: |
521 | 531 | next_char(lex); |
522 | 532 | } |
523 | 533 |
|
|
0 commit comments