Skip to content

Commit 617c7db

Browse files
committed
py/lexer: Use null char as lexer EOF sentinel.
The null byte cannot exist in source code (per CPython), so use it to indicate the end of the input stream (instead of `(mp_uint_t)-1`). This allows the cache chars (chr0/1/2 and their saved versions) to be 8-bit bytes, making it clear that they are not `unichar` values. It also saves a bit of memory in the `mp_lexer_t` data structure. (In a future commit, it also allows the saved cache chars to be eliminated entirely by storing them in a vstr instead.) In order to keep code size down, the frequently used `chr0` is still of type `uint32_t`. Having it 32-bit means that machine instructions to load it are smaller (it adds about 80 bytes to Thumb code if `chr0` is changed to `uint8_t`). Also add tests for invalid bytes in the input stream to make sure there are no regressions in this regard. Signed-off-by: Damien George <damien@micropython.org>
1 parent bfb5e77 commit 617c7db

3 files changed

Lines changed: 57 additions & 38 deletions

File tree

py/lexer.c

Lines changed: 46 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -35,11 +35,8 @@
3535
#if MICROPY_ENABLE_COMPILER
3636

3737
#define TAB_SIZE (8)
38-
39-
// TODO seems that CPython allows NULL byte in the input stream
40-
// don't know if that's intentional or not, but we don't allow it
41-
42-
#define MP_LEXER_EOF ((unichar)MP_READER_EOF)
38+
#define MP_LEXER_EOF ('\0')
39+
#define MP_LEXER_INVALID_BYTE ('\1')
4340
#define CUR_CHAR(lex) ((lex)->chr0)
4441

4542
static bool is_end(mp_lexer_t *lex) {
@@ -149,44 +146,60 @@ static void next_char(mp_lexer_t *lex) {
149146
lex->chr1 = lex->chr2;
150147

151148
// and add the next byte from either the fstring args or the reader
149+
mp_uint_t chr2;
150+
fetch_next_byte:
152151
#if MICROPY_PY_FSTRINGS
153152
if (lex->fstring_args_idx) {
154153
// if there are saved chars, then we're currently injecting fstring args
155154
if (lex->fstring_args_idx < lex->fstring_args.len) {
156-
lex->chr2 = lex->fstring_args.buf[lex->fstring_args_idx++];
155+
chr2 = lex->fstring_args.buf[lex->fstring_args_idx++];
157156
} else {
158157
// no more fstring arg bytes
159-
lex->chr2 = '\0';
158+
chr2 = '\0';
160159
}
161160

162161
if (lex->chr0 == '\0') {
163162
// consumed all fstring data, restore saved input queue
164163
lex->chr0 = lex->chr0_saved;
165164
lex->chr1 = lex->chr1_saved;
166-
lex->chr2 = lex->chr2_saved;
165+
chr2 = lex->chr2_saved;
167166
// stop consuming fstring arg data
168167
vstr_reset(&lex->fstring_args);
169168
lex->fstring_args_idx = 0;
170169
}
171170
} else
172171
#endif
173172
{
174-
lex->chr2 = lex->reader.readbyte(lex->reader.data);
173+
// get next byte from the reader
174+
chr2 = lex->reader.readbyte(lex->reader.data);
175+
176+
// convert stream mp_uint_t value to lexer uint8_t value:
177+
// - MP_READER_EOF indicates end-of-stream, for which lexer uses MP_LEXER_EOF
178+
// - MP_LEXER_EOF is not allowed in the input stream, and is converted to
179+
// MP_LEXER_INVALID_BYTE so it's not interpreted as end-of-stream
180+
// - all other byte values (1 through 255 inclusive) are passed through as-is
181+
if (chr2 == MP_READER_EOF) {
182+
chr2 = MP_LEXER_EOF;
183+
} else if (chr2 == MP_LEXER_EOF) {
184+
chr2 = MP_LEXER_INVALID_BYTE;
185+
}
175186
}
176187

177188
if (lex->chr1 == '\r') {
178189
// CR is a new line, converted to LF
179190
lex->chr1 = '\n';
180-
if (lex->chr2 == '\n') {
191+
if (chr2 == '\n') {
181192
// CR LF is a single new line, throw out the extra LF
182-
lex->chr2 = lex->reader.readbyte(lex->reader.data);
193+
goto fetch_next_byte;
183194
}
184195
}
185196

186197
// check if we need to insert a newline at end of file
187-
if (lex->chr2 == MP_LEXER_EOF && lex->chr1 != MP_LEXER_EOF && lex->chr1 != '\n') {
188-
lex->chr2 = '\n';
198+
if (chr2 == MP_LEXER_EOF && lex->chr1 != MP_LEXER_EOF && lex->chr1 != '\n') {
199+
chr2 = '\n';
189200
}
201+
202+
lex->chr2 = chr2;
190203
}
191204

192205
static void indent_push(mp_lexer_t *lex, size_t indent) {
@@ -417,11 +430,9 @@ static void parse_string_literal(mp_lexer_t *lex, bool is_raw, bool is_fstring)
417430
vstr_add_char(&lex->vstr, '\\');
418431
} else {
419432
switch (c) {
420-
// note: "c" can never be MP_LEXER_EOF because next_char
421-
// always inserts a newline at the end of the input stream
422433
case '\n':
423-
c = MP_LEXER_EOF;
424-
break; // backslash escape the newline, just ignore it
434+
// backslash escape the newline, just ignore it
435+
goto continue_parsing_string_literal;
425436
case '\\':
426437
break;
427438
case '\'':
@@ -492,32 +503,31 @@ static void parse_string_literal(mp_lexer_t *lex, bool is_raw, bool is_fstring)
492503
break;
493504
}
494505
}
495-
if (c != MP_LEXER_EOF) {
496-
#if MICROPY_PY_BUILTINS_STR_UNICODE
497-
if (c < 0x110000 && lex->tok_kind == MP_TOKEN_STRING) {
498-
// Valid unicode character in a str object.
499-
vstr_add_char(&lex->vstr, c);
500-
} else if (c < 0x100 && lex->tok_kind == MP_TOKEN_BYTES) {
501-
// Valid byte in a bytes object.
502-
vstr_add_byte(&lex->vstr, c);
503-
}
504-
#else
505-
if (c < 0x100) {
506-
// Without unicode everything is just added as an 8-bit byte.
507-
vstr_add_byte(&lex->vstr, c);
508-
}
509-
#endif
510-
else {
511-
// Character out of range; this raises a generic SyntaxError.
512-
lex->tok_kind = MP_TOKEN_INVALID;
513-
}
506+
#if MICROPY_PY_BUILTINS_STR_UNICODE
507+
if (c < 0x110000 && lex->tok_kind == MP_TOKEN_STRING) {
508+
// Valid unicode character in a str object.
509+
vstr_add_char(&lex->vstr, c);
510+
} else if (c < 0x100 && lex->tok_kind == MP_TOKEN_BYTES) {
511+
// Valid byte in a bytes object.
512+
vstr_add_byte(&lex->vstr, c);
513+
}
514+
#else
515+
if (c < 0x100) {
516+
// Without unicode everything is just added as an 8-bit byte.
517+
vstr_add_byte(&lex->vstr, c);
518+
}
519+
#endif
520+
else {
521+
// Character out of range; this raises a generic SyntaxError.
522+
lex->tok_kind = MP_TOKEN_INVALID;
514523
}
515524
} else {
516525
// Add the "character" as a byte so that we remain 8-bit clean.
517526
// This way, strings are parsed correctly whether or not they contain utf-8 chars.
518527
vstr_add_byte(&lex->vstr, CUR_CHAR(lex));
519528
}
520529
}
530+
continue_parsing_string_literal:
521531
next_char(lex);
522532
}
523533

py/lexer.h

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -162,9 +162,10 @@ typedef struct _mp_lexer_t {
162162
qstr source_name; // name of source
163163
mp_reader_t reader; // stream source
164164

165-
unichar chr0, chr1, chr2; // current cached characters from source
165+
uint32_t chr0; // first cached byte from source (32-bits for efficient access)
166+
uint8_t chr1, chr2; // subsequent cached bytes from source
166167
#if MICROPY_PY_FSTRINGS
167-
unichar chr0_saved, chr1_saved, chr2_saved; // current cached characters from alt source
168+
uint8_t chr0_saved, chr1_saved, chr2_saved; // current cached bytes from alt source
168169
#endif
169170

170171
size_t line; // current source line

tests/basics/lexer.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -91,3 +91,11 @@ def a(x):
9191
eval("01")
9292
except SyntaxError:
9393
print("SyntaxError")
94+
95+
# Bytes 0-8 inclusive are not allowed in input stream.
96+
# Earlier CPython (eg 3.10.12) raises ValueError, later CPython (eg 3.11.14) raises SyntaxError.
97+
for invalid_byte_value in range(0, 10):
98+
try:
99+
print(eval(b"123" + bytes([invalid_byte_value])))
100+
except (ValueError, SyntaxError):
101+
print("byte {}: SyntaxError".format(invalid_byte_value))

0 commit comments

Comments
 (0)