py/lexer: Use null char as lexer EOF sentinel.

dpgeorge · dpgeorge · commit 617c7dba3b71 · 2026-02-04T23:19:09.000+11:00
The null byte cannot exist in source code (per CPython), so use it to
indicate the end of the input stream (instead of `(mp_uint_t)-1`).  This
allows the cache chars (chr0/1/2 and their saved versions) to be 8-bit
bytes, making it clear that they are not `unichar` values.  It also saves a
bit of memory in the `mp_lexer_t` data structure.  (And in a future commit
allows the saved cache chars to be eliminated entirely by storing them in
a vstr instead.)

In order to keep code size down, the frequently used `chr0` is still of
type `uint32_t`.  Having it 32-bit means that machine instructions to load
it are smaller (it adds about +80 bytes to Thumb code if `chr0` is changed
to `uint8_t`).

Also add tests for invalid bytes in the input stream to make sure there are
no regressions in this regard.

Signed-off-by: Damien George &lt;damien@micropython.org&gt;
diff --git a/py/lexer.c b/py/lexer.c
@@ -35,11 +35,8 @@
 #if MICROPY_ENABLE_COMPILER
 
 #define TAB_SIZE (8)
-
-// TODO seems that CPython allows NULL byte in the input stream
-// don't know if that's intentional or not, but we don't allow it
-
-#define MP_LEXER_EOF ((unichar)MP_READER_EOF)
+#define MP_LEXER_EOF ('\0')
+#define MP_LEXER_INVALID_BYTE ('\1')
 #define CUR_CHAR(lex) ((lex)->chr0)
 
 static bool is_end(mp_lexer_t *lex) {
@@ -149,44 +146,60 @@ static void next_char(mp_lexer_t *lex) {
     lex->chr1 = lex->chr2;
 
     // and add the next byte from either the fstring args or the reader
+    mp_uint_t chr2;
+fetch_next_byte:
     #if MICROPY_PY_FSTRINGS
     if (lex->fstring_args_idx) {
         // if there are saved chars, then we're currently injecting fstring args
         if (lex->fstring_args_idx < lex->fstring_args.len) {
-            lex->chr2 = lex->fstring_args.buf[lex->fstring_args_idx++];
+            chr2 = lex->fstring_args.buf[lex->fstring_args_idx++];
         } else {
             // no more fstring arg bytes
-            lex->chr2 = '\0';
+            chr2 = '\0';
         }
 
         if (lex->chr0 == '\0') {
             // consumed all fstring data, restore saved input queue
             lex->chr0 = lex->chr0_saved;
             lex->chr1 = lex->chr1_saved;
-            lex->chr2 = lex->chr2_saved;
+            chr2 = lex->chr2_saved;
             // stop consuming fstring arg data
             vstr_reset(&lex->fstring_args);
             lex->fstring_args_idx = 0;
         }
     } else
     #endif
     {
-        lex->chr2 = lex->reader.readbyte(lex->reader.data);
+        // get next byte from the reader
+        chr2 = lex->reader.readbyte(lex->reader.data);
+
+        // convert stream mp_uint_t value to lexer uint8_t value:
+        // - MP_READER_EOF indicates end-of-stream, for which lexer uses MP_LEXER_EOF
+        // - MP_LEXER_EOF is not allowed in the input stream, as is converted to
+        //   MP_LEXER_INVALID_BYTE so it's not interpreted as end-of-stream
+        // - all other byte values (1 through 255 inclusive) are passed through as-is
+        if (chr2 == MP_READER_EOF) {
+            chr2 = MP_LEXER_EOF;
+        } else if (chr2 == MP_LEXER_EOF) {
+            chr2 = MP_LEXER_INVALID_BYTE;
+        }
     }
 
     if (lex->chr1 == '\r') {
         // CR is a new line, converted to LF
         lex->chr1 = '\n';
-        if (lex->chr2 == '\n') {
+        if (chr2 == '\n') {
             // CR LF is a single new line, throw out the extra LF
-            lex->chr2 = lex->reader.readbyte(lex->reader.data);
+            goto fetch_next_byte;
         }
     }
 
     // check if we need to insert a newline at end of file
-    if (lex->chr2 == MP_LEXER_EOF && lex->chr1 != MP_LEXER_EOF && lex->chr1 != '\n') {
-        lex->chr2 = '\n';
+    if (chr2 == MP_LEXER_EOF && lex->chr1 != MP_LEXER_EOF && lex->chr1 != '\n') {
+        chr2 = '\n';
     }
+
+    lex->chr2 = chr2;
 }
 
 static void indent_push(mp_lexer_t *lex, size_t indent) {
@@ -417,11 +430,9 @@ static void parse_string_literal(mp_lexer_t *lex, bool is_raw, bool is_fstring)
                     vstr_add_char(&lex->vstr, '\\');
                 } else {
                     switch (c) {
-                        // note: "c" can never be MP_LEXER_EOF because next_char
-                        // always inserts a newline at the end of the input stream
                         case '\n':
-                            c = MP_LEXER_EOF;
-                            break;                          // backslash escape the newline, just ignore it
+                            // backslash escape the newline, just ignore it
+                            goto continue_parsing_string_literal;
                         case '\\':
                             break;
                         case '\'':
@@ -492,32 +503,31 @@ static void parse_string_literal(mp_lexer_t *lex, bool is_raw, bool is_fstring)
                             break;
                     }
                 }
-                if (c != MP_LEXER_EOF) {
-                    #if MICROPY_PY_BUILTINS_STR_UNICODE
-                    if (c < 0x110000 && lex->tok_kind == MP_TOKEN_STRING) {
-                        // Valid unicode character in a str object.
-                        vstr_add_char(&lex->vstr, c);
-                    } else if (c < 0x100 && lex->tok_kind == MP_TOKEN_BYTES) {
-                        // Valid byte in a bytes object.
-                        vstr_add_byte(&lex->vstr, c);
-                    }
-                    #else
-                    if (c < 0x100) {
-                        // Without unicode everything is just added as an 8-bit byte.
-                        vstr_add_byte(&lex->vstr, c);
-                    }
-                    #endif
-                    else {
-                        // Character out of range; this raises a generic SyntaxError.
-                        lex->tok_kind = MP_TOKEN_INVALID;
-                    }
+                #if MICROPY_PY_BUILTINS_STR_UNICODE
+                if (c < 0x110000 && lex->tok_kind == MP_TOKEN_STRING) {
+                    // Valid unicode character in a str object.
+                    vstr_add_char(&lex->vstr, c);
+                } else if (c < 0x100 && lex->tok_kind == MP_TOKEN_BYTES) {
+                    // Valid byte in a bytes object.
+                    vstr_add_byte(&lex->vstr, c);
+                }
+                #else
+                if (c < 0x100) {
+                    // Without unicode everything is just added as an 8-bit byte.
+                    vstr_add_byte(&lex->vstr, c);
+                }
+                #endif
+                else {
+                    // Character out of range; this raises a generic SyntaxError.
+                    lex->tok_kind = MP_TOKEN_INVALID;
                 }
             } else {
                 // Add the "character" as a byte so that we remain 8-bit clean.
                 // This way, strings are parsed correctly whether or not they contain utf-8 chars.
                 vstr_add_byte(&lex->vstr, CUR_CHAR(lex));
             }
         }
+    continue_parsing_string_literal:
         next_char(lex);
     }
 
diff --git a/py/lexer.h b/py/lexer.h
@@ -162,9 +162,10 @@ typedef struct _mp_lexer_t {
     qstr source_name;           // name of source
     mp_reader_t reader;         // stream source
 
-    unichar chr0, chr1, chr2;   // current cached characters from source
+    uint32_t chr0;              // first cached byte from source (32-bits for efficient access)
+    uint8_t chr1, chr2;         // subsequent cached bytes from source
     #if MICROPY_PY_FSTRINGS
-    unichar chr0_saved, chr1_saved, chr2_saved; // current cached characters from alt source
+    uint8_t chr0_saved, chr1_saved, chr2_saved; // current cached bytes from alt source
     #endif
 
     size_t line;                // current source line
diff --git a/tests/basics/lexer.py b/tests/basics/lexer.py
@@ -91,3 +91,11 @@ def a(x):
     eval("01")
 except SyntaxError:
     print("SyntaxError")
+
+# Bytes 0-8 inclusive are not allowed in input stream.
+# Earlier CPython (eg 3.10.12) raises ValueError, later CPython (eg 3.11.14) raises SyntaxError.
+for invalid_byte_value in range(0, 10):
+    try:
+        print(eval(b"123" + bytes([invalid_byte_value])))
+    except (ValueError, SyntaxError):
+        print("byte {}: SyntaxError".format(invalid_byte_value))