gh-136595: Normalize surrogate pairs in REPL input to fix UnicodeEncodeError on Windows

vedant713 · web-flow · commit b9c4f77e804f · 2025-07-13T21:24:59.000-04:00
The new REPL implementation (_pyrepl) crashes on Windows when the user inputs Unicode characters outside the Basic Multilingual Plane (≥ U+10000), such as emoji (e.g. 🐍). This happens because the Windows input layer delivers such characters as surrogate pairs (two UTF-16 code units) that _pyrepl processes and tokenizes as individual code units, so each half is treated as an unpaired surrogate. This commit introduces a module-level `normalize_surrogates()` helper in `Lib/_pyrepl/reader.py` that merges surrogate pairs into single code points by encoding to UTF-16 with 'surrogatepass' and decoding back. The `get_unicode()` method of `Reader` is patched to use this normalization so that any code consuming REPL input (e.g. syntax highlighting via tokenize) receives valid Unicode text. This resolves UnicodeEncodeError crashes in the REPL when typing emoji or other non-BMP characters on Windows. Fixes #136595
diff --git a/Lib/_pyrepl/reader.py b/Lib/_pyrepl/reader.py
@@ -40,6 +40,12 @@
 # syntax classes
 SYNTAX_WHITESPACE, SYNTAX_WORD, SYNTAX_SYMBOL = range(3)
 
+def normalize_surrogates(s: str) -> str:
+    """Merge UTF-16 surrogate pairs in *s*, keeping lone surrogates as-is."""
+    # 'surrogatepass' is needed on the decode leg as well: otherwise an
+    # unpaired surrogate raises UnicodeDecodeError instead of round-tripping.
+    data = s.encode('utf-16-le', 'surrogatepass')
+    return data.decode('utf-16-le', 'surrogatepass')
 
 def make_default_syntax_table() -> dict[str, int]:
     # XXX perhaps should use some unicodedata here?
@@ -759,4 +765,5 @@ def bind(self, spec: KeySpec, command: CommandName) -> None:
 
     def get_unicode(self) -> str:
         """Return the current buffer as a unicode string."""
-        return "".join(self.buffer)
+        text = "".join(self.buffer)  # items may be UTF-16 surrogate halves
+        return normalize_surrogates(text)