Skip to content

Commit 4626d3f

Browse files
committed
pythongh-149214: Fix non-ASCII strings in remote debugging
1 parent fea2a57 commit 4626d3f

5 files changed

Lines changed: 160 additions & 27 deletions

File tree

Include/internal/pycore_debug_offsets.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -215,6 +215,7 @@ typedef struct _Py_DebugOffsets {
215215
uint64_t state;
216216
uint64_t length;
217217
uint64_t asciiobject_size;
218+
uint64_t compactunicodeobject_size;
218219
} unicode_object;
219220

220221
// GC runtime state offset;
@@ -370,6 +371,7 @@ typedef struct _Py_DebugOffsets {
370371
.state = offsetof(PyUnicodeObject, _base._base.state), \
371372
.length = offsetof(PyUnicodeObject, _base._base.length), \
372373
.asciiobject_size = sizeof(PyASCIIObject), \
374+
.compactunicodeobject_size = sizeof(PyCompactUnicodeObject), \
373375
}, \
374376
.gc = { \
375377
.size = sizeof(struct _gc_runtime_state), \

Lib/test/test_external_inspection.py

Lines changed: 69 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -559,6 +559,75 @@ def test_self_trace_after_ctypes_import(self):
559559
f"stdout: {result.stdout}\nstderr: {result.stderr}"
560560
)
561561

562+
@skip_if_not_supported
563+
@unittest.skipIf(
564+
sys.platform == "linux" and not PROCESS_VM_READV_SUPPORTED,
565+
"Test only runs on Linux with process_vm_readv support",
566+
)
567+
def test_remote_stack_trace_non_ascii_names(self):
568+
# Exercise each PyUnicode kind (1-byte non-ASCII, 2-byte BMP,
569+
# 4-byte non-BMP) for both the filename and the function name
570+
# reported in the stack trace.
571+
latin1 = "zażółć" # 1-byte non-ASCII (forces non-ASCII path)
572+
bmp = "λάμβδα" # 2-byte BMP
573+
astral = "𐌀𐌁𐌂𐌃" # 4-byte non-BMP (Old Italic; XID, no NFKC fold)
574+
func_name = f"{latin1}_{bmp}_{astral}"
575+
script_basename = f"mod_{latin1}_{bmp}_{astral}"
576+
577+
port = find_unused_port()
578+
script = textwrap.dedent(
579+
f"""\
580+
import socket
581+
import time
582+
583+
sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
584+
sock.connect(('localhost', {port}))
585+
586+
def {func_name}():
587+
sock.sendall(b"ready")
588+
time.sleep(10_000)
589+
590+
{func_name}()
591+
"""
592+
)
593+
with os_helper.temp_dir() as work_dir:
594+
script_dir = os.path.join(work_dir, "script_pkg")
595+
os.mkdir(script_dir)
596+
script_name = _make_test_script(script_dir, script_basename, script)
597+
598+
server_socket = _create_server_socket(port)
599+
client_socket = None
600+
try:
601+
p = subprocess.Popen([sys.executable, script_name])
602+
client_socket, _ = server_socket.accept()
603+
server_socket.close()
604+
_wait_for_signal(client_socket, b"ready")
605+
606+
stack_trace = get_stack_trace(p.pid)
607+
except PermissionError:
608+
self.skipTest("Insufficient permissions to read the stack trace")
609+
finally:
610+
if client_socket is not None:
611+
client_socket.close()
612+
p.kill()
613+
p.wait(timeout=SHORT_TIMEOUT)
614+
615+
frames = [
616+
frame
617+
for interp in stack_trace
618+
for thread in interp.threads
619+
for frame in thread.frame_info
620+
]
621+
target = next(
622+
(f for f in frames if f.funcname == func_name), None
623+
)
624+
self.assertIsNotNone(
625+
target,
626+
f"Frame for {func_name!r} missing; got "
627+
f"{[(f.filename, f.funcname) for f in frames]}",
628+
)
629+
self.assertEqual(target.filename, script_name)
630+
562631
@skip_if_not_supported
563632
@unittest.skipIf(
564633
sys.platform == "linux" and not PROCESS_VM_READV_SUPPORTED,
Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
Fix :mod:`!_remote_debugging` misreading non-ASCII Unicode strings (Latin-1,
2+
BMP and non-BMP) from a remote process. Filenames and function names that
3+
contain non-ASCII characters are now reported correctly in stack traces, the
4+
sampling profiler, and :mod:`asyncio` task introspection.

Modules/_remote_debugging/debug_offsets_validation.h

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,7 @@
3131
#define FIELD_SIZE(type, member) sizeof(((type *)0)->member)
3232

3333
enum {
34-
PY_REMOTE_DEBUG_OFFSETS_TOTAL_SIZE = 840,
34+
PY_REMOTE_DEBUG_OFFSETS_TOTAL_SIZE = 848,
3535
PY_REMOTE_ASYNC_DEBUG_OFFSETS_TOTAL_SIZE = 104,
3636
};
3737

@@ -304,7 +304,9 @@ validate_fixed_field(
304304

305305
#define PY_REMOTE_DEBUG_UNICODE_OBJECT_FIELDS(APPLY, buffer_size) \
306306
APPLY(unicode_object, length, sizeof(Py_ssize_t), _Alignof(Py_ssize_t), buffer_size); \
307-
APPLY(unicode_object, asciiobject_size, sizeof(char), _Alignof(char), buffer_size)
307+
APPLY(unicode_object, state, sizeof(struct _PyUnicodeObject_state), _Alignof(struct _PyUnicodeObject_state), buffer_size); \
308+
APPLY(unicode_object, asciiobject_size, sizeof(char), _Alignof(char), buffer_size); \
309+
APPLY(unicode_object, compactunicodeobject_size, sizeof(char), _Alignof(char), buffer_size)
308310

309311
#define PY_REMOTE_DEBUG_GEN_OBJECT_FIELDS(APPLY, buffer_size) \
310312
APPLY(gen_object, gi_frame_state, sizeof(int8_t), _Alignof(int8_t), buffer_size); \

Modules/_remote_debugging/object_reading.c

Lines changed: 81 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -48,10 +48,8 @@ read_py_str(
4848
uintptr_t address,
4949
Py_ssize_t max_len
5050
) {
51-
PyObject *result = NULL;
52-
char *buf = NULL;
53-
54-
// Read the entire PyUnicodeObject at once
51+
// Read the entire PyUnicodeObject at once; for short strings the data
52+
// is inline right after the header and we'll already have (some of) it.
5553
char unicode_obj[SIZEOF_UNICODE_OBJ];
5654
int res = _Py_RemoteDebug_PagedReadRemoteMemory(
5755
&unwinder->handle,
@@ -61,7 +59,7 @@ read_py_str(
6159
);
6260
if (res < 0) {
6361
set_exception_cause(unwinder, PyExc_RuntimeError, "Failed to read PyUnicodeObject");
64-
goto err;
62+
return NULL;
6563
}
6664

6765
Py_ssize_t len = GET_MEMBER(Py_ssize_t, unicode_obj, unwinder->debug_offsets.unicode_object.length);
@@ -72,36 +70,94 @@ read_py_str(
7270
return NULL;
7371
}
7472

75-
buf = (char *)PyMem_RawMalloc(len+1);
76-
if (buf == NULL) {
77-
PyErr_NoMemory();
78-
set_exception_cause(unwinder, PyExc_MemoryError, "Failed to allocate buffer for string reading");
73+
// Inspect state to pick the right data offset and character width.
74+
// We rely on the remote process sharing this Python version's
75+
// PyASCIIObject layout, the same assumption already used for `length`.
76+
struct _PyUnicodeObject_state state = GET_MEMBER(
77+
struct _PyUnicodeObject_state,
78+
unicode_obj,
79+
unwinder->debug_offsets.unicode_object.state);
80+
81+
if (!state.compact) {
82+
PyErr_Format(PyExc_RuntimeError,
83+
"Cannot read non-compact Unicode object at 0x%lx", address);
84+
set_exception_cause(unwinder, PyExc_RuntimeError,
85+
"Legacy (non-compact) Unicode objects are not supported");
7986
return NULL;
8087
}
8188

82-
size_t offset = (size_t)unwinder->debug_offsets.unicode_object.asciiobject_size;
83-
res = _Py_RemoteDebug_PagedReadRemoteMemory(&unwinder->handle, address + offset, len, buf);
84-
if (res < 0) {
85-
set_exception_cause(unwinder, PyExc_RuntimeError, "Failed to read string data from remote memory");
86-
goto err;
89+
int kind = (int)state.kind;
90+
Py_UCS4 max_char;
91+
switch (kind) {
92+
case PyUnicode_1BYTE_KIND:
93+
max_char = state.ascii ? 0x7F : 0xFF;
94+
break;
95+
case PyUnicode_2BYTE_KIND:
96+
max_char = 0xFFFF;
97+
break;
98+
case PyUnicode_4BYTE_KIND:
99+
max_char = 0x10FFFF;
100+
break;
101+
default:
102+
PyErr_Format(PyExc_RuntimeError,
103+
"Invalid Unicode kind %d at 0x%lx", kind, address);
104+
set_exception_cause(unwinder, PyExc_RuntimeError,
105+
"Invalid kind in remote Unicode object");
106+
return NULL;
87107
}
88-
buf[len] = '\0';
89108

90-
result = PyUnicode_FromStringAndSize(buf, len);
109+
size_t header_size = state.ascii
110+
? (size_t)unwinder->debug_offsets.unicode_object.asciiobject_size
111+
: (size_t)unwinder->debug_offsets.unicode_object.compactunicodeobject_size;
112+
113+
// len * kind is bounded by max_len * 4 (kind <= 4, len <= max_len), so
114+
// the multiplication can't overflow for any caller-sane max_len, but the
115+
// explicit cap here keeps a corrupted remote `length` from later turning
116+
// into a giant allocation.
117+
size_t nbytes = (size_t)len * (size_t)kind;
118+
if ((size_t)len > (SIZE_MAX / 4) || nbytes > (size_t)max_len * 4) {
119+
PyErr_Format(PyExc_RuntimeError,
120+
"Implausible Unicode byte size %zu at 0x%lx", nbytes, address);
121+
set_exception_cause(unwinder, PyExc_RuntimeError,
122+
"Garbage byte size in remote Unicode object");
123+
return NULL;
124+
}
125+
126+
PyObject *result = PyUnicode_New(len, max_char);
91127
if (result == NULL) {
92-
set_exception_cause(unwinder, PyExc_RuntimeError, "Failed to create PyUnicode from remote string data");
93-
goto err;
128+
set_exception_cause(unwinder, PyExc_RuntimeError, "Failed to allocate PyUnicode for remote string");
129+
return NULL;
130+
}
131+
if (nbytes == 0) {
132+
return result;
94133
}
95134

96-
PyMem_RawFree(buf);
97-
assert(result != NULL);
98-
return result;
135+
void *data = PyUnicode_DATA(result);
99136

100-
err:
101-
if (buf != NULL) {
102-
PyMem_RawFree(buf);
137+
// Reuse data already present in the header read; only round-trip for
138+
// whatever spills past it.
139+
size_t inline_avail = (header_size < SIZEOF_UNICODE_OBJ)
140+
? SIZEOF_UNICODE_OBJ - header_size
141+
: 0;
142+
size_t inline_bytes = nbytes < inline_avail ? nbytes : inline_avail;
143+
if (inline_bytes > 0) {
144+
memcpy(data, unicode_obj + header_size, inline_bytes);
103145
}
104-
return NULL;
146+
147+
if (nbytes > inline_bytes) {
148+
res = _Py_RemoteDebug_PagedReadRemoteMemory(
149+
&unwinder->handle,
150+
address + header_size + inline_bytes,
151+
nbytes - inline_bytes,
152+
(char *)data + inline_bytes);
153+
if (res < 0) {
154+
Py_DECREF(result);
155+
set_exception_cause(unwinder, PyExc_RuntimeError, "Failed to read string data from remote memory");
156+
return NULL;
157+
}
158+
}
159+
160+
return result;
105161
}
106162

107163
PyObject *

0 commit comments

Comments
 (0)