Skip to content

Commit 8287460

Browse files
committed
gh-139353: Add Objects/unicode_codecs_utf.c file
Rename functions:

* ascii_decode() => _PyUnicode_DecodeASCII()
* backslashreplace() => _PyUnicode_backslashreplace()
* raise_encode_exception() => _PyUnicode_RaiseEncodeException()
* unicode_decode_call_errorhandler_writer() => _PyUnicode_DecodeCallErrorHandler()
* unicode_decode_utf8() => _PyUnicode_DecodeUTF8()
* unicode_encode_call_errorhandler() => _PyUnicode_EncodeCallErrorHandler()
* unicode_encode_utf8() => _PyUnicode_EncodeUTF8()
* xmlcharrefreplace() => _PyUnicode_xmlcharrefreplace()

Move static inline functions and macros to pycore_unicodeobject.h:

* _PyUnicode_CHECK()
* _PyUnicode_UTF8()
* PyUnicode_UTF8()
* PyUnicode_SET_UTF8()
* PyUnicode_UTF8_LENGTH()
* PyUnicode_SET_UTF8_LENGTH()
1 parent 748c4b4 commit 8287460

9 files changed

Lines changed: 2391 additions & 2269 deletions

File tree

Include/internal/pycore_unicodeobject.h

Lines changed: 105 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@ extern "C" {
99
#endif
1010

1111
#include "pycore_fileutils.h" // _Py_error_handler
12+
#include "pycore_pyatomic_ft_wrappers.h" // FT_ATOMIC_STORE_PTR_RELEASE()
1213
#include "pycore_ucnhash.h" // _PyUnicode_Name_CAPI
1314

1415

@@ -32,6 +33,110 @@ extern PyObject* _PyUnicode_ResizeCompact(
3233
PyObject *unicode,
3334
Py_ssize_t length);
3435
extern PyObject* _PyUnicode_GetEmpty(void);
36+
extern int _PyUnicode_DecodeCallErrorHandler(
37+
const char *errors,
38+
PyObject **errorHandler,
39+
const char *encoding,
40+
const char *reason,
41+
const char **input,
42+
const char **inend,
43+
Py_ssize_t *startinpos,
44+
Py_ssize_t *endinpos,
45+
PyObject **exceptionObject,
46+
const char **inptr,
47+
_PyUnicodeWriter *writer);
48+
extern char* _PyUnicode_backslashreplace(
49+
PyBytesWriter *writer,
50+
char *str,
51+
PyObject *unicode,
52+
Py_ssize_t collstart,
53+
Py_ssize_t collend);
54+
extern char* _PyUnicode_xmlcharrefreplace(
55+
PyBytesWriter *writer,
56+
char *str,
57+
PyObject *unicode,
58+
Py_ssize_t collstart,
59+
Py_ssize_t collend);
60+
extern PyObject* _PyUnicode_EncodeCallErrorHandler(
61+
const char *errors,
62+
PyObject **errorHandler,
63+
const char *encoding,
64+
const char *reason,
65+
PyObject *unicode,
66+
PyObject **exceptionObject,
67+
Py_ssize_t startpos,
68+
Py_ssize_t endpos,
69+
Py_ssize_t *newpos);
70+
extern void _PyUnicode_RaiseEncodeException(
71+
PyObject **exceptionObject,
72+
const char *encoding,
73+
PyObject *unicode,
74+
Py_ssize_t startpos,
75+
Py_ssize_t endpos,
76+
const char *reason);
77+
extern Py_ssize_t _PyUnicode_DecodeASCII(
78+
const char *start,
79+
const char *end,
80+
Py_UCS1 *dest);
81+
extern PyObject* _PyUnicode_EncodeUTF8(
82+
PyObject *unicode,
83+
_Py_error_handler error_handler,
84+
const char *errors);
85+
extern PyObject* _PyUnicode_DecodeUTF8(
86+
const char *s,
87+
Py_ssize_t size,
88+
_Py_error_handler error_handler,
89+
const char *errors,
90+
Py_ssize_t *consumed);
91+
92+
// Export for '_json' shared extension
93+
PyAPI_FUNC(int) _PyUnicode_CheckConsistency(
94+
PyObject *op,
95+
int check_content);
96+
97+
98+
// _PyUnicode_CHECK(op): check used by the assert() calls in the inline
// helpers of this header.
// With Py_DEBUG defined it expands to _PyUnicode_CheckConsistency(op, 0),
// i.e. the call is made with check_content == 0; otherwise it expands to
// PyUnicode_Check(op).
#ifdef Py_DEBUG
99+
# define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op, 0)
100+
#else
101+
# define _PyUnicode_CHECK(op) PyUnicode_Check(op)
102+
#endif
103+
104+
// Return the `utf8` member of `op` viewed as a compact unicode object
// (via _PyCompactUnicodeObject_CAST). The member is read through
// FT_ATOMIC_LOAD_PTR_ACQUIRE rather than with a plain load.
// No type check or assertion is performed on `op` here.
static inline char* _PyUnicode_UTF8(PyObject *op)
105+
{
106+
return FT_ATOMIC_LOAD_PTR_ACQUIRE(_PyCompactUnicodeObject_CAST(op)->utf8);
107+
}
108+
109+
// Return a char pointer to the UTF-8 data associated with `op`.
//
// - If PyUnicode_IS_COMPACT_ASCII(op) is true, the result is the address
//   immediately following the PyASCIIObject structure of `op`.
// - Otherwise the result is whatever _PyUnicode_UTF8(op) returns, i.e. the
//   value of the `utf8` member.
//   NOTE(review): presumably this may be NULL when no UTF-8 form has been
//   stored yet -- confirm against the callers.
//
// `op` is validated with _PyUnicode_CHECK() inside assert() only.
static inline char* PyUnicode_UTF8(PyObject *op)
110+
{
111+
assert(_PyUnicode_CHECK(op));
112+
if (PyUnicode_IS_COMPACT_ASCII(op)) {
113+
// Pointer arithmetic on the PyASCIIObject pointer: "+ 1" skips one whole struct.
return ((char*)(_PyASCIIObject_CAST(op) + 1));
114+
}
115+
else {
116+
return _PyUnicode_UTF8(op);
117+
}
118+
}
119+
120+
// Return the length value that pairs with PyUnicode_UTF8(op).
//
// - If PyUnicode_IS_COMPACT_ASCII(op) is true, the `length` member of the
//   PyASCIIObject is returned.
// - Otherwise the `utf8_length` member of the compact unicode object is
//   returned (plain, non-atomic read).
//
// `op` is validated with _PyUnicode_CHECK() inside assert() only.
static inline Py_ssize_t PyUnicode_UTF8_LENGTH(PyObject *op)
121+
{
122+
assert(_PyUnicode_CHECK(op));
123+
if (PyUnicode_IS_COMPACT_ASCII(op)) {
124+
return _PyASCIIObject_CAST(op)->length;
125+
}
126+
else {
127+
return _PyCompactUnicodeObject_CAST(op)->utf8_length;
128+
}
129+
}
130+
131+
// Store `utf8` into the `utf8` member of `op` viewed as a compact unicode
// object. The store goes through FT_ATOMIC_STORE_PTR_RELEASE, the
// counterpart of the FT_ATOMIC_LOAD_PTR_ACQUIRE read in _PyUnicode_UTF8().
// No type check or assertion is performed on `op`, and the previous value
// of the member is not inspected or released here.
static inline void PyUnicode_SET_UTF8(PyObject *op, char *utf8)
132+
{
133+
FT_ATOMIC_STORE_PTR_RELEASE(_PyCompactUnicodeObject_CAST(op)->utf8, utf8);
134+
}
135+
136+
// Assign `length` to the `utf8_length` member of `op` viewed as a compact
// unicode object. This is a plain assignment, unlike PyUnicode_SET_UTF8()
// which stores the pointer through an FT_ATOMIC_* macro.
// NOTE(review): presumably callers set the length before publishing the
// pointer with PyUnicode_SET_UTF8() -- confirm at the call sites.
static inline void PyUnicode_SET_UTF8_LENGTH(PyObject *op, Py_ssize_t length)
137+
{
138+
_PyCompactUnicodeObject_CAST(op)->utf8_length = length;
139+
}
35140

36141

37142
/* Generic helper macro to convert characters of different types.
@@ -116,11 +221,6 @@ _PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch)
116221

117222
/* --- Unicode API -------------------------------------------------------- */
118223

119-
// Export for '_json' shared extension
120-
PyAPI_FUNC(int) _PyUnicode_CheckConsistency(
121-
PyObject *op,
122-
int check_content);
123-
124224
PyAPI_FUNC(void) _PyUnicode_ExactDealloc(PyObject *op);
125225
extern Py_ssize_t _PyUnicode_InternedSize(void);
126226
extern Py_ssize_t _PyUnicode_InternedSize_Immortal(void);

Makefile.pre.in

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -558,6 +558,7 @@ OBJECT_OBJS= \
558558
Objects/tupleobject.o \
559559
Objects/typeobject.o \
560560
Objects/typevarobject.o \
561+
Objects/unicode_codecs_utf.o \
561562
Objects/unicode_format.o \
562563
Objects/unicode_formatter.o \
563564
Objects/unicode_writer.o \

Objects/stringlib/codecs.h

Lines changed: 10 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -352,8 +352,8 @@ STRINGLIB(utf8_encoder)(PyObject *unicode,
352352
case _Py_ERROR_BACKSLASHREPLACE:
353353
/* subtract preallocated bytes */
354354
writer->size -= max_char_size * (endpos - startpos);
355-
p = backslashreplace(writer, p,
356-
unicode, startpos, endpos);
355+
p = _PyUnicode_backslashreplace(writer, p,
356+
unicode, startpos, endpos);
357357
if (p == NULL)
358358
goto error;
359359
i += (endpos - startpos - 1);
@@ -362,8 +362,8 @@ STRINGLIB(utf8_encoder)(PyObject *unicode,
362362
case _Py_ERROR_XMLCHARREFREPLACE:
363363
/* subtract preallocated bytes */
364364
writer->size -= max_char_size * (endpos - startpos);
365-
p = xmlcharrefreplace(writer, p,
366-
unicode, startpos, endpos);
365+
p = _PyUnicode_xmlcharrefreplace(writer, p,
366+
unicode, startpos, endpos);
367367
if (p == NULL)
368368
goto error;
369369
i += (endpos - startpos - 1);
@@ -384,7 +384,7 @@ STRINGLIB(utf8_encoder)(PyObject *unicode,
384384
assert(startpos < endpos);
385385
_Py_FALLTHROUGH;
386386
default:
387-
rep = unicode_encode_call_errorhandler(
387+
rep = _PyUnicode_EncodeCallErrorHandler(
388388
errors, &error_handler_obj, "utf-8", "surrogates not allowed",
389389
unicode, &exc, startpos, endpos, &newpos);
390390
if (!rep)
@@ -415,9 +415,10 @@ STRINGLIB(utf8_encoder)(PyObject *unicode,
415415
else {
416416
/* rep is unicode */
417417
if (!PyUnicode_IS_ASCII(rep)) {
418-
raise_encode_exception(&exc, "utf-8", unicode,
419-
startpos, endpos,
420-
"surrogates not allowed");
418+
_PyUnicode_RaiseEncodeException(
419+
&exc, "utf-8", unicode,
420+
startpos, endpos,
421+
"surrogates not allowed");
421422
goto error;
422423
}
423424

@@ -452,7 +453,7 @@ STRINGLIB(utf8_encoder)(PyObject *unicode,
452453
#if STRINGLIB_SIZEOF_CHAR > 2
453454
else /* ch >= 0x10000 */
454455
{
455-
assert(ch <= MAX_UNICODE);
456+
assert(ch <= _Py_MAX_UNICODE);
456457
/* Encode UCS4 Unicode ordinals */
457458
*p++ = (char)(0xf0 | (ch >> 18));
458459
*p++ = (char)(0x80 | ((ch >> 12) & 0x3f));

0 commit comments

Comments
 (0)