Skip to content
Closed
Show file tree
Hide file tree
Changes from 1 commit
Commits
Show all changes
28 commits
Select commit Hold shift + click to select a range
c84f314
gh-119609: Add PyUnicode_Export() function
vstinner May 27, 2024
d0cdbd1
Address reviews
vstinner Sep 5, 2024
9b33dca
Exclude from limited C API 3.13 and older
vstinner Sep 5, 2024
cf1f74a
Replace PyErr_Format() with PyErr_SetString()
vstinner Sep 5, 2024
93d4470
Fix test_collections: implement UserString.__release_buffer__()
vstinner Sep 5, 2024
17ad7b9
Add format parameter to PyUnicode_Export()
vstinner Sep 9, 2024
d683d0a
format must not be NULL
vstinner Sep 9, 2024
78a70fa
Fix memory leak in unicode_releasebuffer()
vstinner Sep 10, 2024
79207f5
Remove PyUnicode_GetBufferFormat() documentation
vstinner Sep 10, 2024
bc0fb69
Apply suggestions from code review
vstinner Sep 10, 2024
2cdbc27
Set format to 0 on error
vstinner Sep 10, 2024
b5be22d
Remove trailing space
vstinner Sep 10, 2024
2960b25
Change constant values
vstinner Sep 10, 2024
bcb41f3
Update constants value in the doc
vstinner Sep 11, 2024
44cb702
Remove unicode_releasebuffer(); use bytes instead
vstinner Sep 12, 2024
1809d8d
PyUnicode_Export() returns the format
vstinner Sep 12, 2024
6707ef4
Fix PyUnicode_Export() signature in doc
vstinner Sep 12, 2024
abf5c58
Use _PyUnicode_EncodeUTF16() and _PyUnicode_EncodeUTF32()
vstinner Sep 12, 2024
033fc07
Use signed int in C tests
vstinner Sep 12, 2024
078dfcf
Update stable_abi: remove PyUnicode_GetBufferFormat()
vstinner Sep 12, 2024
79c6d01
Revert "Use _PyUnicode_EncodeUTF16() and _PyUnicode_EncodeUTF32()"
vstinner Sep 12, 2024
5479ab2
Allow surrogate characters in UTF-8
vstinner Sep 12, 2024
ab2f9b0
Merge branch 'main' into unicode_view
vstinner Sep 13, 2024
f71f230
Avoid a second copy in the UTF-8 export
vstinner Sep 13, 2024
492f10a
UCS-4 export: remove one memory copy
vstinner Sep 13, 2024
b031163
Update Py_buffer format
vstinner Sep 16, 2024
21e6012
Add PyUnicode_EXPORT_COPY flag
vstinner Sep 23, 2024
3267ce6
doc
vstinner Sep 23, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 12 additions & 6 deletions Lib/test/test_capi/test_unicode.py
Original file line number Diff line number Diff line change
Expand Up @@ -1766,28 +1766,29 @@ def check_ucs1(text, formats):

def check_ucs2(text, formats):
self.assertEqual(unicode_export(text, formats),
(text.encode(ucs2_enc),
(text.encode(ucs2_enc, 'surrogatepass'),
PyUnicode_FORMAT_UCS2, 2, BUFFER_UCS2))

def check_ucs4(text, formats):
self.assertEqual(unicode_export(text, formats),
(text.encode(ucs4_enc),
(text.encode(ucs4_enc, 'surrogatepass'),
PyUnicode_FORMAT_UCS4, 4, BUFFER_UCS4))

def check_utf8(text):
self.assertEqual(unicode_export(text, PyUnicode_FORMAT_UTF8),
(text.encode('utf8'),
(text.encode('utf8', 'surrogatepass'),
PyUnicode_FORMAT_UTF8, 1, 'B'))

# export as native format
check_ucs1("abc", formats)
check_ucs1("latin1:\xe9", formats)
check_ucs2('ucs2:\u20ac', formats)
check_ucs4('ucs4:\U0010ffff', formats)

# export ASCII as UCS1
# convert ASCII to UCS1
check_ucs1("abc", PyUnicode_FORMAT_UCS1)

# export ASCII and UCS1 to UCS2
# convert ASCII and UCS1 to UCS2
check_ucs2("abc", PyUnicode_FORMAT_UCS2)
check_ucs2("latin1:\xe9", PyUnicode_FORMAT_UCS2)

Expand All @@ -1797,12 +1798,17 @@ def check_utf8(text):
check_ucs4('ucs2:\u20ac', PyUnicode_FORMAT_UCS4)
check_ucs4('ucs4:\U0010ffff', PyUnicode_FORMAT_UCS4)

# always export to UTF8
# always encode to UTF8
check_utf8("abc")
check_utf8("latin1:\xe9")
check_utf8('ucs2:\u20ac')
check_utf8('ucs4:\U0010ffff')

# surrogates
check_ucs2('\udc80', PyUnicode_FORMAT_UCS2)
check_ucs4('\udc80', PyUnicode_FORMAT_UCS4)
check_utf8('\udc80')

# No supported format or invalid format
for formats in (0, PyUnicode_FORMAT_INVALID):
err_msg = "unable to find a matching export format"
Expand Down
77 changes: 58 additions & 19 deletions Objects/unicodeobject.c
Original file line number Diff line number Diff line change
Expand Up @@ -2335,16 +2335,39 @@ PyUnicodeWriter_WriteUCS4(PyUnicodeWriter *pub_writer,
static int32_t
unicode_export(PyObject *obj, Py_buffer *view,
Py_ssize_t len, const void *buf,
int itemsize, const char *format, int32_t internal_format)
int itemsize, const char *format, int32_t export_format)
{
if (PyBuffer_FillInfo(view, obj, (void*)buf, len,
1, PyBUF_SIMPLE) < 0) {
return -1;
}
view->itemsize = itemsize;
view->format = (char*)format;
view->internal = (void*)(uintptr_t)internal_format;
return internal_format;
return export_format;
}


/* Fill `view` from a bytes object that holds the exported characters
 * followed by one trailing null item of `itemsize` bytes.
 *
 * The caller's reference to `bytes` is always released here, on success and
 * on failure alike.  PyBuffer_FillInfo() is given `bytes` as the exporter,
 * so on success the view is expected to hold its own reference to it
 * (see the PyBuffer_FillInfo() documentation).
 *
 * Returns `export_format` on success, or -1 if PyBuffer_FillInfo() failed.
 */
static int32_t
unicode_export_bytes(PyObject *bytes, Py_buffer *view,
                     int itemsize, const char *format, int32_t export_format)
{
    const void *data = PyBytes_AS_STRING(bytes);
    Py_ssize_t size = PyBytes_GET_SIZE(bytes);
    assert((size % itemsize) == 0);

    Py_ssize_t count = size / itemsize;
    assert(count >= 1);
    // The trailing null item is storage only; it is not part of the export.
    count -= 1;

    int rc = PyBuffer_FillInfo(view, bytes, (void*)data, count,
                               1, PyBUF_SIMPLE);
    // Drop the caller's reference whatever the outcome.
    Py_DECREF(bytes);
    if (rc < 0) {
        return -1;
    }

    // PyBuffer_FillInfo() was called with PyBUF_SIMPLE; set the item size
    // and format requested by the caller afterwards.
    view->itemsize = itemsize;
    view->format = (char*)format;
    return export_format;
}


Expand Down Expand Up @@ -2410,11 +2433,8 @@ PyUnicode_Export(PyObject *unicode, int32_t requested_formats,
ucs2);
ucs2[len] = 0;

int32_t res = unicode_export(bytes, view,
len, ucs2,
2, "H", PyUnicode_FORMAT_UCS2);
Py_DECREF(bytes);
return res;
return unicode_export_bytes(bytes, view,
2, "H", PyUnicode_FORMAT_UCS2);
}

// Native UCS4
Expand All @@ -2438,25 +2458,44 @@ PyUnicode_Export(PyObject *unicode, int32_t requested_formats,
if (bytes == NULL) {
return -1;
}
ucs4 = (Py_UCS4*)PyBytes_AS_STRING(bytes);

int32_t res = unicode_export(bytes, view,
len, ucs4,
4, BUFFER_UCS4, PyUnicode_FORMAT_UCS4);
Py_DECREF(bytes);
return res;
return unicode_export_bytes(bytes, view,
4, BUFFER_UCS4, PyUnicode_FORMAT_UCS4);
}

// Encode UCS1, UCS2 or UCS4 to UTF-8
if (requested_formats & PyUnicode_FORMAT_UTF8) {
Py_ssize_t nbytes;
const char *utf8 = PyUnicode_AsUTF8AndSize(unicode, &nbytes);
if (utf8 == NULL) {
return -1;
if (utf8 != NULL) {
return unicode_export(unicode, view,
nbytes, utf8,
1, "B", PyUnicode_FORMAT_UTF8);
}
return unicode_export(unicode, view,
nbytes, utf8,
1, "B", PyUnicode_FORMAT_UTF8);
if (PyErr_ExceptionMatches(PyExc_UnicodeEncodeError)) {
PyErr_Clear();
PyObject *bytes = _PyUnicode_AsUTF8String(unicode, "surrogatepass");
if (bytes == NULL) {
return -1;
}
len = PyBytes_GET_SIZE(bytes);

// Copy to add a NULL character
PyObject *bytes2 = PyBytes_FromStringAndSize(NULL, len + 1);
if (bytes2 == NULL) {
Py_DECREF(bytes);
return -1;
}

char *str = PyBytes_AS_STRING(bytes2);
memcpy(str, PyBytes_AS_STRING(bytes), len);
str[len] = '\0';
Py_DECREF(bytes);

return unicode_export_bytes(bytes2, view,
1, "B", PyUnicode_FORMAT_UTF8);
}
return -1;
}

PyErr_SetString(PyExc_ValueError,
Expand Down