Skip to content

Commit 086f52c

Browse files
committed
implemented correct conversion from bytes to unicode string PyObject
1 parent bf7c0d7 commit 086f52c

File tree

2 files changed

+20
-35
lines changed

2 files changed

+20
-35
lines changed

src/_arraykit.c

Lines changed: 1 addition & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -6736,36 +6736,22 @@ AK_TM_fill_flexible(TriMapObject* tm,
67366736

67376737
bool decref_fill_value = false;
67386738
if (PyBytes_Check(fill_value)) {
6739-
AK_DEBUG_MSG_OBJ("pre unicode", fill_value);
6740-
Py_ssize_t byte_length;
6741-
char *byte_data;
6742-
if (PyBytes_AsStringAndSize(fill_value, &byte_data, &byte_length)) {
6743-
return -1;
6744-
}
6745-
6746-
PyObject* fill_value = PyUnicode_DecodeASCII(byte_data, byte_length, "strict");
6747-
// original fill_value is a borrowed ref; replace it with a new ref
6739+
fill_value = PyUnicode_FromEncodedObject(fill_value, "utf-8", NULL);
67486740
if (fill_value == NULL) {
67496741
return -1;
67506742
}
6751-
AK_DEBUG_MSG_OBJ("as unicode", fill_value);
67526743
decref_fill_value = true;
67536744
}
67546745
else if (!PyUnicode_Check(fill_value)) {
67556746
return -1;
67566747
}
67576748
Py_ssize_t fill_cp = PyUnicode_GET_LENGTH(fill_value) * UCS4_SIZE; // code points
6758-
AK_DEBUG_MSG_OBJ("fill_cp", PyLong_FromSsize_t(fill_cp));
67596749

67606750
// p is the index position to fill
67616751
npy_int64* p = (npy_int64*)PyArray_DATA(final_fill);
67626752
npy_int64* p_end = p + PyArray_SIZE(final_fill);
6763-
67646753
Py_UCS4* target;
67656754
while (p < p_end) {
6766-
AK_DEBUG_MSG_OBJ("p iteration", PyLong_FromSsize_t(*p));
6767-
AK_DEBUG_MSG_OBJ("shift", PyLong_FromSsize_t(*p * cp));
6768-
67696755
target = array_to_data + (*p * cp);
67706756
// disabling copying a null
67716757
if (PyUnicode_AsUCS4(fill_value, target, fill_cp, 0) == NULL) {

test/test_tri_map.py

Lines changed: 19 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -994,26 +994,25 @@ def test_tri_map_map_unicode_a(self) -> None:
994994
post_dst = tm.map_dst_fill(dst, '====', np.array('====').dtype)
995995
self.assertEqual(post_dst.tolist(), ['a', 'a', 'a', '====', 'cc', 'cc', '===='])
996996

997-
# TODO
998-
# def test_tri_map_map_unicode_b(self) -> None:
999-
# src = np.array(['a', 'bbb', 'cc', 'dddd'])
1000-
# dst = np.array(['cc', 'a', 'a', 'a', 'cc'])
1001-
1002-
# tm = TriMap(len(src), len(dst))
1003-
# tm.register_many(0, np.array([1, 2, 3], dtype=np.dtype(np.int64)))
1004-
# tm.register_one(1, -1)
1005-
# tm.register_many(2, np.array([0, 4], dtype=np.dtype(np.int64)))
1006-
# tm.register_one(3, -1)
1007-
# tm.finalize()
1008-
1009-
# post_src = tm.map_src_no_fill(src)
1010-
# self.assertEqual(post_src.tolist(), ['a', 'a', 'a', 'bbb', 'cc', 'cc', 'dddd'])
1011-
1012-
# post_dst1 = tm.map_dst_fill(dst, b'====', np.array(b'====').dtype)
1013-
# self.assertEqual(post_dst1.tolist(), ['a', 'a', 'a', '====', 'cc', 'cc', '===='])
1014-
1015-
# post_dst2 = tm.map_dst_fill(dst, b'?', np.array(b'?').dtype)
1016-
# self.assertEqual(post_dst2.tolist(), ['a', 'a', 'a', '?', 'cc', 'cc', '?'])
997+
def test_tri_map_map_unicode_b(self) -> None:
998+
src = np.array(['a', 'bbb', 'cc', 'dddd'])
999+
dst = np.array(['cc', 'a', 'a', 'a', 'cc'])
1000+
1001+
tm = TriMap(len(src), len(dst))
1002+
tm.register_many(0, np.array([1, 2, 3], dtype=np.dtype(np.int64)))
1003+
tm.register_one(1, -1)
1004+
tm.register_many(2, np.array([0, 4], dtype=np.dtype(np.int64)))
1005+
tm.register_one(3, -1)
1006+
tm.finalize()
1007+
1008+
post_src = tm.map_src_no_fill(src)
1009+
self.assertEqual(post_src.tolist(), ['a', 'a', 'a', 'bbb', 'cc', 'cc', 'dddd'])
1010+
1011+
post_dst1 = tm.map_dst_fill(dst, b'====', np.array(b'====').dtype)
1012+
self.assertEqual(post_dst1.tolist(), ['a', 'a', 'a', '====', 'cc', 'cc', '===='])
1013+
1014+
post_dst2 = tm.map_dst_fill(dst, b'?', np.array(b'?').dtype)
1015+
self.assertEqual(post_dst2.tolist(), ['a', 'a', 'a', '?', 'cc', 'cc', '?'])
10171016

10181017
def test_tri_map_map_unicode_c(self) -> None:
10191018
src = np.array(['a', 'bbb', 'cc', 'dddd'])

0 commit comments

Comments
 (0)