Skip to content

Commit e4c3a5b

Browse files
committed
Merge remote-tracking branch 'upstream/main' into aijams-take-function-invalid-dtype
2 parents b4258f2 + d81171b commit e4c3a5b

File tree

26 files changed

+227
-235
lines changed

26 files changed

+227
-235
lines changed

doc/source/whatsnew/v3.0.0.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1087,6 +1087,7 @@ I/O
10871087
- Bug in :meth:`HDFStore.select` causing queries on categorical string columns to return unexpected results (:issue:`57608`)
10881088
- Bug in :meth:`MultiIndex.factorize` incorrectly raising on length-0 indexes (:issue:`57517`)
10891089
- Bug in :meth:`read_csv` causing segmentation fault when ``encoding_errors`` is not a string. (:issue:`59059`)
1090+
- Bug in :meth:`read_csv` for the ``c`` and ``python`` engines where parsing numbers with large exponents caused overflows. Now, numbers with large positive exponents are parsed as ``inf`` or ``-inf`` depending on the sign of the mantissa, while those with large negative exponents are parsed as ``0.0`` (:issue:`62617`, :issue:`38794`, :issue:`62740`)
10901091
- Bug in :meth:`read_csv` raising ``TypeError`` when ``index_col`` is specified and ``na_values`` is a dict containing the key ``None``. (:issue:`57547`)
10911092
- Bug in :meth:`read_csv` raising ``TypeError`` when ``nrows`` and ``iterator`` are specified without specifying a ``chunksize``. (:issue:`59079`)
10921093
- Bug in :meth:`read_csv` where the order of ``na_values`` caused inconsistent results when ``na_values`` is a list of non-string values. (:issue:`59303`)

pandas/_config/config.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -944,3 +944,11 @@ def is_callable(obj: object) -> bool:
944944
if not callable(obj):
945945
raise ValueError("Value must be a callable")
946946
return True
947+
948+
949+
# Importing set_module here would cause a circular import, so set __module__ directly.
950+
get_option.__module__ = "pandas"
951+
set_option.__module__ = "pandas"
952+
describe_option.__module__ = "pandas"
953+
reset_option.__module__ = "pandas"
954+
option_context.__module__ = "pandas"

pandas/_libs/lib.pyx

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,7 @@ from cython cimport (
4141
from pandas._config import using_string_dtype
4242

4343
from pandas._libs.missing import check_na_tuples_nonequal
44+
from pandas.util._decorators import set_module
4445

4546
import_datetime()
4647

@@ -154,6 +155,7 @@ def memory_usage_of_objects(arr: object[:]) -> int64_t:
154155
# ----------------------------------------------------------------------
155156

156157

158+
@set_module("pandas.api.types")
157159
def is_scalar(val: object) -> bool:
158160
"""
159161
Return True if given object is scalar.
@@ -255,6 +257,7 @@ cdef int64_t get_itemsize(object val):
255257
return -1
256258

257259

260+
@set_module("pandas.api.types")
258261
def is_iterator(obj: object) -> bool:
259262
"""
260263
Check if the object is an iterator.
@@ -1095,6 +1098,7 @@ def indices_fast(ndarray[intp_t, ndim=1] index, const int64_t[:] labels, list ke
10951098

10961099
# core.common import for fast inference checks
10971100

1101+
@set_module("pandas.api.types")
10981102
def is_float(obj: object) -> bool:
10991103
"""
11001104
Return True if given object is float.
@@ -1128,6 +1132,7 @@ def is_float(obj: object) -> bool:
11281132
return util.is_float_object(obj)
11291133

11301134

1135+
@set_module("pandas.api.types")
11311136
def is_integer(obj: object) -> bool:
11321137
"""
11331138
Return True if given object is integer.
@@ -1172,6 +1177,7 @@ def is_int_or_none(obj) -> bool:
11721177
return obj is None or util.is_integer_object(obj)
11731178

11741179

1180+
@set_module("pandas.api.types")
11751181
def is_bool(obj: object) -> bool:
11761182
"""
11771183
Return True if given object is boolean.
@@ -1202,6 +1208,7 @@ def is_bool(obj: object) -> bool:
12021208
return util.is_bool_object(obj)
12031209

12041210

1211+
@set_module("pandas.api.types")
12051212
def is_complex(obj: object) -> bool:
12061213
"""
12071214
Return True if given object is complex.
@@ -1237,6 +1244,7 @@ cpdef bint is_decimal(object obj):
12371244
return isinstance(obj, Decimal)
12381245

12391246

1247+
@set_module("pandas.api.types")
12401248
def is_list_like(obj: object, allow_sets: bool = True) -> bool:
12411249
"""
12421250
Check if the object is list-like.
@@ -1520,6 +1528,7 @@ cdef object _try_infer_map(object dtype):
15201528
return None
15211529

15221530

1531+
@set_module("pandas.api.types")
15231532
def infer_dtype(value: object, skipna: bool = True) -> str:
15241533
"""
15251534
Return a string label of the type of the elements in a list-like input.

pandas/_libs/parsers.pyx

Lines changed: 29 additions & 73 deletions
Original file line numberDiff line numberDiff line change
@@ -144,7 +144,7 @@ cdef extern from "pandas/parser/tokenizer.h":
144144
SKIP_LINE
145145
FINISHED
146146

147-
enum: ERROR_OVERFLOW
147+
enum: ERROR_OVERFLOW, ERROR_INVALID_CHARS
148148

149149
ctypedef enum BadLineHandleMethod:
150150
ERROR,
@@ -1051,7 +1051,7 @@ cdef class TextReader:
10511051
if col_dtype is not None:
10521052
col_res, na_count = self._convert_with_dtype(
10531053
col_dtype, i, start, end, na_filter,
1054-
1, na_hashset, na_fset)
1054+
1, na_hashset, na_fset, False)
10551055

10561056
# Fallback on the parse (e.g. we requested int dtype,
10571057
# but its actually a float).
@@ -1062,30 +1062,34 @@ cdef class TextReader:
10621062
return self._string_convert(i, start, end, na_filter, na_hashset)
10631063
else:
10641064
col_res = None
1065+
maybe_int = True
10651066
for dt in self.dtype_cast_order:
1066-
if (dt.kind in "iu" and
1067-
self._column_has_float(i, start, end, na_filter, na_hashset)):
1067+
if not maybe_int and dt.kind in "iu":
10681068
continue
10691069

10701070
try:
10711071
col_res, na_count = self._convert_with_dtype(
1072-
dt, i, start, end, na_filter, 0, na_hashset, na_fset)
1073-
except ValueError:
1074-
# This error is raised from trying to convert to uint64,
1075-
# and we discover that we cannot convert to any numerical
1076-
# dtype successfully. As a result, we leave the data
1077-
# column AS IS with object dtype.
1078-
col_res, na_count = self._convert_with_dtype(
1079-
np.dtype("object"), i, start, end, 0,
1080-
0, na_hashset, na_fset)
1072+
dt, i, start, end, na_filter, 0, na_hashset, na_fset, True)
1073+
except ValueError as e:
1074+
if str(e) == "Number is not int":
1075+
maybe_int = False
1076+
continue
1077+
else:
1078+
# This error is raised from trying to convert to uint64,
1079+
# and we discover that we cannot convert to any numerical
1080+
# dtype successfully. As a result, we leave the data
1081+
# column AS IS with object dtype.
1082+
col_res, na_count = self._convert_with_dtype(
1083+
np.dtype("object"), i, start, end, 0,
1084+
0, na_hashset, na_fset, False)
10811085
except OverflowError:
10821086
try:
10831087
col_res, na_count = _try_pylong(self.parser, i, start,
10841088
end, na_filter, na_hashset)
10851089
except ValueError:
10861090
col_res, na_count = self._convert_with_dtype(
10871091
np.dtype("object"), i, start, end, 0,
1088-
0, na_hashset, na_fset)
1092+
0, na_hashset, na_fset, False)
10891093

10901094
if col_res is not None:
10911095
break
@@ -1133,7 +1137,7 @@ cdef class TextReader:
11331137
bint na_filter,
11341138
bint user_dtype,
11351139
kh_str_starts_t *na_hashset,
1136-
set na_fset):
1140+
set na_fset, bint raise_on_invalid):
11371141
if isinstance(dtype, CategoricalDtype):
11381142
# TODO: I suspect that _categorical_convert could be
11391143
# optimized when dtype is an instance of CategoricalDtype
@@ -1174,14 +1178,14 @@ cdef class TextReader:
11741178

11751179
elif dtype.kind in "iu":
11761180
try:
1177-
result, na_count = _try_int64(self.parser, i, start,
1178-
end, na_filter, na_hashset)
1181+
result, na_count = _try_int64(self.parser, i, start, end,
1182+
na_filter, na_hashset, raise_on_invalid)
11791183
if user_dtype and na_count is not None:
11801184
if na_count > 0:
11811185
raise ValueError(f"Integer column has NA values in column {i}")
11821186
except OverflowError:
11831187
result = _try_uint64(self.parser, i, start, end,
1184-
na_filter, na_hashset)
1188+
na_filter, na_hashset, raise_on_invalid)
11851189
na_count = 0
11861190

11871191
if result is not None and dtype != "int64":
@@ -1344,59 +1348,6 @@ cdef class TextReader:
13441348
else:
13451349
return None
13461350

1347-
cdef bint _column_has_float(self, Py_ssize_t col,
1348-
int64_t start, int64_t end,
1349-
bint na_filter, kh_str_starts_t *na_hashset):
1350-
"""Check if the column contains any float number."""
1351-
cdef:
1352-
Py_ssize_t i, j, lines = end - start
1353-
coliter_t it
1354-
const char *word = NULL
1355-
const char *ignored_chars = " +-"
1356-
const char *digits = "0123456789"
1357-
const char *float_indicating_chars = "eE"
1358-
char null_byte = 0
1359-
1360-
coliter_setup(&it, self.parser, col, start)
1361-
1362-
for i in range(lines):
1363-
COLITER_NEXT(it, word)
1364-
1365-
if na_filter and kh_get_str_starts_item(na_hashset, word):
1366-
continue
1367-
1368-
found_first_digit = False
1369-
j = 0
1370-
while word[j] != null_byte:
1371-
if word[j] == self.parser.decimal:
1372-
return True
1373-
elif not found_first_digit and word[j] in ignored_chars:
1374-
# no-op
1375-
pass
1376-
elif not found_first_digit and word[j] not in digits:
1377-
# word isn't numeric
1378-
return False
1379-
elif not found_first_digit and word[j] in digits:
1380-
found_first_digit = True
1381-
elif word[j] in float_indicating_chars:
1382-
# preceding chars indicates numeric and
1383-
# current char indicates float
1384-
return True
1385-
elif word[j] not in digits:
1386-
# previous characters indicates numeric
1387-
# current character shows otherwise
1388-
return False
1389-
elif word[j] in digits:
1390-
# no-op
1391-
pass
1392-
else:
1393-
raise AssertionError(
1394-
f"Unhandled case {word[j]=} {found_first_digit=}"
1395-
)
1396-
j += 1
1397-
1398-
return False
1399-
14001351
# Factor out code common to TextReader.__dealloc__ and TextReader.close
14011352
# It cannot be a class method, since calling self.close() in __dealloc__
14021353
# which causes a class attribute lookup and violates best practices
@@ -1793,7 +1744,8 @@ cdef int _try_double_nogil(parser_t *parser,
17931744

17941745
cdef _try_uint64(parser_t *parser, int64_t col,
17951746
int64_t line_start, int64_t line_end,
1796-
bint na_filter, kh_str_starts_t *na_hashset):
1747+
bint na_filter, kh_str_starts_t *na_hashset,
1748+
bint raise_on_invalid):
17971749
cdef:
17981750
int error
17991751
Py_ssize_t lines
@@ -1815,6 +1767,8 @@ cdef _try_uint64(parser_t *parser, int64_t col,
18151767
if error == ERROR_OVERFLOW:
18161768
# Can't get the word variable
18171769
raise OverflowError("Overflow")
1770+
elif raise_on_invalid and error == ERROR_INVALID_CHARS:
1771+
raise ValueError("Number is not int")
18181772
return None
18191773

18201774
if uint64_conflict(&state):
@@ -1863,7 +1817,7 @@ cdef int _try_uint64_nogil(parser_t *parser, int64_t col,
18631817

18641818
cdef _try_int64(parser_t *parser, int64_t col,
18651819
int64_t line_start, int64_t line_end,
1866-
bint na_filter, kh_str_starts_t *na_hashset):
1820+
bint na_filter, kh_str_starts_t *na_hashset, bint raise_on_invalid):
18671821
cdef:
18681822
int error, na_count = 0
18691823
Py_ssize_t lines
@@ -1883,6 +1837,8 @@ cdef _try_int64(parser_t *parser, int64_t col,
18831837
if error == ERROR_OVERFLOW:
18841838
# Can't get the word variable
18851839
raise OverflowError("Overflow")
1840+
elif raise_on_invalid and error == ERROR_INVALID_CHARS:
1841+
raise ValueError("Number is not int")
18861842
return None, None
18871843

18881844
return result, na_count

pandas/_libs/src/parser/tokenizer.c

Lines changed: 17 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -1620,9 +1620,9 @@ double precise_xstrtod(const char *str, char **endptr, char decimal, char sci,
16201620
}
16211621

16221622
double number = 0.;
1623-
int exponent = 0;
1624-
int num_digits = 0;
1625-
int num_decimals = 0;
1623+
long int exponent = 0;
1624+
long int num_digits = 0;
1625+
long int num_decimals = 0;
16261626

16271627
// Process string of digits.
16281628
while (isdigit_ascii(*p)) {
@@ -1671,39 +1671,26 @@ double precise_xstrtod(const char *str, char **endptr, char decimal, char sci,
16711671
if (maybe_int != NULL)
16721672
*maybe_int = 0;
16731673

1674-
// Handle optional sign
1675-
negative = 0;
1676-
switch (*++p) {
1677-
case '-':
1678-
negative = 1;
1679-
PD_FALLTHROUGH; // Fall through to increment position.
1680-
case '+':
1681-
p++;
1682-
break;
1683-
}
1674+
// Move past the scientific-notation (exponent) marker character.
1675+
p++;
16841676

1685-
// Process string of digits.
1686-
num_digits = 0;
1687-
int n = 0;
1688-
while (num_digits < max_digits && isdigit_ascii(*p)) {
1689-
n = n * 10 + (*p - '0');
1690-
num_digits++;
1691-
p++;
1692-
}
1677+
char *tmp_ptr;
1678+
long int n = strtol(p, &tmp_ptr, 10);
16931679

1694-
if (negative)
1695-
exponent -= n;
1696-
else
1697-
exponent += n;
1680+
if (errno == ERANGE || checked_add(exponent, n, &exponent)) {
1681+
errno = 0;
1682+
exponent = n;
1683+
}
16981684

16991685
// If no digits after the 'e'/'E', un-consume it.
1700-
if (num_digits == 0)
1686+
if (tmp_ptr == p)
17011687
p--;
1688+
else
1689+
p = tmp_ptr;
17021690
}
17031691

17041692
if (exponent > 308) {
1705-
*error = ERANGE;
1706-
return HUGE_VAL;
1693+
number = number == 0 ? 0 : number < 0 ? -HUGE_VAL : HUGE_VAL;
17071694
} else if (exponent > 0) {
17081695
number *= e[exponent];
17091696
} else if (exponent < -308) { // Subnormal
@@ -1718,9 +1705,6 @@ double precise_xstrtod(const char *str, char **endptr, char decimal, char sci,
17181705
number /= e[-exponent];
17191706
}
17201707

1721-
if (number == HUGE_VAL || number == -HUGE_VAL)
1722-
*error = ERANGE;
1723-
17241708
if (skip_trailing) {
17251709
// Skip trailing whitespace.
17261710
while (isspace_ascii(*p))
@@ -1812,8 +1796,6 @@ double round_trip(const char *p, char **q, char decimal, char Py_UNUSED(sci),
18121796
*maybe_int = 0;
18131797
if (PyErr_Occurred() != NULL)
18141798
*error = -1;
1815-
else if (r == Py_HUGE_VAL)
1816-
*error = (int)Py_HUGE_VAL;
18171799
PyErr_Clear();
18181800

18191801
PyGILState_Release(gstate);
@@ -1907,7 +1889,7 @@ int64_t str_to_int64(const char *p_item, int *error, char tsep) {
19071889
int64_t number = strtoll(p, &endptr, 10);
19081890

19091891
if (errno == ERANGE) {
1910-
*error = ERROR_OVERFLOW;
1892+
*error = *endptr ? ERROR_INVALID_CHARS : ERROR_OVERFLOW;
19111893
errno = 0;
19121894
return 0;
19131895
}
@@ -1967,7 +1949,7 @@ uint64_t str_to_uint64(uint_state *state, const char *p_item, int *error,
19671949
uint64_t number = strtoull(p, &endptr, 10);
19681950

19691951
if (errno == ERANGE) {
1970-
*error = ERROR_OVERFLOW;
1952+
*error = *endptr ? ERROR_INVALID_CHARS : ERROR_OVERFLOW;
19711953
errno = 0;
19721954
return 0;
19731955
}

0 commit comments

Comments
 (0)