Skip to content

Commit 9ad1e00

Browse files
h-vetinari authored and jreback committed
REF: shift ravel in infer_dtype (#24560)
1 parent 43b35fc commit 9ad1e00

File tree

23 files changed

+147
-138
lines changed

23 files changed

+147
-138
lines changed

pandas/_libs/lib.pyx

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -623,7 +623,7 @@ def clean_index_list(obj: list):
623623
return obj, all_arrays
624624

625625
# don't force numpy coerce with nan's
626-
inferred = infer_dtype(obj)
626+
inferred = infer_dtype(obj, skipna=False)
627627
if inferred in ['string', 'bytes', 'unicode', 'mixed', 'mixed-integer']:
628628
return np.asarray(obj, dtype=object), 0
629629
elif inferred in ['integer']:
@@ -1210,6 +1210,10 @@ def infer_dtype(value: object, skipna: bool=False) -> str:
12101210
values = construct_1d_object_array_from_listlike(value)
12111211

12121212
values = getattr(values, 'values', values)
1213+
1214+
# make contiguous
1215+
values = values.ravel()
1216+
12131217
if skipna:
12141218
values = values[~isnaobj(values)]
12151219

@@ -1220,9 +1224,6 @@ def infer_dtype(value: object, skipna: bool=False) -> str:
12201224
if values.dtype != np.object_:
12211225
values = values.astype('O')
12221226

1223-
# make contiguous
1224-
values = values.ravel()
1225-
12261227
n = len(values)
12271228
if n == 0:
12281229
return 'empty'

pandas/core/algorithms.py

Lines changed: 9 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -165,7 +165,7 @@ def _ensure_arraylike(values):
165165
ensure that we are arraylike if not already
166166
"""
167167
if not is_array_like(values):
168-
inferred = lib.infer_dtype(values)
168+
inferred = lib.infer_dtype(values, skipna=False)
169169
if inferred in ['mixed', 'string', 'unicode']:
170170
if isinstance(values, tuple):
171171
values = list(values)
@@ -202,8 +202,10 @@ def _get_hashtable_algo(values):
202202

203203
if ndtype == 'object':
204204

205-
# its cheaper to use a String Hash Table than Object
206-
if lib.infer_dtype(values) in ['string']:
205+
# it's cheaper to use a String Hash Table than Object; we infer
206+
# including nulls because that is the only difference between
207+
# StringHashTable and ObjectHashtable
208+
if lib.infer_dtype(values, skipna=False) in ['string']:
207209
ndtype = 'string'
208210
else:
209211
ndtype = 'object'
@@ -220,8 +222,10 @@ def _get_data_algo(values, func_map):
220222
values, dtype, ndtype = _ensure_data(values)
221223
if ndtype == 'object':
222224

223-
# its cheaper to use a String Hash Table than Object
224-
if lib.infer_dtype(values) in ['string']:
225+
# it's cheaper to use a String Hash Table than Object; we infer
226+
# including nulls because that is the only difference between
227+
# StringHashTable and ObjectHashtable
228+
if lib.infer_dtype(values, skipna=False) in ['string']:
225229
ndtype = 'string'
226230

227231
f = func_map.get(ndtype, func_map['object'])

pandas/core/arrays/datetimes.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1652,7 +1652,7 @@ def sequence_to_dt64ns(data, dtype=None, copy=False,
16521652
# TODO: We do not have tests specific to string-dtypes,
16531653
# also complex or categorical or other extension
16541654
copy = False
1655-
if lib.infer_dtype(data) == 'integer':
1655+
if lib.infer_dtype(data, skipna=False) == 'integer':
16561656
data = data.astype(np.int64)
16571657
else:
16581658
# data comes back here as either i8 to denote UTC timestamps

pandas/core/arrays/integer.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -171,8 +171,8 @@ def coerce_to_array(values, dtype, mask=None, copy=False):
171171

172172
values = np.array(values, copy=copy)
173173
if is_object_dtype(values):
174-
inferred_type = lib.infer_dtype(values)
175-
if inferred_type is 'mixed' and isna(values).all():
174+
inferred_type = lib.infer_dtype(values, skipna=True)
175+
if inferred_type == 'empty':
176176
values = np.empty(len(values))
177177
values.fill(np.nan)
178178
elif inferred_type not in ['floating', 'integer',

pandas/core/arrays/timedeltas.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -594,7 +594,7 @@ def __floordiv__(self, other):
594594
elif is_object_dtype(other):
595595
result = [self[n] // other[n] for n in range(len(self))]
596596
result = np.array(result)
597-
if lib.infer_dtype(result) == 'timedelta':
597+
if lib.infer_dtype(result, skipna=False) == 'timedelta':
598598
result, _ = sequence_to_td64ns(result)
599599
return type(self)(result)
600600
return result

pandas/core/dtypes/cast.py

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -75,7 +75,8 @@ def trans(x):
7575

7676
if isinstance(dtype, string_types):
7777
if dtype == 'infer':
78-
inferred_type = lib.infer_dtype(ensure_object(result.ravel()))
78+
inferred_type = lib.infer_dtype(ensure_object(result.ravel()),
79+
skipna=False)
7980
if inferred_type == 'boolean':
8081
dtype = 'bool'
8182
elif inferred_type == 'integer':
@@ -460,7 +461,7 @@ def infer_dtype_from_array(arr, pandas_dtype=False):
460461
return arr.dtype, np.asarray(arr)
461462

462463
# don't force numpy coerce with nan's
463-
inferred = lib.infer_dtype(arr)
464+
inferred = lib.infer_dtype(arr, skipna=False)
464465
if inferred in ['string', 'bytes', 'unicode',
465466
'mixed', 'mixed-integer']:
466467
return (np.object_, arr)
@@ -941,10 +942,11 @@ def try_timedelta(v):
941942

942943
# We have at least a NaT and a string
943944
# try timedelta first to avoid spurious datetime conversions
944-
# e.g. '00:00:01' is a timedelta but
945-
# technically is also a datetime
945+
# e.g. '00:00:01' is a timedelta but technically is also a datetime
946946
value = try_timedelta(v)
947-
if lib.infer_dtype(value) in ['mixed']:
947+
if lib.infer_dtype(value, skipna=False) in ['mixed']:
948+
# cannot skip missing values, as NaT implies that the string
949+
# is actually a datetime
948950
value = try_datetime(v)
949951

950952
return value

pandas/core/dtypes/common.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -703,7 +703,8 @@ def is_datetime_arraylike(arr):
703703
if isinstance(arr, ABCDatetimeIndex):
704704
return True
705705
elif isinstance(arr, (np.ndarray, ABCSeries)):
706-
return arr.dtype == object and lib.infer_dtype(arr) == 'datetime'
706+
return (is_object_dtype(arr.dtype)
707+
and lib.infer_dtype(arr, skipna=False) == 'datetime')
707708
return getattr(arr, 'inferred_type', None) == 'datetime'
708709

709710

pandas/core/dtypes/missing.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -474,7 +474,7 @@ def _infer_fill_value(val):
474474
if is_datetimelike(val):
475475
return np.array('NaT', dtype=val.dtype)
476476
elif is_object_dtype(val.dtype):
477-
dtype = lib.infer_dtype(ensure_object(val))
477+
dtype = lib.infer_dtype(ensure_object(val), skipna=False)
478478
if dtype in ['datetime', 'datetime64']:
479479
return np.array('NaT', dtype=_NS_DTYPE)
480480
elif dtype in ['timedelta', 'timedelta64']:

pandas/core/indexes/base.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -346,7 +346,7 @@ def __new__(cls, data=None, dtype=None, copy=False, name=None,
346346
# should not be coerced
347347
# GH 11836
348348
if is_integer_dtype(dtype):
349-
inferred = lib.infer_dtype(data)
349+
inferred = lib.infer_dtype(data, skipna=False)
350350
if inferred == 'integer':
351351
data = maybe_cast_to_integer_array(data, dtype,
352352
copy=copy)
@@ -376,7 +376,7 @@ def __new__(cls, data=None, dtype=None, copy=False, name=None,
376376
else:
377377
data = data.astype(dtype)
378378
elif is_float_dtype(dtype):
379-
inferred = lib.infer_dtype(data)
379+
inferred = lib.infer_dtype(data, skipna=False)
380380
if inferred == 'string':
381381
pass
382382
else:
@@ -414,7 +414,7 @@ def __new__(cls, data=None, dtype=None, copy=False, name=None,
414414
subarr = subarr.copy()
415415

416416
if dtype is None:
417-
inferred = lib.infer_dtype(subarr)
417+
inferred = lib.infer_dtype(subarr, skipna=False)
418418
if inferred == 'integer':
419419
try:
420420
return cls._try_convert_to_int_index(
@@ -1718,7 +1718,7 @@ def inferred_type(self):
17181718
"""
17191719
Return a string of the type inferred from the values.
17201720
"""
1721-
return lib.infer_dtype(self)
1721+
return lib.infer_dtype(self, skipna=False)
17221722

17231723
@cache_readonly
17241724
def is_all_dates(self):

pandas/core/indexes/multi.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2318,7 +2318,8 @@ def _partial_tup_index(self, tup, side='left'):
23182318
section = labs[start:end]
23192319

23202320
if lab not in lev:
2321-
if not lev.is_type_compatible(lib.infer_dtype([lab])):
2321+
if not lev.is_type_compatible(lib.infer_dtype([lab],
2322+
skipna=False)):
23222323
raise TypeError('Level type mismatch: %s' % lab)
23232324

23242325
# short circuit

0 commit comments

Comments
 (0)