diff --git a/asv_bench/benchmarks/series_methods.py b/asv_bench/benchmarks/series_methods.py index 2db46abca119c..9f3689779d056 100644 --- a/asv_bench/benchmarks/series_methods.py +++ b/asv_bench/benchmarks/series_methods.py @@ -87,8 +87,8 @@ def setup(self): self.vals_short = np.arange(2).astype(object) self.vals_long = np.arange(10 ** 5).astype(object) # because of nans floats are special: - self.s_long_floats = Series(np.arange(10 ** 5, dtype=np.float)).astype(object) - self.vals_long_floats = np.arange(10 ** 5, dtype=np.float).astype(object) + self.s_long_floats = Series(np.arange(10 ** 5, dtype=np.float_)).astype(object) + self.vals_long_floats = np.arange(10 ** 5, dtype=np.float_).astype(object) def time_isin_nans(self): # if nan-objects are different objects, diff --git a/ci/deps/actions-37-locale.yaml b/ci/deps/actions-37-locale.yaml index b18ce37d05ca0..551308f1d5fac 100644 --- a/ci/deps/actions-37-locale.yaml +++ b/ci/deps/actions-37-locale.yaml @@ -11,7 +11,7 @@ dependencies: - hypothesis>=3.58.0 # required - - numpy + - numpy<1.20 # GH#39541 compat for pyarrow<3 - python-dateutil - pytz diff --git a/ci/deps/azure-37.yaml b/ci/deps/azure-37.yaml index 82cb6760b6d1e..4fe3de161960c 100644 --- a/ci/deps/azure-37.yaml +++ b/ci/deps/azure-37.yaml @@ -18,7 +18,7 @@ dependencies: - numpy - python-dateutil - nomkl - - pyarrow + - pyarrow=0.15.1 - pytz - s3fs>=0.4.0 - moto>=1.3.14 diff --git a/ci/deps/azure-38-locale.yaml b/ci/deps/azure-38-locale.yaml index 15d503e8fd0a5..26297a3066fa5 100644 --- a/ci/deps/azure-38-locale.yaml +++ b/ci/deps/azure-38-locale.yaml @@ -24,7 +24,7 @@ dependencies: - moto - nomkl - numexpr - - numpy + - numpy<1.20 # GH#39541 compat with pyarrow<3 - openpyxl - pytables - python-dateutil diff --git a/ci/deps/azure-macos-37.yaml b/ci/deps/azure-macos-37.yaml index 0b8aff83fe230..d667adddda859 100644 --- a/ci/deps/azure-macos-37.yaml +++ b/ci/deps/azure-macos-37.yaml @@ -21,7 +21,7 @@ dependencies: - numexpr - numpy=1.16.5 - openpyxl - - pyarrow>=0.15.0 + - pyarrow=0.15.1 - pytables - python-dateutil==2.7.3 - pytz diff --git a/doc/source/user_guide/enhancingperf.rst b/doc/source/user_guide/enhancingperf.rst index 42621c032416d..b474989f43731 100644 --- a/doc/source/user_guide/enhancingperf.rst +++ b/doc/source/user_guide/enhancingperf.rst @@ -199,8 +199,8 @@ in Python, so maybe we could minimize these by cythonizing the apply part. ...: return s * dx ...: cpdef np.ndarray[double] apply_integrate_f(np.ndarray col_a, np.ndarray col_b, ...: np.ndarray col_N): - ...: assert (col_a.dtype == np.float - ...: and col_b.dtype == np.float and col_N.dtype == np.int) + ...: assert (col_a.dtype == np.float_ + ...: and col_b.dtype == np.float_ and col_N.dtype == np.int_) ...: cdef Py_ssize_t i, n = len(col_N) ...: assert (len(col_a) == len(col_b) == n) ...: cdef np.ndarray[double] res = np.empty(n) diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst index a78af82ba4db8..b6d686ee2551f 100644 --- a/doc/source/user_guide/io.rst +++ b/doc/source/user_guide/io.rst @@ -2834,7 +2834,7 @@ See the :ref:`cookbook` for some advanced strategies. The `xlrd `__ package is now only for reading old-style ``.xls`` files. - Previously, the default argument ``engine=None`` to :func:`~pandas.read_excel` + Before pandas 1.2.0, the default argument ``engine=None`` to :func:`~pandas.read_excel` would result in using the ``xlrd`` engine in many cases, including new Excel 2007+ (``.xlsx``) files. If `openpyxl `__ is installed, diff --git a/doc/source/whatsnew/index.rst b/doc/source/whatsnew/index.rst index 55e3971502c0a..c37255c765171 100644 --- a/doc/source/whatsnew/index.rst +++ b/doc/source/whatsnew/index.rst @@ -16,6 +16,7 @@ Version 1.2 .. toctree:: :maxdepth: 2 + v1.2.2 v1.2.1 v1.2.0 diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index dfd23309faaef..8a935f269e1a6 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -518,7 +518,7 @@ Deprecations - Deprecated parameter ``dtype`` of method :meth:`~Index.copy` for all :class:`Index` subclasses. Use the :meth:`~Index.astype` method instead for changing dtype (:issue:`35853`) - Deprecated parameters ``levels`` and ``codes`` in :meth:`MultiIndex.copy`. Use the :meth:`~MultiIndex.set_levels` and :meth:`~MultiIndex.set_codes` methods instead (:issue:`36685`) - Date parser functions :func:`~pandas.io.date_converters.parse_date_time`, :func:`~pandas.io.date_converters.parse_date_fields`, :func:`~pandas.io.date_converters.parse_all_fields` and :func:`~pandas.io.date_converters.generic_parser` from ``pandas.io.date_converters`` are deprecated and will be removed in a future version; use :func:`to_datetime` instead (:issue:`35741`) -- :meth:`DataFrame.lookup` is deprecated and will be removed in a future version, use :meth:`DataFrame.melt` and :meth:`DataFrame.loc` instead (:issue:`18682`) +- :meth:`DataFrame.lookup` is deprecated and will be removed in a future version, use :meth:`DataFrame.melt` and :meth:`DataFrame.loc` instead (:issue:`35224`) - The method :meth:`Index.to_native_types` is deprecated. Use ``.astype(str)`` instead (:issue:`28867`) - Deprecated indexing :class:`DataFrame` rows with a single datetime-like string as ``df[string]`` (given the ambiguity whether it is indexing the rows or selecting a column), use ``df.loc[string]`` instead (:issue:`36179`) - Deprecated :meth:`Index.is_all_dates` (:issue:`27744`) @@ -746,7 +746,7 @@ I/O - Parse missing values using :func:`read_json` with ``dtype=False`` to ``NaN`` instead of ``None`` (:issue:`28501`) - :meth:`read_fwf` was inferring compression with ``compression=None`` which was not consistent with the other ``read_*`` functions (:issue:`37909`) - :meth:`DataFrame.to_html` was ignoring ``formatters`` argument for ``ExtensionDtype`` columns (:issue:`36525`) -- Bumped minimum xarray version to 0.12.3 to avoid reference to the removed ``Panel`` class (:issue:`27101`) +- Bumped minimum xarray version to 0.12.3 to avoid reference to the removed ``Panel`` class (:issue:`27101`, :issue:`37983`) - :meth:`DataFrame.to_csv` was re-opening file-like handles that also implement ``os.PathLike`` (:issue:`38125`) - Bug in the conversion of a sliced ``pyarrow.Table`` with missing values to a DataFrame (:issue:`38525`) - Bug in :func:`read_sql_table` raising a ``sqlalchemy.exc.OperationalError`` when column names contained a percentage sign (:issue:`37517`) diff --git a/doc/source/whatsnew/v1.2.1.rst b/doc/source/whatsnew/v1.2.1.rst index 474970601022c..8bfe233ae50cc 100644 --- a/doc/source/whatsnew/v1.2.1.rst +++ b/doc/source/whatsnew/v1.2.1.rst @@ -144,4 +144,4 @@ Other Contributors ~~~~~~~~~~~~ -.. contributors:: v1.2.0..v1.2.1|HEAD +.. contributors:: v1.2.0..v1.2.1 diff --git a/doc/source/whatsnew/v1.2.2.rst b/doc/source/whatsnew/v1.2.2.rst new file mode 100644 index 0000000000000..fa41a83a5e5fb --- /dev/null +++ b/doc/source/whatsnew/v1.2.2.rst @@ -0,0 +1,49 @@ +.. _whatsnew_122: + +What's new in 1.2.2 (February 09, 2021) +--------------------------------------- + +These are the changes in pandas 1.2.2. See :ref:`release` for a full changelog +including other versions of pandas. + +{{ header }} + +.. --------------------------------------------------------------------------- + +.. _whatsnew_122.regressions: + +Fixed regressions +~~~~~~~~~~~~~~~~~ + +- Fixed regression in :func:`read_excel` that caused it to raise ``AttributeError`` when checking version of older xlrd versions (:issue:`38955`) +- Fixed regression in :class:`DataFrame` constructor reordering element when construction from datetime ndarray with dtype not ``"datetime64[ns]"`` (:issue:`39422`) +- Fixed regression in :meth:`DataFrame.astype` and :meth:`Series.astype` not casting to bytes dtype (:issue:`39474`) +- Fixed regression in :meth:`~DataFrame.to_pickle` failing to create bz2/xz compressed pickle files with ``protocol=5`` (:issue:`39002`) +- Fixed regression in :func:`pandas.testing.assert_series_equal` and :func:`pandas.testing.assert_frame_equal` always raising ``AssertionError`` when comparing extension dtypes (:issue:`39410`) +- Fixed regression in :meth:`~DataFrame.to_csv` opening ``codecs.StreamWriter`` in binary mode instead of in text mode and ignoring user-provided ``mode`` (:issue:`39247`) +- Fixed regression in :meth:`Categorical.astype` casting to incorrect dtype when ``np.int32`` is passed to dtype argument (:issue:`39402`) +- Fixed regression in :meth:`~DataFrame.to_excel` creating corrupt files when appending (``mode="a"``) to an existing file (:issue:`39576`) +- Fixed regression in :meth:`DataFrame.transform` failing in case of an empty DataFrame or Series (:issue:`39636`) +- Fixed regression in :meth:`~DataFrame.groupby` or :meth:`~DataFrame.resample` when aggregating an all-NaN or numeric object dtype column (:issue:`39329`) +- Fixed regression in :meth:`.Rolling.count` where the ``min_periods`` argument would be set to ``0`` after the operation (:issue:`39554`) +- Fixed regression in :func:`read_excel` that incorrectly raised when the argument ``io`` was a non-path and non-buffer and the ``engine`` argument was specified (:issue:`39528`) + +.. --------------------------------------------------------------------------- + +.. _whatsnew_122.bug_fixes: + +Bug fixes +~~~~~~~~~ + +- :func:`pandas.read_excel` error message when a specified ``sheetname`` does not exist is now uniform across engines (:issue:`39250`) +- Fixed bug in :func:`pandas.read_excel` producing incorrect results when the engine ``openpyxl`` is used and the excel file is missing or has incorrect dimension information; the fix requires ``openpyxl`` >= 3.0.0, prior versions may still fail (:issue:`38956`, :issue:`39001`) +- Fixed bug in :func:`pandas.read_excel` sometimes producing a ``DataFrame`` with trailing rows of ``np.nan`` when the engine ``openpyxl`` is used (:issue:`39181`) + +.. --------------------------------------------------------------------------- + +.. _whatsnew_122.contributors: + +Contributors +~~~~~~~~~~~~ + +.. contributors:: v1.2.1..v1.2.2|HEAD diff --git a/environment.yml b/environment.yml index 6f3f81d8a4d77..319261ec6640c 100644 --- a/environment.yml +++ b/environment.yml @@ -113,5 +113,5 @@ dependencies: - tabulate>=0.8.3 # DataFrame.to_markdown - natsort # DataFrame.sort_values - pip: - - git+https://github.com/pandas-dev/pydata-sphinx-theme.git@master + - git+https://github.com/pandas-dev/pydata-sphinx-theme.git@2488b7defbd3d753dd5fcfc890fc4a7e79d25103 - git+https://github.com/numpy/numpydoc diff --git a/pandas/_libs/tslibs/conversion.pyx b/pandas/_libs/tslibs/conversion.pyx index 3b52b4d499694..07507d5f5d2d4 100644 --- a/pandas/_libs/tslibs/conversion.pyx +++ b/pandas/_libs/tslibs/conversion.pyx @@ -224,7 +224,7 @@ def ensure_datetime64ns(arr: ndarray, copy: bool=True): ivalues = arr.view(np.int64).ravel("K") - result = np.empty(shape, dtype=DT64NS_DTYPE) + result = np.empty_like(arr, dtype=DT64NS_DTYPE) iresult = result.ravel("K").view(np.int64) if len(iresult) == 0: diff --git a/pandas/_testing.py b/pandas/_testing.py index 224c8d540c6bb..1df3351a7241c 100644 --- a/pandas/_testing.py +++ b/pandas/_testing.py @@ -1402,14 +1402,26 @@ def assert_series_equal( assert_attr_equal("dtype", left, right, obj=f"Attributes of {obj}") if check_exact and is_numeric_dtype(left.dtype) and is_numeric_dtype(right.dtype): + left_values = left._values + right_values = right._values # Only check exact if dtype is numeric - assert_numpy_array_equal( - left._values, - right._values, - check_dtype=check_dtype, - obj=str(obj), - index_values=np.asarray(left.index), - ) + if is_extension_array_dtype(left_values) and is_extension_array_dtype( + right_values + ): + assert_extension_array_equal( + left_values, + right_values, + check_dtype=check_dtype, + index_values=np.asarray(left.index), + ) + else: + assert_numpy_array_equal( + left_values, + right_values, + check_dtype=check_dtype, + obj=str(obj), + index_values=np.asarray(left.index), + ) elif check_datetimelike_compat and ( needs_i8_conversion(left.dtype) or needs_i8_conversion(right.dtype) ): diff --git a/pandas/compat/_optional.py b/pandas/compat/_optional.py index 533e67acfa2f4..2bde42357b96c 100644 --- a/pandas/compat/_optional.py +++ b/pandas/compat/_optional.py @@ -15,7 +15,7 @@ "matplotlib": "2.2.3", "numexpr": "2.6.8", "odfpy": "1.3.0", - "openpyxl": "2.5.7", + "openpyxl": "2.6.0", "pandas_gbq": "0.12.0", "pyarrow": "0.15.0", "pytest": "5.0.1", @@ -46,7 +46,7 @@ } -def _get_version(module: types.ModuleType) -> str: +def get_version(module: types.ModuleType) -> str: version = getattr(module, "__version__", None) if version is None: # xlrd uses a capitalized attribute name @@ -112,7 +112,7 @@ def import_optional_dependency( minimum_version = VERSIONS.get(name) if minimum_version: - version = _get_version(module) + version = get_version(module) if distutils.version.LooseVersion(version) < minimum_version: assert on_version in {"warn", "raise", "ignore"} msg = ( diff --git a/pandas/conftest.py b/pandas/conftest.py index d84a72d4cc7a8..6b26f855c36f0 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -1188,6 +1188,32 @@ def any_nullable_int_dtype(request): return request.param +@pytest.fixture(params=tm.ALL_INT_DTYPES + tm.ALL_EA_INT_DTYPES) +def any_int_or_nullable_int_dtype(request): + """ + Parameterized fixture for any nullable integer dtype. + + * int + * 'int8' + * 'uint8' + * 'int16' + * 'uint16' + * 'int32' + * 'uint32' + * 'int64' + * 'uint64' + * 'UInt8' + * 'Int8' + * 'UInt16' + * 'Int16' + * 'UInt32' + * 'Int32' + * 'UInt64' + * 'Int64' + """ + return request.param + + @pytest.fixture(params=tm.ALL_EA_INT_DTYPES + tm.FLOAT_EA_DTYPES) def any_numeric_dtype(request): """ diff --git a/pandas/core/aggregation.py b/pandas/core/aggregation.py index c64f0bd71cf84..74f21bae39ba9 100644 --- a/pandas/core/aggregation.py +++ b/pandas/core/aggregation.py @@ -456,7 +456,7 @@ def transform( # Functions that transform may return empty Series/DataFrame # when the dtype is not appropriate - if isinstance(result, (ABCSeries, ABCDataFrame)) and result.empty: + if isinstance(result, (ABCSeries, ABCDataFrame)) and result.empty and not obj.empty: raise ValueError("Transform function failed") if not isinstance(result, (ABCSeries, ABCDataFrame)) or not result.index.equals( obj.index diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 3995e7b251184..d5819697066ef 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -427,7 +427,8 @@ def astype(self, dtype: Dtype, copy: bool = True) -> ArrayLike: else: # GH8628 (PERF): astype category codes instead of astyping array try: - astyped_cats = self.categories.astype(dtype=dtype, copy=copy) + new_cats = np.asarray(self.categories) + new_cats = new_cats.astype(dtype=dtype, copy=copy) except ( TypeError, # downstream error msg for CategoricalIndex is misleading ValueError, @@ -435,8 +436,7 @@ def astype(self, dtype: Dtype, copy: bool = True) -> ArrayLike: msg = f"Cannot cast {self.categories.dtype} dtype to {dtype}" raise ValueError(msg) - astyped_cats = extract_array(astyped_cats, extract_numpy=True) - result = take_1d(astyped_cats, libalgos.ensure_platform_int(self._codes)) + result = take_1d(new_cats, libalgos.ensure_platform_int(self._codes)) return result diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index c77991ced3907..1338d22f74347 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -1137,7 +1137,7 @@ def soft_convert_objects( # bound of nanosecond-resolution 64-bit integers. try: values = lib.maybe_convert_objects(values, convert_datetime=True) - except OutOfBoundsDatetime: + except (OutOfBoundsDatetime, ValueError): pass if timedelta and is_object_dtype(values.dtype): diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 07ffb881495fa..875522c52be24 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -1078,11 +1078,18 @@ def py_fallback(bvalues: ArrayLike) -> ArrayLike: # in the operation. We un-split here. result = result._consolidate() assert isinstance(result, (Series, DataFrame)) # for mypy - assert len(result._mgr.blocks) == 1 + mgr = result._mgr + assert isinstance(mgr, BlockManager) # unwrap DataFrame to get array - result = result._mgr.blocks[0].values - return result + if len(mgr.blocks) != 1: + # We've split an object block! Everything we've assumed + # about a single block input returning a single block output + # is a lie. See eg GH-39329 + return mgr.as_array() + else: + result = mgr.blocks[0].values + return result def blk_func(bvalues: ArrayLike) -> ArrayLike: diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index a9d93f473e0e1..2ef9e4a028793 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -1286,16 +1286,18 @@ def _format_native_types(self, na_rep="nan", **kwargs): # go through the levels and format them for level, level_codes in zip(self.levels, self.codes): - level = level._format_native_types(na_rep=na_rep, **kwargs) + level_strs = level._format_native_types(na_rep=na_rep, **kwargs) # add nan values, if there are any mask = level_codes == -1 if mask.any(): - nan_index = len(level) - level = np.append(level, na_rep) + nan_index = len(level_strs) + # numpy 1.21 deprecated implicit string casting + level_strs = level_strs.astype(str) + level_strs = np.append(level_strs, na_rep) assert not level_codes.flags.writeable # i.e. copy is needed level_codes = level_codes.copy() # make writeable level_codes[mask] = nan_index - new_levels.append(level) + new_levels.append(level_strs) new_codes.append(level_codes) if len(new_levels) == 1: diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 32aade97c8736..99218cebc37e1 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -2503,7 +2503,7 @@ class ObjectBlock(Block): _can_hold_na = True def _maybe_coerce_values(self, values): - if issubclass(values.dtype.type, (str, bytes)): + if issubclass(values.dtype.type, str): values = np.array(values, dtype=object) return values diff --git a/pandas/core/series.py b/pandas/core/series.py index b4e8696ad9e13..4b0d5f0b407be 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -169,8 +169,8 @@ class Series(base.IndexOpsMixin, generic.NDFrame): Values must be hashable and have the same length as `data`. Non-unique index values are allowed. Will default to RangeIndex (0, 1, 2, ..., n) if not provided. If data is dict-like - and index is None, then the values in the index are used to - reindex the Series after it is created using the keys in the data. + and index is None, then the keys in the data are used as the index. If the + index is not None, the resulting Series is reindexed with the index values. dtype : str, numpy.dtype, or ExtensionDtype, optional Data type for the output Series. If not specified, this will be inferred from `data`. @@ -179,6 +179,33 @@ class Series(base.IndexOpsMixin, generic.NDFrame): The name to give to the Series. copy : bool, default False Copy input data. + + Examples + -------- + Constructing Series from a dictionary with an Index specified + + >>> d = {'a': 1, 'b': 2, 'c': 3} + >>> ser = pd.Series(data=d, index=['a', 'b', 'c']) + >>> ser + a 1 + b 2 + c 3 + dtype: int64 + + The keys of the dictionary match with the Index values, hence the Index + values have no effect. + + >>> d = {'a': 1, 'b': 2, 'c': 3} + >>> ser = pd.Series(data=d, index=['x', 'y', 'z']) + >>> ser + x NaN + y NaN + z NaN + dtype: float64 + + Note that the Index is first build with the keys from the dictionary. + After this the Series is reindexed with the given Index values, hence we + get all NaN as a result. """ _typ = "series" diff --git a/pandas/core/window/rolling.py b/pandas/core/window/rolling.py index e6185f8ae0679..3a3df1a15dbcc 100644 --- a/pandas/core/window/rolling.py +++ b/pandas/core/window/rolling.py @@ -2016,7 +2016,11 @@ def count(self): FutureWarning, ) self.min_periods = 0 - return super().count() + result = super().count() + self.min_periods = None + else: + result = super().count() + return result @Substitution(name="rolling") @Appender(_shared_docs["apply"]) diff --git a/pandas/io/common.py b/pandas/io/common.py index 90622ef0c0f2c..be353fefdd1ef 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -845,12 +845,15 @@ def file_exists(filepath_or_buffer: FilePathOrBuffer) -> bool: def _is_binary_mode(handle: FilePathOrBuffer, mode: str) -> bool: """Whether the handle is opened in binary mode""" + # specified by user + if "t" in mode or "b" in mode: + return "b" in mode + # classes that expect string but have 'b' in mode - text_classes = (codecs.StreamReaderWriter,) - if isinstance(handle, text_classes): + text_classes = (codecs.StreamWriter, codecs.StreamReader, codecs.StreamReaderWriter) + if issubclass(type(handle), text_classes): return False # classes that expect bytes binary_classes = (BufferedIOBase, RawIOBase) - return isinstance(handle, binary_classes) or "b" in getattr(handle, "mode", mode) diff --git a/pandas/io/excel/_base.py b/pandas/io/excel/_base.py index 5be8dbf152309..850570fc743b7 100644 --- a/pandas/io/excel/_base.py +++ b/pandas/io/excel/_base.py @@ -13,7 +13,7 @@ from pandas._libs.parsers import STR_NA_VALUES from pandas._typing import Buffer, FilePathOrBuffer, StorageOptions -from pandas.compat._optional import import_optional_dependency +from pandas.compat._optional import get_version, import_optional_dependency from pandas.errors import EmptyDataError from pandas.util._decorators import Appender, deprecate_nonkeyword_arguments, doc @@ -425,6 +425,17 @@ def get_sheet_by_index(self, index): def get_sheet_data(self, sheet, convert_float): pass + def raise_if_bad_sheet_by_index(self, index: int) -> None: + n_sheets = len(self.sheet_names) + if index >= n_sheets: + raise ValueError( + f"Worksheet index {index} is invalid, {n_sheets} worksheets found" + ) + + def raise_if_bad_sheet_by_name(self, name: str) -> None: + if name not in self.sheet_names: + raise ValueError(f"Worksheet named '{name}' not found") + def parse( self, sheet_name=0, @@ -1049,16 +1060,18 @@ def __init__( else: import xlrd - xlrd_version = LooseVersion(xlrd.__version__) - - if xlrd_version is not None and isinstance(path_or_buffer, xlrd.Book): - ext = "xls" - else: - ext = inspect_excel_format( - content=path_or_buffer, storage_options=storage_options - ) + xlrd_version = LooseVersion(get_version(xlrd)) + ext = None if engine is None: + # Only determine ext if it is needed + if xlrd_version is not None and isinstance(path_or_buffer, xlrd.Book): + ext = "xls" + else: + ext = inspect_excel_format( + content=path_or_buffer, storage_options=storage_options + ) + if ext == "ods": engine = "odf" elif ext == "xls": @@ -1075,13 +1088,22 @@ def __init__( else: engine = "xlrd" - if engine == "xlrd" and ext != "xls" and xlrd_version is not None: - if xlrd_version >= "2": + if engine == "xlrd" and xlrd_version is not None: + if ext is None: + # Need ext to determine ext in order to raise/warn + if isinstance(path_or_buffer, xlrd.Book): + ext = "xls" + else: + ext = inspect_excel_format( + content=path_or_buffer, storage_options=storage_options + ) + + if ext != "xls" and xlrd_version >= "2": raise ValueError( f"Your version of xlrd is {xlrd_version}. In xlrd >= 2.0, " f"only the xls format is supported. Install openpyxl instead." ) - else: + elif ext != "xls": caller = inspect.stack()[1] if ( caller.filename.endswith( diff --git a/pandas/io/excel/_odfreader.py b/pandas/io/excel/_odfreader.py index c5c3927216850..8987d5bb42057 100644 --- a/pandas/io/excel/_odfreader.py +++ b/pandas/io/excel/_odfreader.py @@ -57,12 +57,14 @@ def sheet_names(self) -> List[str]: def get_sheet_by_index(self, index: int): from odf.table import Table + self.raise_if_bad_sheet_by_index(index) tables = self.book.getElementsByType(Table) return tables[index] def get_sheet_by_name(self, name: str): from odf.table import Table + self.raise_if_bad_sheet_by_name(name) tables = self.book.getElementsByType(Table) for table in tables: diff --git a/pandas/io/excel/_openpyxl.py b/pandas/io/excel/_openpyxl.py index 7de958df206d5..be1587dbc010c 100644 --- a/pandas/io/excel/_openpyxl.py +++ b/pandas/io/excel/_openpyxl.py @@ -1,9 +1,11 @@ +from distutils.version import LooseVersion +import mmap from typing import TYPE_CHECKING, Dict, List, Optional import numpy as np from pandas._typing import FilePathOrBuffer, Scalar, StorageOptions -from pandas.compat._optional import import_optional_dependency +from pandas.compat._optional import get_version, import_optional_dependency from pandas.io.excel._base import BaseExcelReader, ExcelWriter from pandas.io.excel._util import validate_freeze_panes @@ -37,6 +39,7 @@ def __init__( from openpyxl import load_workbook self.book = load_workbook(self.handles.handle) + self.handles.handle.seek(0) else: # Create workbook object with default optimized_write=True. self.book = Workbook() @@ -49,6 +52,9 @@ def save(self): Save workbook to disk. """ self.book.save(self.handles.handle) + if "r+" in self.mode and not isinstance(self.handles.handle, mmap.mmap): + # truncate file to the written content + self.handles.handle.truncate() @classmethod def _convert_to_style_kwargs(cls, style_dict: dict) -> Dict[str, "Serialisable"]: @@ -492,23 +498,25 @@ def sheet_names(self) -> List[str]: return self.book.sheetnames def get_sheet_by_name(self, name: str): + self.raise_if_bad_sheet_by_name(name) return self.book[name] def get_sheet_by_index(self, index: int): + self.raise_if_bad_sheet_by_index(index) return self.book.worksheets[index] def _convert_cell(self, cell, convert_float: bool) -> Scalar: from openpyxl.cell.cell import TYPE_BOOL, TYPE_ERROR, TYPE_NUMERIC - if cell.is_date: + if cell.value is None: + return "" # compat with xlrd + elif cell.is_date: return cell.value elif cell.data_type == TYPE_ERROR: return np.nan elif cell.data_type == TYPE_BOOL: return bool(cell.value) - elif cell.value is None: - return "" # compat with xlrd elif cell.data_type == TYPE_NUMERIC: # GH5394 if convert_float: @@ -521,8 +529,39 @@ def _convert_cell(self, cell, convert_float: bool) -> Scalar: return cell.value def get_sheet_data(self, sheet, convert_float: bool) -> List[List[Scalar]]: + # GH 39001 + # Reading of excel file depends on dimension data being correct but + # writers sometimes omit or get it wrong + import openpyxl + + version = LooseVersion(get_version(openpyxl)) + + # There is no good way of determining if a sheet is read-only + # https://foss.heptapod.net/openpyxl/openpyxl/-/issues/1605 + is_readonly = hasattr(sheet, "reset_dimensions") + + if version >= "3.0.0" and is_readonly: + sheet.reset_dimensions() + data: List[List[Scalar]] = [] - for row in sheet.rows: - data.append([self._convert_cell(cell, convert_float) for cell in row]) + last_row_with_data = -1 + for row_number, row in enumerate(sheet.rows): + converted_row = [self._convert_cell(cell, convert_float) for cell in row] + if not all(cell == "" for cell in converted_row): + last_row_with_data = row_number + data.append(converted_row) + + # Trim trailing empty rows + data = data[: last_row_with_data + 1] + + if version >= "3.0.0" and is_readonly and len(data) > 0: + # With dimension reset, openpyxl no longer pads rows + max_width = max(len(data_row) for data_row in data) + if min(len(data_row) for data_row in data) < max_width: + empty_cell: List[Scalar] = [""] + data = [ + data_row + (max_width - len(data_row)) * empty_cell + for data_row in data + ] return data diff --git a/pandas/io/excel/_pyxlsb.py b/pandas/io/excel/_pyxlsb.py index de4f7bba1a179..f77a6bd5b1ad5 100644 --- a/pandas/io/excel/_pyxlsb.py +++ b/pandas/io/excel/_pyxlsb.py @@ -47,9 +47,11 @@ def sheet_names(self) -> List[str]: return self.book.sheets def get_sheet_by_name(self, name: str): + self.raise_if_bad_sheet_by_name(name) return self.book.get_sheet(name) def get_sheet_by_index(self, index: int): + self.raise_if_bad_sheet_by_index(index) # pyxlsb sheets are indexed from 1 onwards # There's a fix for this in the source, but the pypi package doesn't have it return self.book.get_sheet(index + 1) diff --git a/pandas/io/excel/_xlrd.py b/pandas/io/excel/_xlrd.py index c655db4bc772b..5eb88a694218a 100644 --- a/pandas/io/excel/_xlrd.py +++ b/pandas/io/excel/_xlrd.py @@ -44,9 +44,11 @@ def sheet_names(self): return self.book.sheet_names() def get_sheet_by_name(self, name): + self.raise_if_bad_sheet_by_name(name) return self.book.sheet_by_name(name) def get_sheet_by_index(self, index): + self.raise_if_bad_sheet_by_index(index) return self.book.sheet_by_index(index) def get_sheet_data(self, sheet, convert_float): diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index d99abbea90a51..8ad86fd0a0dce 100644 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -325,6 +325,11 @@ Encoding to use for UTF when reading/writing (ex. 'utf-8'). `List of Python standard encodings `_ . + .. versionchanged:: 1.2 + + When ``encoding`` is ``None``, ``errors="replace"`` is passed to + ``open()``. Otherwise, ``errors="strict"`` is passed to ``open()``. + This behavior was previously only the case for ``engine="python"``. dialect : str or csv.Dialect, optional If provided, this parameter will override values (default or not) for the following parameters: `delimiter`, `doublequote`, `escapechar`, diff --git a/pandas/io/pickle.py b/pandas/io/pickle.py index a5507259b7b6a..2dcbaf38fa51a 100644 --- a/pandas/io/pickle.py +++ b/pandas/io/pickle.py @@ -94,7 +94,19 @@ def to_pickle( is_text=False, storage_options=storage_options, ) as handles: - pickle.dump(obj, handles.handle, protocol=protocol) # type: ignore[arg-type] + if handles.compression["method"] in ("bz2", "xz") and protocol >= 5: + # some weird TypeError GH#39002 with pickle 5: fallback to letting + # pickle create the entire object and then write it to the buffer. + # "zip" would also be here if pandas.io.common._BytesZipFile + # wouldn't buffer write calls + handles.handle.write( + pickle.dumps(obj, protocol=protocol) # type: ignore[arg-type] + ) + else: + # letting pickle write directly to the buffer is more memory-efficient + pickle.dump( + obj, handles.handle, protocol=protocol # type: ignore[arg-type] + ) @doc(storage_options=generic._shared_docs["storage_options"]) diff --git a/pandas/tests/arrays/categorical/test_dtypes.py b/pandas/tests/arrays/categorical/test_dtypes.py index 12654388de904..bd778fb540778 100644 --- a/pandas/tests/arrays/categorical/test_dtypes.py +++ b/pandas/tests/arrays/categorical/test_dtypes.py @@ -138,7 +138,7 @@ def test_astype(self, ordered): tm.assert_numpy_array_equal(result, expected) result = cat.astype(int) - expected = np.array(cat, dtype="int64") + expected = np.array(cat, dtype="int") tm.assert_numpy_array_equal(result, expected) result = cat.astype(float) diff --git a/pandas/tests/arrays/categorical/test_warnings.py b/pandas/tests/arrays/categorical/test_warnings.py index 9e164a250cdb1..6ba3347796e08 100644 --- a/pandas/tests/arrays/categorical/test_warnings.py +++ b/pandas/tests/arrays/categorical/test_warnings.py @@ -15,15 +15,8 @@ async def test_tab_complete_warning(self, ip): code = "import pandas as pd; c = Categorical([])" await ip.run_code(code) - # GH 31324 newer jedi version raises Deprecation warning - import jedi - - if jedi.__version__ < "0.16.0": - warning = tm.assert_produces_warning(None) - else: - warning = tm.assert_produces_warning( - DeprecationWarning, check_stacklevel=False - ) - with warning: + # GH 31324 newer jedi version raises Deprecation warning; + # appears resolved 2021-02-02 + with tm.assert_produces_warning(None): with provisionalcompleter("ignore"): list(ip.Completer.completions("c.", 1)) diff --git a/pandas/tests/frame/apply/test_frame_transform.py b/pandas/tests/frame/apply/test_frame_transform.py index db5b2f3d86dfe..d3a3b1482affd 100644 --- a/pandas/tests/frame/apply/test_frame_transform.py +++ b/pandas/tests/frame/apply/test_frame_transform.py @@ -258,3 +258,13 @@ def test_transform_missing_columns(axis): match = re.escape("Column(s) ['C'] do not exist") with pytest.raises(SpecificationError, match=match): df.transform({"C": "cumsum"}) + + +def test_transform_empty_dataframe(): + # https://github.com/pandas-dev/pandas/issues/39636 + df = DataFrame([], columns=["col1", "col2"]) + result = df.transform(lambda x: x + 10) + tm.assert_frame_equal(result, df) + + result = df["col1"].transform(lambda x: x + 10) + tm.assert_series_equal(result, df["col1"]) diff --git a/pandas/tests/frame/methods/test_astype.py b/pandas/tests/frame/methods/test_astype.py index d79969eac0323..5264d4b432d34 100644 --- a/pandas/tests/frame/methods/test_astype.py +++ b/pandas/tests/frame/methods/test_astype.py @@ -611,3 +611,8 @@ def test_astype_tz_object_conversion(self, tz): # do real test: object dtype to a specified tz, different from construction tz. result = result.astype({"tz": "datetime64[ns, Europe/London]"}) tm.assert_frame_equal(result, expected) + + def test_astype_bytes(self): + # GH#39474 + result = DataFrame(["foo", "bar", "baz"]).astype(bytes) + assert result.dtypes[0] == np.dtype("S3") diff --git a/pandas/tests/frame/test_api.py b/pandas/tests/frame/test_api.py index a7e2fa760b7e4..29a2d9c17202e 100644 --- a/pandas/tests/frame/test_api.py +++ b/pandas/tests/frame/test_api.py @@ -274,17 +274,9 @@ async def test_tab_complete_warning(self, ip, frame_or_series): await ip.run_code(code) - # TODO: remove it when Ipython updates - # GH 33567, jedi version raises Deprecation warning in Ipython - import jedi - - if jedi.__version__ < "0.17.0": - warning = tm.assert_produces_warning(None) - else: - warning = tm.assert_produces_warning( - DeprecationWarning, check_stacklevel=False - ) - with warning: + # GH 31324 newer jedi version raises Deprecation warning; + # appears resolved 2021-02-02 + with tm.assert_produces_warning(None): with provisionalcompleter("ignore"): list(ip.Completer.completions("obj.", 1)) diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py index 2300a8937991e..77287b6f1eab5 100644 --- a/pandas/tests/frame/test_constructors.py +++ b/pandas/tests/frame/test_constructors.py @@ -1936,6 +1936,70 @@ def test_constructor_datetimes_with_nulls(self, arr): expected = Series([np.dtype("datetime64[ns]")]) tm.assert_series_equal(result, expected) + @pytest.mark.parametrize("order", ["K", "A", "C", "F"]) + @pytest.mark.parametrize( + "dtype", + [ + "datetime64[M]", + "datetime64[D]", + "datetime64[h]", + "datetime64[m]", + "datetime64[s]", + "datetime64[ms]", + "datetime64[us]", + "datetime64[ns]", + ], + ) + def test_constructor_datetimes_non_ns(self, order, dtype): + na = np.array( + [ + ["2015-01-01", "2015-01-02", "2015-01-03"], + ["2017-01-01", "2017-01-02", "2017-02-03"], + ], + dtype=dtype, + order=order, + ) + df = DataFrame(na) + expected = DataFrame( + [ + ["2015-01-01", "2015-01-02", "2015-01-03"], + ["2017-01-01", "2017-01-02", "2017-02-03"], + ] + ) + expected = expected.astype(dtype=dtype) + tm.assert_frame_equal(df, expected) + + @pytest.mark.parametrize("order", ["K", "A", "C", "F"]) + @pytest.mark.parametrize( + "dtype", + [ + "timedelta64[D]", + "timedelta64[h]", + "timedelta64[m]", + "timedelta64[s]", + "timedelta64[ms]", + "timedelta64[us]", + "timedelta64[ns]", + ], + ) + def test_constructor_timedelta_non_ns(self, order, dtype): + na = np.array( + [ + [np.timedelta64(1, "D"), np.timedelta64(2, "D")], + [np.timedelta64(4, "D"), np.timedelta64(5, "D")], + ], + dtype=dtype, + order=order, + ) + df = DataFrame(na).astype("timedelta64[ns]") + expected = DataFrame( + [ + [Timedelta(1, "D"), Timedelta(2, "D")], + [Timedelta(4, "D"), Timedelta(5, "D")], + ], + ) + tm.assert_frame_equal(df, expected) + def test_constructor_for_list_with_dtypes(self): # test list of lists/ndarrays df = DataFrame([np.arange(5) for x in range(5)]) diff --git a/pandas/tests/groupby/aggregate/test_aggregate.py b/pandas/tests/groupby/aggregate/test_aggregate.py index 073918eda3deb..0858890514cf0 100644 --- a/pandas/tests/groupby/aggregate/test_aggregate.py +++ b/pandas/tests/groupby/aggregate/test_aggregate.py @@ -1175,3 +1175,27 @@ def test_aggregate_datetime_objects(): result = df.groupby("A").B.max() expected = df.set_index("A")["B"] tm.assert_series_equal(result, expected) + + +def test_aggregate_numeric_object_dtype(): + # https://github.com/pandas-dev/pandas/issues/39329 + # simplified case: multiple object columns where one is all-NaN + # -> gets split as the all-NaN is inferred as float + df = DataFrame( + {"key": ["A", "A", "B", "B"], "col1": list("abcd"), "col2": [np.nan] * 4}, + ).astype(object) + result = df.groupby("key").min() + expected = DataFrame( + {"key": ["A", "B"], "col1": ["a", "c"], "col2": [np.nan, np.nan]} + ).set_index("key") + tm.assert_frame_equal(result, expected) + + # same but with numbers + df = DataFrame( + {"key": ["A", "A", "B", "B"], "col1": list("abcd"), "col2": range(4)}, + ).astype(object) + result = df.groupby("key").min() + expected = DataFrame( + {"key": ["A", "B"], "col1": ["a", "c"], "col2": [0, 2]} + ).set_index("key") + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/indexes/test_base.py b/pandas/tests/indexes/test_base.py index 372a1d290bca0..8095943a34f4d 100644 --- a/pandas/tests/indexes/test_base.py +++ b/pandas/tests/indexes/test_base.py @@ -1942,16 +1942,9 @@ async def test_tab_complete_warning(self, ip): code = "import pandas as pd; idx = Index([1, 2])" await ip.run_code(code) - # GH 31324 newer jedi version raises Deprecation warning - import jedi - - if jedi.__version__ < "0.16.0": - warning = tm.assert_produces_warning(None) - else: - warning = tm.assert_produces_warning( - DeprecationWarning, check_stacklevel=False - ) - with warning: + # GH 31324 newer jedi version raises Deprecation warning; + # appears resolved 2021-02-02 + with tm.assert_produces_warning(None): with provisionalcompleter("ignore"): list(ip.Completer.completions("idx.", 4)) diff --git a/pandas/tests/indexes/test_numeric.py b/pandas/tests/indexes/test_numeric.py index ff1632e33c0fb..d12e9465949b4 100644 --- a/pandas/tests/indexes/test_numeric.py +++ b/pandas/tests/indexes/test_numeric.py @@ -204,12 +204,17 @@ def test_constructor_invalid(self): ) with pytest.raises(TypeError, match=msg): Float64Index(0.0) - msg = ( - "String dtype not supported, " - "you may need to explicitly cast to a numeric type" + + # 2021-02-1 we get ValueError in numpy 1.20, but not on all builds + msg = "|".join( + [ + "String dtype not supported, you may need to explicitly cast ", + "could not convert string to float: 'a'", + ] ) - with pytest.raises(TypeError, match=msg): + with pytest.raises((TypeError, ValueError), match=msg): Float64Index(["a", "b", 0.0]) + msg = r"float\(\) argument must be a string or a number, not 'Timestamp'" with pytest.raises(TypeError, match=msg): Float64Index([Timestamp("20130101")]) diff --git a/pandas/tests/indexing/test_loc.py b/pandas/tests/indexing/test_loc.py index 68f12a939e061..11726bc5e31c8 100644 --- a/pandas/tests/indexing/test_loc.py +++ b/pandas/tests/indexing/test_loc.py @@ -7,7 +7,6 @@ import numpy as np import pytest -from pandas.compat.numpy import is_numpy_dev import pandas.util._test_decorators as td import pandas as pd @@ -981,7 +980,6 @@ def test_loc_setitem_empty_append_single_value(self): df.loc[0, "x"] = expected.loc[0, "x"] tm.assert_frame_equal(df, expected) - @pytest.mark.xfail(is_numpy_dev, reason="gh-35481") def test_loc_setitem_empty_append_raises(self): # GH6173, various appends to an empty dataframe @@ -995,7 +993,12 @@ def test_loc_setitem_empty_append_raises(self): with pytest.raises(KeyError, match=msg): df.loc[[0, 1], "x"] = data - msg = "cannot copy sequence with size 2 to array axis with dimension 0" + msg = "|".join( + [ + "cannot copy sequence with size 2 to array axis with dimension 0", + r"could not broadcast input array from shape \(2,\) into shape \(0,\)", + ] + ) with pytest.raises(ValueError, match=msg): df.loc[0:2, "x"] = data diff --git a/pandas/tests/io/data/excel/dimension_large.xlsx b/pandas/tests/io/data/excel/dimension_large.xlsx new file mode 100644 index 0000000000000..d57abdf2fbbae Binary files /dev/null and b/pandas/tests/io/data/excel/dimension_large.xlsx differ diff --git a/pandas/tests/io/data/excel/dimension_missing.xlsx b/pandas/tests/io/data/excel/dimension_missing.xlsx new file mode 100644 index 0000000000000..9274896689a72 Binary files /dev/null and b/pandas/tests/io/data/excel/dimension_missing.xlsx differ diff --git a/pandas/tests/io/data/excel/dimension_small.xlsx b/pandas/tests/io/data/excel/dimension_small.xlsx new file mode 100644 index 0000000000000..78ce4723ebef4 Binary files /dev/null and b/pandas/tests/io/data/excel/dimension_small.xlsx differ diff --git a/pandas/tests/io/data/excel/empty_trailing_rows.xlsx b/pandas/tests/io/data/excel/empty_trailing_rows.xlsx new file mode 100644 index 0000000000000..920b03915a3c8 Binary files /dev/null and b/pandas/tests/io/data/excel/empty_trailing_rows.xlsx differ diff --git a/pandas/tests/io/data/excel/empty_with_blank_row.xlsx b/pandas/tests/io/data/excel/empty_with_blank_row.xlsx new file mode 100644 index 0000000000000..fe3bcfcc269d7 Binary files /dev/null and b/pandas/tests/io/data/excel/empty_with_blank_row.xlsx differ diff --git a/pandas/tests/io/excel/__init__.py b/pandas/tests/io/excel/__init__.py index b7ceb28573484..9dda54915ab1c 100644 --- a/pandas/tests/io/excel/__init__.py +++ b/pandas/tests/io/excel/__init__.py @@ -2,7 +2,7 @@ import pytest -from pandas.compat._optional import import_optional_dependency +from pandas.compat._optional import get_version, import_optional_dependency pytestmark = [ pytest.mark.filterwarnings( @@ -32,4 +32,4 @@ else: import xlrd - xlrd_version = LooseVersion(xlrd.__version__) + xlrd_version = LooseVersion(get_version(xlrd)) diff --git a/pandas/tests/io/excel/test_odf.py b/pandas/tests/io/excel/test_odf.py index d6c6399f082c6..c99d9ae62bf54 100644 --- a/pandas/tests/io/excel/test_odf.py +++ b/pandas/tests/io/excel/test_odf.py @@ -42,5 +42,5 @@ def test_nonexistent_sheetname_raises(read_ext): # GH-27676 # Specifying a non-existent sheet_name parameter should throw an error # with the sheet name. - with pytest.raises(ValueError, match="sheet xyz not found"): + with pytest.raises(ValueError, match="Worksheet named 'xyz' not found"): pd.read_excel("blank.ods", sheet_name="xyz") diff --git a/pandas/tests/io/excel/test_openpyxl.py b/pandas/tests/io/excel/test_openpyxl.py index 3155e22d3ff5d..0962b719efd4d 100644 --- a/pandas/tests/io/excel/test_openpyxl.py +++ b/pandas/tests/io/excel/test_openpyxl.py @@ -1,6 +1,11 @@ +from distutils.version import LooseVersion +from pathlib import Path + import numpy as np import pytest +from pandas.compat._optional import get_version + import pandas as pd from pandas import DataFrame import pandas._testing as tm @@ -116,3 +121,111 @@ def test_to_excel_with_openpyxl_engine(ext): ).highlight_max() styled.to_excel(filename, engine="openpyxl") + + +@pytest.mark.parametrize("read_only", [True, False]) +def test_read_workbook(datapath, ext, read_only): + # GH 39528 + filename = datapath("io", "data", "excel", "test1" + ext) + wb = openpyxl.load_workbook(filename, read_only=read_only) + result = pd.read_excel(wb, engine="openpyxl") + wb.close() + expected = pd.read_excel(filename) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize( + "header, expected_data", + [ + ( + 0, + { + "Title": [np.nan, "A", 1, 2, 3], + "Unnamed: 1": [np.nan, "B", 4, 5, 6], + "Unnamed: 2": [np.nan, "C", 7, 8, 9], + }, + ), + (2, {"A": [1, 2, 3], "B": [4, 5, 6], "C": [7, 8, 9]}), + ], +) +@pytest.mark.parametrize( + "filename", ["dimension_missing", "dimension_small", "dimension_large"] +) +# When read_only is None, use read_excel instead of a workbook +@pytest.mark.parametrize("read_only", [True, False, None]) +def test_read_with_bad_dimension( + datapath, ext, header, expected_data, filename, read_only, request +): + # GH 38956, 39001 - no/incorrect dimension information + version = LooseVersion(get_version(openpyxl)) + if (read_only or read_only is None) and version < "3.0.0": + msg = "openpyxl read-only sheet is incorrect when dimension data is wrong" + request.node.add_marker(pytest.mark.xfail(reason=msg)) + path = datapath("io", "data", "excel", f"{filename}{ext}") + if read_only is None: + result = pd.read_excel(path, header=header) + else: + wb = openpyxl.load_workbook(path, read_only=read_only) + result = pd.read_excel(wb, engine="openpyxl", header=header) + wb.close() + expected = DataFrame(expected_data) + tm.assert_frame_equal(result, expected) + + +def test_append_mode_file(ext): + # GH 39576 + df = DataFrame() + + with tm.ensure_clean(ext) as f: + df.to_excel(f, engine="openpyxl") + + with ExcelWriter(f, mode="a", engine="openpyxl") as writer: + df.to_excel(writer) + + # make sure that zip files are not concatenated by making sure that + # "docProps/app.xml" only occurs twice in the file + data = Path(f).read_bytes() + first = data.find(b"docProps/app.xml") + second = data.find(b"docProps/app.xml", first + 1) + third = data.find(b"docProps/app.xml", second + 1) + assert second != -1 and third == -1 + + +# When read_only is None, use read_excel instead of a workbook +@pytest.mark.parametrize("read_only", [True, False, None]) +def test_read_with_empty_trailing_rows(datapath, ext, read_only, request): + # GH 39181 + version = LooseVersion(get_version(openpyxl)) + if (read_only or read_only is None) and version < "3.0.0": + msg = "openpyxl read-only sheet is incorrect when dimension data is wrong" + request.node.add_marker(pytest.mark.xfail(reason=msg)) + path = datapath("io", "data", "excel", f"empty_trailing_rows{ext}") + if read_only is None: + result = pd.read_excel(path) + else: + wb = openpyxl.load_workbook(path, read_only=read_only) + result = pd.read_excel(wb, engine="openpyxl") + wb.close() + expected = DataFrame( + { + "Title": [np.nan, "A", 1, 2, 3], + "Unnamed: 1": [np.nan, "B", 4, 5, 6], + "Unnamed: 2": [np.nan, "C", 7, 8, 9], + } + ) + tm.assert_frame_equal(result, expected) + + +# When read_only is None, use read_excel instead of a workbook +@pytest.mark.parametrize("read_only", [True, False, None]) +def test_read_empty_with_blank_row(datapath, ext, read_only): + # GH 39547 - empty excel file with a row that has no data + path = datapath("io", "data", "excel", f"empty_with_blank_row{ext}") + if read_only is None: + result = pd.read_excel(path) + else: + wb = openpyxl.load_workbook(path, read_only=read_only) + result = pd.read_excel(wb, engine="openpyxl") + wb.close() + expected = DataFrame() + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/io/excel/test_readers.py b/pandas/tests/io/excel/test_readers.py index 8b1a96f694e71..bd3bfa207c4b0 100644 --- a/pandas/tests/io/excel/test_readers.py +++ b/pandas/tests/io/excel/test_readers.py @@ -2,6 +2,7 @@ from functools import partial import os from urllib.error import URLError +from zipfile import BadZipFile import numpy as np import pytest @@ -622,6 +623,16 @@ def test_bad_engine_raises(self, read_ext): with pytest.raises(ValueError, match="Unknown engine: foo"): pd.read_excel("", engine=bad_engine) + @pytest.mark.parametrize( + "sheet_name", + [3, [0, 3], [3, 0], "Sheet4", ["Sheet1", "Sheet4"], ["Sheet4", "Sheet1"]], + ) + def test_bad_sheetname_raises(self, read_ext, sheet_name): + # GH 39250 + msg = "Worksheet index 3 is invalid|Worksheet named 'Sheet4' not found" + with pytest.raises(ValueError, match=msg): + pd.read_excel("blank" + read_ext, sheet_name=sheet_name) + def test_missing_file_raises(self, read_ext): bad_file = f"foo{read_ext}" # CI tests with zh_CN.utf8, translates to "No such file or directory" @@ -632,7 +643,13 @@ def test_missing_file_raises(self, read_ext): def test_corrupt_bytes_raises(self, read_ext, engine): bad_stream = b"foo" - with pytest.raises(ValueError, match="File is not a recognized excel file"): + if engine is None or engine == "xlrd": + error = ValueError + msg = "File is not a recognized excel file" + else: + error = BadZipFile + msg = "File is not a zip file" + with pytest.raises(error, match=msg): pd.read_excel(bad_stream) @tm.network @@ -1159,6 +1176,17 @@ def test_sheet_name(self, read_ext, df_ref): tm.assert_frame_equal(df1_parse, df_ref, check_names=False) tm.assert_frame_equal(df2_parse, df_ref, check_names=False) + @pytest.mark.parametrize( + "sheet_name", + [3, [0, 3], [3, 0], "Sheet4", ["Sheet1", "Sheet4"], ["Sheet4", "Sheet1"]], + ) + def test_bad_sheetname_raises(self, read_ext, sheet_name): + # GH 39250 + msg = "Worksheet index 3 is invalid|Worksheet named 'Sheet4' not found" + with pytest.raises(ValueError, match=msg): + with pd.ExcelFile("blank" + read_ext) as excel: + excel.parse(sheet_name=sheet_name) + def test_excel_read_buffer(self, engine, read_ext): pth = "test1" + read_ext expected = pd.read_excel(pth, sheet_name="Sheet1", index_col=0, engine=engine) diff --git a/pandas/tests/io/excel/test_writers.py b/pandas/tests/io/excel/test_writers.py index 6a2ac2f6003d7..af0de05965398 100644 --- a/pandas/tests/io/excel/test_writers.py +++ b/pandas/tests/io/excel/test_writers.py @@ -347,19 +347,9 @@ def test_excel_sheet_by_name_raise(self, path, engine): tm.assert_frame_equal(gt, df) - if engine == "odf": - msg = "sheet 0 not found" - with pytest.raises(ValueError, match=msg): - pd.read_excel(xl, "0") - elif engine == "xlwt": - import xlrd - - msg = "No sheet named <'0'>" - with pytest.raises(xlrd.XLRDError, match=msg): - pd.read_excel(xl, sheet_name="0") - else: - with pytest.raises(KeyError, match="Worksheet 0 does not exist."): - pd.read_excel(xl, sheet_name="0") + msg = "Worksheet named '0' not found" + with pytest.raises(ValueError, match=msg): + pd.read_excel(xl, "0") def test_excel_writer_context_manager(self, frame, path): with ExcelWriter(path) as writer: diff --git a/pandas/tests/io/excel/test_xlrd.py b/pandas/tests/io/excel/test_xlrd.py index 2a1114a9570f0..1b4458d0437a1 100644 --- a/pandas/tests/io/excel/test_xlrd.py +++ b/pandas/tests/io/excel/test_xlrd.py @@ -43,9 +43,10 @@ def test_read_xlrd_book(read_ext, frame): # TODO: test for openpyxl as well def test_excel_table_sheet_by_index(datapath, read_ext): path = datapath("io", "data", "excel", f"test1{read_ext}") + msg = "Worksheet named 'invalid_sheet_name' not found" with ExcelFile(path, engine="xlrd") as excel: - with pytest.raises(xlrd.XLRDError): - pd.read_excel(excel, sheet_name="asdf") + with pytest.raises(ValueError, match=msg): + pd.read_excel(excel, sheet_name="invalid_sheet_name") def test_excel_file_warning_with_xlsx_file(datapath): diff --git a/pandas/tests/io/pytables/__init__.py b/pandas/tests/io/pytables/__init__.py index fb4b317a5e977..d3735f8863c3b 100644 --- a/pandas/tests/io/pytables/__init__.py +++ b/pandas/tests/io/pytables/__init__.py @@ -6,4 +6,7 @@ "ignore:a closed node found in the registry:UserWarning" ), pytest.mark.filterwarnings(r"ignore:tostring\(\) is deprecated:DeprecationWarning"), + pytest.mark.filterwarnings( + r"ignore:`np\.object` is a deprecated alias:DeprecationWarning" + ), ] diff --git a/pandas/tests/io/test_common.py b/pandas/tests/io/test_common.py index 80e2b36764ba0..540f12841de1b 100644 --- a/pandas/tests/io/test_common.py +++ b/pandas/tests/io/test_common.py @@ -2,7 +2,7 @@ Tests for the pandas.io.common functionalities """ import codecs -from io import StringIO +from io import BytesIO, StringIO import mmap import os from pathlib import Path @@ -443,3 +443,33 @@ def test_codecs_encoding(encoding, format): else: df = pd.read_json(handle) tm.assert_frame_equal(expected, df) + + +def test_codecs_get_writer_reader(): + # GH39247 + expected = tm.makeDataFrame() + with tm.ensure_clean() as path: + with open(path, "wb") as handle: + with codecs.getwriter("utf-8")(handle) as encoded: + expected.to_csv(encoded) + with open(path, "rb") as handle: + with codecs.getreader("utf-8")(handle) as encoded: + df = pd.read_csv(encoded, index_col=0) + tm.assert_frame_equal(expected, df) + + +@pytest.mark.parametrize( + "io_class,mode,msg", + [ + (BytesIO, "t", "a bytes-like object is required, not 'str'"), + (StringIO, "b", "string argument expected, got 'bytes'"), + ], +) +def test_explicit_encoding(io_class, mode, msg): + # GH39247; this test makes sure that if a user provides mode="*t" or "*b", + # it is used. In the case of this test it leads to an error as intentionally the + # wrong mode is requested + expected = tm.makeDataFrame() + with io_class() as buffer: + with pytest.raises(TypeError, match=msg): + expected.to_csv(buffer, mode=f"w{mode}") diff --git a/pandas/tests/io/test_pickle.py b/pandas/tests/io/test_pickle.py index 34b36e2549b62..24844c4f2eb85 100644 --- a/pandas/tests/io/test_pickle.py +++ b/pandas/tests/io/test_pickle.py @@ -13,6 +13,7 @@ import bz2 import datetime import functools +from functools import partial import glob import gzip import io @@ -588,3 +589,14 @@ def test_pickle_preserves_block_ndim(): # GH#37631 OP issue was about indexing, underlying problem was pickle tm.assert_series_equal(res[[True]], ser) + + +@pytest.mark.parametrize("protocol", [pickle.DEFAULT_PROTOCOL, pickle.HIGHEST_PROTOCOL]) +def test_pickle_big_dataframe_compression(protocol, compression): + # GH#39002 + df = pd.DataFrame(range(100000)) + result = tm.round_trip_pathlib( + partial(df.to_pickle, protocol=protocol, compression=compression), + partial(pd.read_pickle, compression=compression), + ) + tm.assert_frame_equal(df, result) diff --git a/pandas/tests/resample/test_resampler_grouper.py b/pandas/tests/resample/test_resampler_grouper.py index da5bb0eb59f70..5e1a4246bfe12 100644 --- a/pandas/tests/resample/test_resampler_grouper.py +++ b/pandas/tests/resample/test_resampler_grouper.py @@ -31,15 +31,9 @@ async def test_tab_complete_ipython6_warning(ip): ) await ip.run_code(code) - # TODO: remove it when Ipython updates - # GH 33567, jedi version raises Deprecation warning in Ipython - import jedi - - if jedi.__version__ < "0.17.0": - warning = tm.assert_produces_warning(None) - else: - warning = tm.assert_produces_warning(DeprecationWarning, check_stacklevel=False) - with warning: + # GH 31324 newer jedi version raises Deprecation warning; + # appears resolved 2021-02-02 + with tm.assert_produces_warning(None): with provisionalcompleter("ignore"): list(ip.Completer.completions("rs.", 1)) @@ -398,3 +392,34 @@ def test_resample_groupby_agg(): result = resampled.agg({"num": "sum"}) tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("consolidate", [True, False]) +def test_resample_groupby_agg_object_dtype_all_nan(consolidate): + # https://github.com/pandas-dev/pandas/issues/39329 + + dates = pd.date_range("2020-01-01", periods=15, freq="D") + df1 = DataFrame({"key": "A", "date": dates, "col1": range(15), "col_object": "val"}) + df2 = DataFrame({"key": "B", "date": dates, "col1": range(15)}) + df = pd.concat([df1, df2], ignore_index=True) + if consolidate: + df = df._consolidate() + + result = df.groupby(["key"]).resample("W", on="date").min() + idx = pd.MultiIndex.from_arrays( + [ + ["A"] * 3 + ["B"] * 3, + pd.to_datetime(["2020-01-05", "2020-01-12", "2020-01-19"] * 2), + ], + names=["key", "date"], + ) + expected = DataFrame( + { + "key": ["A"] * 3 + ["B"] * 3, + "date": pd.to_datetime(["2020-01-01", "2020-01-06", "2020-01-13"] * 2), + "col1": [0, 5, 12] * 2, + "col_object": ["val"] * 3 + [np.nan] * 3, + }, + index=idx, + ) + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/series/methods/test_astype.py b/pandas/tests/series/methods/test_astype.py index 3cd9d52f8e754..a3624162a08d8 100644 --- a/pandas/tests/series/methods/test_astype.py +++ b/pandas/tests/series/methods/test_astype.py @@ -337,6 +337,11 @@ def test_astype_unicode(self): reload(sys) sys.setdefaultencoding(former_encoding) + def test_astype_bytes(self): + # GH#39474 + result = Series(["foo", "bar", "baz"]).astype(bytes) + assert result.dtypes == np.dtype("S3") + class TestAstypeCategorical: def test_astype_categorical_invalid_conversions(self): diff --git a/pandas/tests/series/test_dtypes.py b/pandas/tests/series/test_dtypes.py index 865ae565b6501..4ebfc7b264d56 100644 --- a/pandas/tests/series/test_dtypes.py +++ b/pandas/tests/series/test_dtypes.py @@ -68,7 +68,7 @@ def test_astype_categorical_to_other(self): exp = Series(["a", "b", "b", "a", "a", "c", "c", "c"]) tm.assert_series_equal(cat.astype("str"), exp) s2 = Series(Categorical(["1", "2", "3", "4"])) - exp2 = Series([1, 2, 3, 4]).astype("int64") + exp2 = Series([1, 2, 3, 4]).astype("int") tm.assert_series_equal(s2.astype("int"), exp2) # object don't sort correctly, so just compare that we have the same @@ -109,6 +109,17 @@ def test_astype_categorical_invalid_conversions(self): with pytest.raises(TypeError, match=msg): ser.astype("object").astype(Categorical) + def test_categorical_astype_to_int(self, any_int_or_nullable_int_dtype): + # GH 39402 + + df = DataFrame(data={"col1": pd.array([2.0, 1.0, 3.0])}) + df.col1 = df.col1.astype("category") + df.col1 = df.col1.astype(any_int_or_nullable_int_dtype) + expected = DataFrame( + {"col1": pd.array([2, 1, 3], dtype=any_int_or_nullable_int_dtype)} + ) + tm.assert_frame_equal(df, expected) + def test_series_to_categorical(self): # see gh-16524: test conversion of Series to Categorical series = Series(["a", "b", "c"]) diff --git a/pandas/tests/tseries/offsets/test_offsets_properties.py b/pandas/tests/tseries/offsets/test_offsets_properties.py index 8d9b54cf3f0df..edb0f8c7dd662 100644 --- a/pandas/tests/tseries/offsets/test_offsets_properties.py +++ b/pandas/tests/tseries/offsets/test_offsets_properties.py @@ -10,6 +10,7 @@ import warnings from hypothesis import assume, given, strategies as st +from hypothesis.errors import Flaky from hypothesis.extra.dateutil import timezones as dateutil_timezones from hypothesis.extra.pytz import timezones as pytz_timezones import pytest @@ -103,6 +104,7 @@ def test_on_offset_implementations(dt, offset): assert offset.is_on_offset(dt) == (compare == dt) +@pytest.mark.xfail(strict=False, raises=Flaky, reason="unreliable test timings") @given(gen_yqm_offset) def test_shift_across_dst(offset): # GH#18319 check that 1) timezone is correctly normalized and diff --git a/pandas/tests/util/test_assert_frame_equal.py b/pandas/tests/util/test_assert_frame_equal.py index bf80a1410e7d9..f8539e9031d28 100644 --- a/pandas/tests/util/test_assert_frame_equal.py +++ b/pandas/tests/util/test_assert_frame_equal.py @@ -305,3 +305,19 @@ def test_assert_frame_equal_columns_mixed_dtype(): # GH#39168 df = DataFrame([[0, 1, 2]], columns=["foo", "bar", 42], index=[1, "test", 2]) tm.assert_frame_equal(df, df, check_like=True) + + +def test_frame_equal_extension_dtype(frame_or_series, any_numeric_dtype): + # GH#39410 + obj = frame_or_series([1, 2], dtype=any_numeric_dtype) + tm.assert_equal(obj, obj, check_exact=True) + + +@pytest.mark.parametrize("indexer", [(0, 1), (1, 0)]) +def test_frame_equal_mixed_dtypes(frame_or_series, any_numeric_dtype, indexer): + dtypes = (any_numeric_dtype, "int64") + obj1 = frame_or_series([1, 2], dtype=dtypes[indexer[0]]) + obj2 = frame_or_series([1, 2], dtype=dtypes[indexer[1]]) + msg = r'(Series|DataFrame.iloc\[:, 0\] \(column name="0"\) classes) are different' + with pytest.raises(AssertionError, match=msg): + tm.assert_equal(obj1, obj2, check_exact=True, check_dtype=False) diff --git a/pandas/tests/window/__init__.py b/pandas/tests/window/__init__.py index e69de29bb2d1d..757bdfe755038 100644 --- a/pandas/tests/window/__init__.py +++ b/pandas/tests/window/__init__.py @@ -0,0 +1,8 @@ +import pytest + +pytestmark = [ + # 2021-02-01 needed until numba updates their usage + pytest.mark.filterwarnings( + r"ignore:`np\.int` is a deprecated alias:DeprecationWarning" + ), +] diff --git a/pandas/tests/window/test_api.py b/pandas/tests/window/test_api.py index 52c629f96b713..0c1d565a9bd5f 100644 --- a/pandas/tests/window/test_api.py +++ b/pandas/tests/window/test_api.py @@ -319,3 +319,17 @@ def test_multiple_agg_funcs(func, window_size, expected_vals): result = window.agg({"low": ["mean", "max"], "high": ["mean", "min"]}) tm.assert_frame_equal(result, expected) + + +@pytest.mark.filterwarnings("ignore:min_periods:FutureWarning") +def test_dont_modify_attributes_after_methods( + arithmetic_win_operators, closed, center, min_periods +): + # GH 39554 + roll_obj = Series(range(1)).rolling( + 1, center=center, closed=closed, min_periods=min_periods + ) + expected = {attr: getattr(roll_obj, attr) for attr in roll_obj._attributes} + getattr(roll_obj, arithmetic_win_operators)() + result = {attr: getattr(roll_obj, attr) for attr in roll_obj._attributes} + assert result == expected diff --git a/pandas/tests/window/test_base_indexer.py b/pandas/tests/window/test_base_indexer.py index 1723330ec40e1..381edb89747b6 100644 --- a/pandas/tests/window/test_base_indexer.py +++ b/pandas/tests/window/test_base_indexer.py @@ -170,7 +170,10 @@ def test_rolling_forward_window(constructor, func, np_func, expected, np_kwargs) # Check that the function output matches applying an alternative function # if min_periods isn't specified - rolling3 = constructor(values).rolling(window=indexer) + # GH 39604: After count-min_periods deprecation, apply(lambda x: len(x)) + # is equivalent to count after setting min_periods=0 + min_periods = 0 if func == "count" else None + rolling3 = constructor(values).rolling(window=indexer, min_periods=min_periods) result3 = getattr(rolling3, func)() expected3 = constructor(rolling3.apply(lambda x: np_func(x, **np_kwargs))) tm.assert_equal(result3, expected3) diff --git a/pandas/util/_print_versions.py b/pandas/util/_print_versions.py index 5256cc29d5543..5b951cab1e3dc 100644 --- a/pandas/util/_print_versions.py +++ b/pandas/util/_print_versions.py @@ -8,7 +8,7 @@ from typing import Dict, Optional, Union from pandas._typing import JSONSerializable -from pandas.compat._optional import VERSIONS, _get_version, import_optional_dependency +from pandas.compat._optional import VERSIONS, get_version, import_optional_dependency def _get_commit_hash() -> Optional[str]: @@ -83,7 +83,7 @@ def _get_dependency_info() -> Dict[str, JSONSerializable]: mod = import_optional_dependency( modname, raise_on_missing=False, on_version="ignore" ) - result[modname] = _get_version(mod) if mod else None + result[modname] = get_version(mod) if mod else None return result diff --git a/requirements-dev.txt b/requirements-dev.txt index f0d65104ead8e..2591dcbf7b1ad 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -76,5 +76,5 @@ cftime pyreadstat tabulate>=0.8.3 natsort -git+https://github.com/pandas-dev/pydata-sphinx-theme.git@master +git+https://github.com/pandas-dev/pydata-sphinx-theme.git@2488b7defbd3d753dd5fcfc890fc4a7e79d25103 git+https://github.com/numpy/numpydoc diff --git a/scripts/validate_unwanted_patterns.py b/scripts/validate_unwanted_patterns.py index 9c58a55cb907e..8f48d518a737b 100755 --- a/scripts/validate_unwanted_patterns.py +++ b/scripts/validate_unwanted_patterns.py @@ -29,7 +29,6 @@ "_doc_template", "_agg_template", "_pipe_template", - "_get_version", "__main__", "_transform_template", "_flex_comp_doc_FRAME",