diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index 85a9cf4ea0f9f..7ff407094a0a5 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -40,6 +40,7 @@ import pandas.tslib as tslib from contextlib import contextmanager +from distutils.version import LooseVersion # versioning attribute _version = '0.10.1' @@ -47,6 +48,7 @@ # PY3 encoding if we don't specify _default_encoding = 'UTF-8' +_np_version_under_172 = LooseVersion(np.__version__) < '1.7.2' def _ensure_decoded(s): """ if we have bytes, decode them to unicde """ @@ -4165,7 +4167,6 @@ def _convert_string_array(data, encoding, itemsize=None): data = np.array(data, dtype="S%d" % itemsize) return data - def _unconvert_string_array(data, nan_rep=None, encoding=None): """ deserialize a string array, possibly decoding """ shape = data.shape @@ -4175,8 +4176,14 @@ def _unconvert_string_array(data, nan_rep=None, encoding=None): # where the passed encoding is actually None) encoding = _ensure_encoding(encoding) if encoding is not None and len(data): + if _np_version_under_172: + itemsize = lib.max_len_string_array(data) + dtype = (str, itemsize) + else: + dtype = str + try: - data = data.astype(string_types).astype(object) + data = data.astype(dtype).astype(object) except: f = np.vectorize(lambda x: x.decode(encoding), otypes=[np.object]) data = f(data) diff --git a/pandas/io/tests/test_pytables.py b/pandas/io/tests/test_pytables.py index dcdd5408c3376..51c254b2d422f 100644 --- a/pandas/io/tests/test_pytables.py +++ b/pandas/io/tests/test_pytables.py @@ -176,6 +176,17 @@ def roundtrip(key, obj,**kwargs): finally: safe_remove(self.path) + def test_long_strings(self): + df = DataFrame({'a': [tm.rands(100) for _ in range(10)]}, + index=[tm.rands(100) for _ in range(10)]) + + with ensure_clean_store(self.path) as store: + store.append('df', df, data_columns=['a']) + + result = store.select('df') + assert_frame_equal(df, result) + + def test_api(self): # GH4584