issues: 1260047355

#6662 · Obscure h5netcdf http serialization issue with python's http.server

state: closed (completed) · author_association: MEMBER · user: 1197350 · comments: 6
created: 2022-06-03T15:28:15Z · updated: 2022-06-04T22:13:05Z · closed: 2022-06-04T22:13:05Z

What is your issue?

In Pangeo Forge, we try to test our ability to read data over http. This often surfaces edge cases involving xarray and fsspec. This is one such edge case. However, it is kind of important, because it affects our ability to reliably test http-based datasets using python's built-in http server.

Here is some code that:

- Creates a tiny dataset on disk
- Serves it over http via `python -m http.server`
- Opens the dataset with fsspec and xarray with the h5netcdf engine
- Pickles the dataset, loads it, and calls `.load()` to load the data into memory

As you can see, this works with a local file but not with the http file, where h5py raises a checksum-related error.

```python
import fsspec
import xarray as xr
from pickle import dumps, loads

ds_orig = xr.tutorial.load_dataset('tiny')

fname = 'tiny.nc'
ds_orig.to_netcdf(fname, engine='netcdf4')

# now start an http server in a terminal in the same working directory:
# $ python -m http.server

def open_pickle_and_reload(path):
    with fsspec.open(path, mode='rb') as fp:
        with xr.open_dataset(fp, engine='h5netcdf') as ds1:
            pass

    # pickle it and reload it
    ds2 = loads(dumps(ds1))
    ds2.load()

open_pickle_and_reload(fname)  # works
url = f'http://127.0.0.1:8000/{fname}'
open_pickle_and_reload(url)  # OSError: Unable to open file (incorrect metadata checksum after all read attempts)
```

Full traceback:

```
---------------------------------------------------------------------------
KeyError                                  Traceback (most recent call last)
~/Code/xarray/xarray/backends/file_manager.py in _acquire_with_cache_info(self, needs_lock)
    198         try:
--> 199             file = self._cache[self._key]
    200         except KeyError:

~/Code/xarray/xarray/backends/lru_cache.py in __getitem__(self, key)
     52         with self._lock:
---> 53             value = self._cache[key]
     54             self._cache.move_to_end(key)

KeyError: [<class 'h5netcdf.core.File'>, (<File-like object HTTPFileSystem, http://127.0.0.1:8000/tiny.nc>,), 'r', (('decode_vlen_strings', True), ('invalid_netcdf', None))]

During handling of the above exception, another exception occurred:

OSError                                   Traceback (most recent call last)
<ipython-input-2-195ac3fcdb43> in <module>
     24 open_pickle_and_reload(fname)  # works
     25 url = f'http://127.0.0.1:8000/{fname}'
---> 26 open_pickle_and_reload(url)  # OSError: Unable to open file (incorrect metadata checksum after all read attempts)

<ipython-input-2-195ac3fcdb43> in open_pickle_and_reload(path)
     20     # pickle it and reload it
     21     ds2 = loads(dumps(ds1))
---> 22     ds2.load()  # works
     23
     24 open_pickle_and_reload(fname)  # works

~/Code/xarray/xarray/core/dataset.py in load(self, **kwargs)
    687         for k, v in self.variables.items():
    688             if k not in lazy_data:
--> 689                 v.load()
    690
    691         return self

~/Code/xarray/xarray/core/variable.py in load(self, **kwargs)
    442             self._data = as_compatible_data(self._data.compute(**kwargs))
    443         elif not is_duck_array(self._data):
--> 444             self._data = np.asarray(self._data)
    445         return self
    446

~/Code/xarray/xarray/core/indexing.py in __array__(self, dtype)
    654
    655     def __array__(self, dtype=None):
--> 656         self._ensure_cached()
    657         return np.asarray(self.array, dtype=dtype)
    658

~/Code/xarray/xarray/core/indexing.py in _ensure_cached(self)
    651     def _ensure_cached(self):
    652         if not isinstance(self.array, NumpyIndexingAdapter):
--> 653             self.array = NumpyIndexingAdapter(np.asarray(self.array))
    654
    655     def __array__(self, dtype=None):

~/Code/xarray/xarray/core/indexing.py in __array__(self, dtype)
    624
    625     def __array__(self, dtype=None):
--> 626         return np.asarray(self.array, dtype=dtype)
    627
    628     def __getitem__(self, key):

~/Code/xarray/xarray/core/indexing.py in __array__(self, dtype)
    525     def __array__(self, dtype=None):
    526         array = as_indexable(self.array)
--> 527         return np.asarray(array[self.key], dtype=None)
    528
    529     def transpose(self, order):

~/Code/xarray/xarray/backends/h5netcdf_.py in __getitem__(self, key)
     49
     50     def __getitem__(self, key):
---> 51         return indexing.explicit_indexing_adapter(
     52             key, self.shape, indexing.IndexingSupport.OUTER_1VECTOR, self._getitem
     53         )

~/Code/xarray/xarray/core/indexing.py in explicit_indexing_adapter(key, shape, indexing_support, raw_indexing_method)
    814     """
    815     raw_key, numpy_indices = decompose_indexer(key, shape, indexing_support)
--> 816     result = raw_indexing_method(raw_key.tuple)
    817     if numpy_indices.tuple:
    818         # index the loaded np.ndarray

~/Code/xarray/xarray/backends/h5netcdf_.py in _getitem(self, key)
     58         key = tuple(list(k) if isinstance(k, np.ndarray) else k for k in key)
     59         with self.datastore.lock:
---> 60             array = self.get_array(needs_lock=False)
     61             return array[key]
     62

~/Code/xarray/xarray/backends/h5netcdf_.py in get_array(self, needs_lock)
     45 class H5NetCDFArrayWrapper(BaseNetCDF4Array):
     46     def get_array(self, needs_lock=True):
---> 47         ds = self.datastore._acquire(needs_lock)
     48         return ds.variables[self.variable_name]
     49

~/Code/xarray/xarray/backends/h5netcdf_.py in _acquire(self, needs_lock)
    180
    181     def _acquire(self, needs_lock=True):
--> 182         with self._manager.acquire_context(needs_lock) as root:
    183             ds = _nc4_require_group(
    184                 root, self._group, self._mode, create_group=_h5netcdf_create_group

/opt/miniconda3/envs/pangeo-forge-recipes/lib/python3.9/contextlib.py in __enter__(self)
    117         del self.args, self.kwds, self.func
    118         try:
--> 119             return next(self.gen)
    120         except StopIteration:
    121             raise RuntimeError("generator didn't yield") from None

~/Code/xarray/xarray/backends/file_manager.py in acquire_context(self, needs_lock)
    185     def acquire_context(self, needs_lock=True):
    186         """Context manager for acquiring a file."""
--> 187         file, cached = self._acquire_with_cache_info(needs_lock)
    188         try:
    189             yield file

~/Code/xarray/xarray/backends/file_manager.py in _acquire_with_cache_info(self, needs_lock)
    203             kwargs = kwargs.copy()
    204             kwargs["mode"] = self._mode
--> 205             file = self._opener(*self._args, **kwargs)
    206             if self._mode == "w":
    207                 # ensure file doesn't get overridden when opened again

/opt/miniconda3/envs/pangeo-forge-recipes/lib/python3.9/site-packages/h5netcdf/core.py in __init__(self, path, mode, invalid_netcdf, phony_dims, **kwargs)
    719             else:
    720                 self._preexisting_file = mode in {"r", "r+", "a"}
--> 721                 self._h5file = h5py.File(path, mode, **kwargs)
    722         except Exception:
    723             self._closed = True

/opt/miniconda3/envs/pangeo-forge-recipes/lib/python3.9/site-packages/h5py/_hl/files.py in __init__(self, name, mode, driver, libver, userblock_size, swmr, rdcc_nslots, rdcc_nbytes, rdcc_w0, track_order, fs_strategy, fs_persist, fs_threshold, fs_page_size, page_buf_size, min_meta_keep, min_raw_keep, locking, **kwds)
    505                                  fs_persist=fs_persist, fs_threshold=fs_threshold,
    506                                  fs_page_size=fs_page_size)
--> 507                 fid = make_fid(name, mode, userblock_size, fapl, fcpl, swmr=swmr)
    508
    509             if isinstance(libver, tuple):

/opt/miniconda3/envs/pangeo-forge-recipes/lib/python3.9/site-packages/h5py/_hl/files.py in make_fid(name, mode, userblock_size, fapl, fcpl, swmr)
    218         if swmr and swmr_support:
    219             flags |= h5f.ACC_SWMR_READ
--> 220         fid = h5f.open(name, flags, fapl=fapl)
    221     elif mode == 'r+':
    222         fid = h5f.open(name, h5f.ACC_RDWR, fapl=fapl)

h5py/_objects.pyx in h5py._objects.with_phil.wrapper()

h5py/_objects.pyx in h5py._objects.with_phil.wrapper()

h5py/h5f.pyx in h5py.h5f.open()

OSError: Unable to open file (incorrect metadata checksum after all read attempts)
```
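Reading the traceback: after unpickling, `ds2.load()` misses the `CachingFileManager` LRU cache and re-opens the store, which ends in a fresh `h5py.File(...)` call on the fsspec HTTP file. A minimal sketch to isolate that re-open step without xarray (my assumption; I have not confirmed it reproduces the same checksum error):

```python
import fsspec
import h5py
from pickle import dumps, loads

# Sketch: round-trip the fsspec OpenFile through pickle, then hand the
# reopened file object straight to h5py, mimicking what the file manager
# does after the cache miss. (Assumes the tiny.nc http.server from above.)
of = loads(dumps(fsspec.open('http://127.0.0.1:8000/tiny.nc', mode='rb')))
with of as fp:
    with h5py.File(fp, mode='r') as f:  # does this raise the same OSError?
        print(list(f))
```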

Strangely, a similar workflow does work with http files hosted elsewhere, e.g.

```python
external_url = 'https://power-datastore.s3.amazonaws.com/v9/climatology/power_901_rolling_zones_utc.nc'
open_pickle_and_reload(external_url)  # works
```

This suggests that something peculiar about python's http.server, compared with other http servers, makes this break.
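One concrete difference worth checking (my guess, not something established above): python's built-in http.server ignores the `Range` header, while S3 serves partial content, and fsspec's HTTPFileSystem may take different code paths depending on whether the server supports random access. A quick probe, using a hypothetical `supports_range` helper:

```python
import requests

# Hypothetical probe: a server that honors Range replies 206 Partial Content
# with 100 bytes; python -m http.server replies 200 and streams the whole file.
def supports_range(url):
    r = requests.get(url, headers={"Range": "bytes=0-99"}, stream=True)
    return r.status_code == 206

print(supports_range('http://127.0.0.1:8000/tiny.nc'))  # expect False
print(supports_range('https://power-datastore.s3.amazonaws.com/'
                     'v9/climatology/power_901_rolling_zones_utc.nc'))  # expect True
```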

I would appreciate any thoughts or ideas about what might be going on here (pinging @martindurant and @shoyer).

xref:

- https://github.com/pangeo-forge/pangeo-forge-recipes/pull/373
- https://github.com/pydata/xarray/issues/4242
- https://github.com/google/xarray-beam/issues/49

