issues: 393214032
This data as json
id | node_id | number | title | user | state | locked | assignee | milestone | comments | created_at | updated_at | closed_at | author_association | active_lock_reason | draft | pull_request | body | reactions | performed_via_github_app | state_reason | repo | type |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
393214032 | MDU6SXNzdWUzOTMyMTQwMzI= | 2624 | Xarray to Zarr error (in compress / numcodecs functions) | 1961038 | closed | 0 | 9 | 2018-12-20T21:14:46Z | 2019-01-13T03:54:07Z | 2019-01-13T03:54:07Z | NONE | I'm trying to use xarray (0.11.0) to convert a (NetCDF) dataset from the CFSR archive that we maintain locally, consisting of two files opened and joined with open_mfdataset, to zarr using xarray's to_zarr. The data array contains float32 values of geopotential height, of shape (361 x 720 x 32 x 2920) ... 1/2 deg. CFSR, 32 isobaric levels, 2920 times (4x/day, 2 years' worth). to_zarr slowly crunches through the dataset (10 minutes or so ... RAM use creeps up towards ~80 GB) before failing, ultimately with an error from numcodecs ("Codec does not support buffers of > 2147483647 bytes", i.e. > 2 GB). A similar error occurs if I just read in a single data file (i.e. one year of data). I thought I might try specifying no compression, as supported in Zarr, by adding "compressor = None" as a kwarg in the to_zarr call in xarray, but that is not supported. (The data files exist on our THREDDS server, e.g. http://thredds.atmos.albany.edu:8080/thredds/dodsC/CFSR/test/g.2013.0p5.anl.nc , but in my example below I'm reading in directly from the NFS-served directory path; OpenDAP times out due to a different issue.) The code block and error are below: ```python import xarray as xr f1 = '/network/daes/cfsr/data/test/g.2013.0p5.anl.nc' f2 = '/network/daes/cfsr/data/test/g.2014.0p5.anl.nc' ds = xr.open_mfdataset([f1,f2]) zarr_ds = ds.to_zarr('/network/rit/lab/ktyle_rit/zarr/z500','w')
ValueError Traceback (most recent call last) <timed exec> in <module> /network/rit/lab/snowclus/anaconda31/envs/pangeo/lib/python3.6/site-packages/xarray/core/dataset.py in to_zarr(self, store, mode, synchronizer, group, encoding, compute) 1257 from ..backends.api import to_zarr 1258 return to_zarr(self, store=store, mode=mode, synchronizer=synchronizer, -> 1259 group=group, encoding=encoding, compute=compute) 1260 1261 def __unicode__(self): /network/rit/lab/snowclus/anaconda31/envs/pangeo/lib/python3.6/site-packages/xarray/backends/api.py in to_zarr(dataset, store, mode, synchronizer, group, encoding, compute) 884 # TODO: figure out how to properly handle unlimited_dims 885 dump_to_store(dataset, store, writer, encoding=encoding) --> 886 writes = writer.sync(compute=compute) 887 888 if not compute: /network/rit/lab/snowclus/anaconda31/envs/pangeo/lib/python3.6/site-packages/xarray/backends/common.py in sync(self, compute) 177 delayed_store = da.store(self.sources, self.targets, 178 lock=self.lock, compute=compute, --> 179 flush=True) 180 self.sources = [] 181 self.targets = [] /network/rit/lab/snowclus/anaconda31/envs/pangeo/lib/python3.6/site-packages/dask/array/core.py in store(sources, targets, lock, regions, compute, return_stored, **kwargs) 864 865 if compute: --> 866 result.compute(**kwargs) 867 return None 868 else: /network/rit/lab/snowclus/anaconda31/envs/pangeo/lib/python3.6/site-packages/dask/base.py in compute(self, **kwargs) 154 dask.base.compute 155 """ --> 156 (result,) = compute(self, traverse=False, **kwargs) 157 return result 158 /network/rit/lab/snowclus/anaconda31/envs/pangeo/lib/python3.6/site-packages/dask/base.py in compute(*args, **kwargs) 395 keys = [x.__dask_keys__() for x in collections] 396 postcomputes = [x.__dask_postcompute__() for x in collections] --> 397 results = schedule(dsk, keys, **kwargs) 398 return repack([f(r, *a) for r, (f, a) in zip(results, postcomputes)]) 399 
/network/rit/lab/snowclus/anaconda31/envs/pangeo/lib/python3.6/site-packages/dask/threaded.py in get(dsk, result, cache, num_workers, **kwargs) 74 results = get_async(pool.apply_async, len(pool._pool), dsk, result, 75 cache=cache, get_id=_thread_get_id, ---> 76 pack_exception=pack_exception, **kwargs) 77 78 # Cleanup pools associated to dead threads /network/rit/lab/snowclus/anaconda31/envs/pangeo/lib/python3.6/site-packages/dask/local.py in get_async(apply_async, num_workers, dsk, result, cache, get_id, rerun_exceptions_locally, pack_exception, raise_exception, callbacks, dumps, loads, **kwargs) 499 _execute_task(task, data) # Re-execute locally 500 else: --> 501 raise_exception(exc, tb) 502 res, worker_id = loads(res_info) 503 state['cache'][key] = res /network/rit/lab/snowclus/anaconda31/envs/pangeo/lib/python3.6/site-packages/dask/compatibility.py in reraise(exc, tb) 110 if exc.__traceback__ is not tb: 111 raise exc.with_traceback(tb) --> 112 raise exc 113 114 else: /network/rit/lab/snowclus/anaconda31/envs/pangeo/lib/python3.6/site-packages/dask/local.py in execute_task(key, task_info, dumps, loads, get_id, pack_exception) 270 try: 271 task, data = loads(task_info) --> 272 result = _execute_task(task, data) 273 id = get_id() 274 result = dumps((result, id)) /network/rit/lab/snowclus/anaconda31/envs/pangeo/lib/python3.6/site-packages/dask/local.py in _execute_task(arg, cache, dsk) 251 func, args = arg[0], arg[1:] 252 args2 = [_execute_task(a, cache) for a in args] --> 253 return func(*args2) 254 elif not ishashable(arg): 255 return arg /network/rit/lab/snowclus/anaconda31/envs/pangeo/lib/python3.6/site-packages/dask/array/core.py in store_chunk(x, out, index, lock, return_stored) 2920 2921 def store_chunk(x, out, index, lock, return_stored): -> 2922 return load_store_chunk(x, out, index, lock, return_stored, False) 2923 2924 /network/rit/lab/snowclus/anaconda31/envs/pangeo/lib/python3.6/site-packages/dask/array/core.py in load_store_chunk(x, out, index, lock, 
return_stored, load_stored) 2909 try: 2910 if x is not None: -> 2911 out[index] = np.asanyarray(x) 2912 if return_stored and load_stored: 2913 result = out[index] /network/rit/lab/snowclus/anaconda31/envs/pangeo/lib/python3.6/site-packages/zarr/core.py in __setitem__(self, selection, value) 1100 1101 fields, selection = pop_fields(selection) -> 1102 self.set_basic_selection(selection, value, fields=fields) 1103 1104 def set_basic_selection(self, selection, value, fields=None): /network/rit/lab/snowclus/anaconda31/envs/pangeo/lib/python3.6/site-packages/zarr/core.py in set_basic_selection(self, selection, value, fields) 1195 return self._set_basic_selection_zd(selection, value, fields=fields) 1196 else: -> 1197 return self._set_basic_selection_nd(selection, value, fields=fields) 1198 1199 def set_orthogonal_selection(self, selection, value, fields=None): /network/rit/lab/snowclus/anaconda31/envs/pangeo/lib/python3.6/site-packages/zarr/core.py in _set_basic_selection_nd(self, selection, value, fields) 1486 indexer = BasicIndexer(selection, self) 1487 -> 1488 self._set_selection(indexer, value, fields=fields) 1489 1490 def _set_selection(self, indexer, value, fields=None): /network/rit/lab/snowclus/anaconda31/envs/pangeo/lib/python3.6/site-packages/zarr/core.py in _set_selection(self, indexer, value, fields) 1534 1535 # put data -> 1536 self._chunk_setitem(chunk_coords, chunk_selection, chunk_value, fields=fields) 1537 1538 def _chunk_getitem(self, chunk_coords, chunk_selection, out, out_selection, /network/rit/lab/snowclus/anaconda31/envs/pangeo/lib/python3.6/site-packages/zarr/core.py in _chunk_setitem(self, chunk_coords, chunk_selection, value, fields) 1642 with lock: 1643 self._chunk_setitem_nosync(chunk_coords, chunk_selection, value, -> 1644 fields=fields) 1645 1646 def _chunk_setitem_nosync(self, chunk_coords, chunk_selection, value, fields=None): /network/rit/lab/snowclus/anaconda31/envs/pangeo/lib/python3.6/site-packages/zarr/core.py in 
_chunk_setitem_nosync(self, chunk_coords, chunk_selection, value, fields) 1716 1717 # encode chunk -> 1718 cdata = self._encode_chunk(chunk) 1719 1720 # store /network/rit/lab/snowclus/anaconda31/envs/pangeo/lib/python3.6/site-packages/zarr/core.py in _encode_chunk(self, chunk) 1766 # compress 1767 if self._compressor: -> 1768 cdata = self._compressor.encode(chunk) 1769 else: 1770 cdata = chunk numcodecs/blosc.pyx in numcodecs.blosc.Blosc.encode() /network/rit/lab/snowclus/anaconda31/envs/pangeo/lib/python3.6/site-packages/numcodecs/compat.py in ensure_contiguous_ndarray(buf, max_buffer_size) 140 if max_buffer_size is not None and arr.nbytes > max_buffer_size: 141 msg = "Codec does not support buffers of > {} bytes".format(max_buffer_size) --> 142 raise ValueError(msg) 143 144 return arr ValueError: Codec does not support buffers of > 2147483647 bytes ``` |
{ "url": "https://api.github.com/repos/pydata/xarray/issues/2624/reactions", "total_count": 0, "+1": 0, "-1": 0, "laugh": 0, "hooray": 0, "confused": 0, "heart": 0, "rocket": 0, "eyes": 0 } |
completed | 13221727 | issue |