issue_comments: 1499791533
html_url: https://github.com/pydata/xarray/pull/7019#issuecomment-1499791533
issue_url: https://api.github.com/repos/pydata/xarray/issues/7019
id: 1499791533
node_id: IC_kwDOAMm_X85ZZQCt
user: 35968931
created_at: 2023-04-07T00:32:47Z
updated_at: 2023-04-07T00:59:03Z
author_association: MEMBER
body:
Update on this rabbit hole: this commit to dask changed the behaviour of dask's auto-chunking logic, such that I see different results if I run my little test script:
```python
from xarray.core.variable import IndexVariable
from dask.array.core import normalize_chunks  # import the upstream dask function
import itertools
import warnings
from numbers import Number
import dask
import dask.array as da
import xarray as xr
import numpy as np


# This function is copied from xarray, but calls dask.array.core.normalize_chunks
# It is used in open_dataset, but not in Dataset.chunk
def _get_chunk(var, chunks):
    """
    Return map from each dim to chunk sizes, accounting for backend's preferred chunks.
    """
    if isinstance(var, IndexVariable):
        return {}
    dims = var.dims
    shape = var.shape

    # Determine the explicit requested chunks.
    preferred_chunks = var.encoding.get("preferred_chunks", {})
    preferred_chunk_shape = tuple(
        preferred_chunks.get(dim, size) for dim, size in zip(dims, shape)
    )
    if isinstance(chunks, Number) or (chunks == "auto"):
        chunks = dict.fromkeys(dims, chunks)
    chunk_shape = tuple(
        chunks.get(dim, None) or preferred_chunk_sizes
        for dim, preferred_chunk_sizes in zip(dims, preferred_chunk_shape)
    )
    chunk_shape = normalize_chunks(
        chunk_shape, shape=shape, dtype=var.dtype, previous_chunks=preferred_chunk_shape
    )

    # Warn where requested chunks break preferred chunks, provided that the variable
    # contains data.
    if var.size:
        for dim, size, chunk_sizes in zip(dims, shape, chunk_shape):
            try:
                preferred_chunk_sizes = preferred_chunks[dim]
            except KeyError:
                continue
            # Determine the stop indices of the preferred chunks, but omit the last stop
            # (equal to the dim size). In particular, assume that when a sequence
            # expresses the preferred chunks, the sequence sums to the size.
            preferred_stops = (
                range(preferred_chunk_sizes, size, preferred_chunk_sizes)
                if isinstance(preferred_chunk_sizes, Number)
                else itertools.accumulate(preferred_chunk_sizes[:-1])
            )
            # Gather any stop indices of the specified chunks that are not a stop index
            # of a preferred chunk. Again, omit the last stop, assuming that it equals
            # the dim size.
            breaks = set(itertools.accumulate(chunk_sizes[:-1])).difference(
                preferred_stops
            )
            if breaks:
                warnings.warn(
                    "The specified Dask chunks separate the stored chunks along "
                    f'dimension "{dim}" starting at index {min(breaks)}. This could '
                    "degrade performance. Instead, consider rechunking after loading."
                )
    return dict(zip(dims, chunk_shape))


chunks = 'auto'
encoded_chunks = 100
dask_arr = da.from_array(
    np.ones((500, 500), dtype="float64"), chunks=encoded_chunks
)
var = xr.core.variable.Variable(data=dask_arr, dims=['x', 'y'])

with dask.config.set({"array.chunk-size": "1MiB"}):
    chunks_suggested = _get_chunk(var, chunks)
    print(chunks_suggested)
```
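For context, the dask function at the heart of this is `dask.array.core.normalize_chunks`. Here is a minimal sketch of calling it directly with `"auto"` chunks and a `previous_chunks` hint under the same 1 MiB chunk-size limit as the script; the `previous_chunks=(100, 100)` value is just an illustrative assumption, and the exact output depends on the installed dask version, which is precisely the behaviour that changed.

```python
# Illustrative sketch only (not part of the original test script): call dask's
# normalize_chunks directly with "auto" chunks and a previous_chunks hint.
# previous_chunks=(100, 100) is an assumed example value; the returned tuple
# differs between dask releases before and after the linked commit.
import numpy as np
import dask
from dask.array.core import normalize_chunks

with dask.config.set({"array.chunk-size": "1MiB"}):
    auto_chunks = normalize_chunks(
        ("auto", "auto"),
        shape=(500, 500),
        dtype=np.dtype("float64"),
        previous_chunks=(100, 100),
    )
print(auto_chunks)
```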
Anyway, what this means is that, since this PR vendors this chunking logic, I think one simple way to fix this failure would be to upgrade the minimum version of dask to >=2022.9.2 (from 2022.1.1, where it currently is). EDIT: I tried changing the minimum version of dask-core. EDIT2: Another way to fix this would be to un-vendor this logic instead.
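To see which behaviour a given environment will exhibit, a quick version check against the proposed minimum can help; this is a hypothetical helper, not part of the PR, and it assumes the `packaging` library is available.

```python
# Hypothetical check (not part of the PR): compare the installed dask against
# the proposed new minimum (2022.9.2), below which the old auto-chunking
# behaviour applies.
from importlib.metadata import version
from packaging.version import Version

installed = Version(version("dask"))
proposed_minimum = Version("2022.9.2")
if installed < proposed_minimum:
    print(f"dask {installed} predates {proposed_minimum}: old auto-chunking behaviour")
else:
    print(f"dask {installed} meets the proposed minimum {proposed_minimum}")
```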
{ "total_count": 0, "+1": 0, "-1": 0, "laugh": 0, "hooray": 0, "confused": 0, "heart": 0, "rocket": 0, "eyes": 0 } |
1368740629 |