Comment 960133879 on pydata/xarray#5629 (author_association: MEMBER, created 2021-11-03, last edited 2023-01-24):
https://github.com/pydata/xarray/issues/5629#issuecomment-960133879

While reading through #5933, I realized there's an alternate solution that preserves the desirable feature of fitting along chunked dimensions (at least for skipna=False).

Since we are only ever fitting along one dimension, we can apply the reshape-other-dimensions operation blockwise. As long as we reshape back in a consistent manner (again blockwise), this is identical to our current stacking approach but substantially more efficient. In particular, it avoids merging chunks, as the sketch below illustrates.
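To see why merging chunks is the cost at stake, here is a minimal sketch with made-up shapes (a hypothetical `(x, y, time)` array, not from the original comment). A plain dask reshape must preserve numpy element order, so it rechunks along the collapsed axes first; the blockwise `reshape_block` defined below skips that entirely:

```python
import dask.array

# Hypothetical (x, y, time) array, chunked along x and y.
darr = dask.array.ones((4, 4, 10), chunks=(2, 2, 10))

# A plain reshape must keep numpy element order, so dask merges chunks
# along the collapsed axes before reshaping:
print(darr.reshape(16, 10).chunks)  # fewer, larger chunks along axis 0

# The blockwise reshape below instead sends each (2, 2, 10) block straight
# to its own (4, 10) block, at the price of a chunking-dependent element
# order along the collapsed axis:
#   reshape_block(darr, ((4, 4, 4, 4), (10,))).chunks == ((4, 4, 4, 4), (10,))
```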

```python
def reshape_block(arr, output_chunks):
    """Blockwise reshape array to match output_chunks.

    This method applies a reshape operation blockwise, so it only makes sense if
    you do not care about the order of elements (along reshaped axes) after
    reshaping. The order of elements will depend on the chunking of ``arr``.
    It is substantially more efficient than a usual reshape, which makes sure
    array elements appear in a numpy-like order regardless of chunking.

    Based on https://github.com/dask/dask/issues/4855#issuecomment-497062002
    """
    from itertools import product

    import numpy as np
    from dask.array import Array
    from dask.base import tokenize
    from dask.highlevelgraph import HighLevelGraph

    name = "reshape-block-" + tokenize(arr)
    ichunks = tuple(range(len(chunks_v)) for chunks_v in arr.chunks)
    ochunks = tuple(range(len(chunks_v)) for chunks_v in output_chunks)

    # Map each input block to exactly one output block of the corresponding
    # chunk shape; no inter-block communication is needed.
    dsk = {
        (name, *ochunk): (np.reshape, (arr.name, *ichunk), oshape)
        for ichunk, ochunk, oshape in zip(
            product(*ichunks), product(*ochunks), product(*output_chunks)
        )
    }

    graph = HighLevelGraph.from_collections(name, dsk, dependencies=[arr])
    res = Array(graph, name, output_chunks, arr.dtype, meta=arr._meta)
    return res


def polyfit(da, dim, skipna=False):
    from itertools import product

    import numpy as np
    import xarray as xr

    da = da.transpose(..., dim)
    arr = da.data

    # Only bother reshaping for chunked multi-dimensional arrays.
    do_blockwise_reshape = arr.ndim > 1 and any(
        len(chunks) > 1 for chunks in arr.chunks
    )
    if do_blockwise_reshape:
        # Collapse all dimensions other than `dim` into one, blockwise.
        output_chunks = (tuple(map(np.prod, product(*arr.chunks[:-1]))),) + (
            arr.chunks[-1],
        )
        other_dims = tuple(dim_ for dim_ in da.dims if dim_ != dim)
        reshaped = xr.DataArray(
            reshape_block(arr, output_chunks), dims=("__reshaped__", dim)
        )
    else:
        reshaped = da

    fit = reshaped.polyfit(dim, 1, skipna=skipna)

    if do_blockwise_reshape:
        # Reshape back, blockwise, to the original non-fit dimensions.
        result = xr.Dataset()
        for var in fit:
            result[var] = (
                ("degree",) + other_dims,
                reshape_block(fit[var].data, ((2,),) + arr.chunks[:-1]),
            )

        for dim_ in other_dims:
            result[dim_] = da[dim_]
        result["degree"] = fit["degree"]
        return result
    else:
        return fit
```
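A quick way to sanity-check the wrapper (a hypothetical example, not from the original comment): build a chunked DataArray and compare against xarray's built-in polyfit. Note the reshaped intermediate drops the `time` coordinate, so an integer range coordinate keeps the two fits comparable:

```python
import dask.array
import numpy as np
import xarray as xr

da = xr.DataArray(
    dask.array.random.random((10, 20, 100), chunks=(5, 10, 100)),
    dims=("x", "y", "time"),
    coords={"x": np.arange(10), "y": np.arange(20), "time": np.arange(100)},
)

expected = da.polyfit("time", 1, skipna=False)
actual = polyfit(da, "time", skipna=False)
xr.testing.assert_allclose(expected, actual)
```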

Here's the graph with one chunk along time (like @jbusecke's example, and skipna=True): *(task-graph image in the original comment)*

And when the time dimension is chunked (and skipna=False): *(task-graph image in the original comment)*
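The graphs can be reproduced with dask's standard `visualize` (a sketch; assumes graphviz is installed and `da` is a chunked DataArray as in the example above):

```python
fit = polyfit(da, "time", skipna=False)
# Render the task graph for the fitted coefficients to a file.
fit["polyfit_coefficients"].data.visualize("polyfit-graph.png")
```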

Older version:

``` python
from itertools import product

import numpy as np
from dask.array import Array
from dask.array.utils import assert_eq
from dask.base import tokenize
from dask.highlevelgraph import HighLevelGraph


def reshape_block(arr, output_chunks):
    """Blockwise reshape array to match output_chunks.

    This method applies a reshape operation blockwise, so it only makes sense if
    you do not care about the order of elements (along reshaped axes) after
    reshaping. The order of elements will depend on the chunking of ``arr``.
    It is substantially more efficient than a usual reshape, which makes sure
    array elements appear in a numpy-like order regardless of chunking.

    Based on https://github.com/dask/dask/issues/4855#issuecomment-497062002
    """
    name = "reshape-block-" + tokenize(arr)
    ichunks = tuple(range(len(chunks_v)) for chunks_v in arr.chunks)
    ochunks = tuple(range(len(chunks_v)) for chunks_v in output_chunks)

    dsk = {
        (name, *ochunk): (np.reshape, (arr.name, *ichunk), oshape)
        for ichunk, ochunk, oshape in zip(
            product(*ichunks), product(*ochunks), product(*output_chunks)
        )
    }

    graph = HighLevelGraph.from_collections(name, dsk, dependencies=[arr])
    res = Array(graph, name, output_chunks, arr.dtype, meta=arr._meta)
    return res


# assumes we're collapsing axes (0, 1) and preserving axis 2
output_chunks = (tuple(map(np.prod, product(*arr.chunks[:2]))),) + (arr.chunks[-1],)
reshaped = reshape_block(arr, output_chunks)

# make sure that we roundtrip
actual = reshape_block(reshaped, arr.chunks)
assert_eq(actual, arr)  # True
```

``` python
def new_polyfit(da):
    arr = da.data
    output_chunks = (tuple(map(np.prod, product(*arr.chunks[:2]))),) + (
        arr.chunks[-1],
    )
    reshaped = xr.DataArray(reshape_block(arr, output_chunks), dims=("xy", "time"))
    fit = reshaped.polyfit("time", 1, skipna=True)

    result = xr.Dataset()
    for var in fit:
        result[var] = (
            ("degree", "x", "y"),
            reshape_block(fit[var].data, ((2,),) + arr.chunks[:-1]),
        )
    result["x"] = da["x"]
    result["y"] = da["y"]
    result["degree"] = fit["degree"]
    return result


poly = da.polyfit("time", 1, skipna=False)
xr.testing.assert_allclose(poly, new_polyfit(da))  # True!
```