Comment 960133879 on pydata/xarray#5629 (author_association: MEMBER, created 2021-11-03, last edited 2023-01-24):
https://github.com/pydata/xarray/issues/5629#issuecomment-960133879

While reading through #5933, I realized there's an alternate solution that preserves the desirable feature of fitting along chunked dimensions (at least for skipna=False).

Since we are only ever fitting along one dimension, we can apply the reshape-other-dimensions operation blockwise. As long as we reshape back in a consistent manner (again blockwise), this is identical to our current stacking approach but substantially more efficient. In particular, it avoids merging chunks, as the sketch below illustrates.
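To see why merging chunks is the cost at stake, here is a minimal sketch with made-up shapes (a hypothetical `(x, y, time)` array, not from the original comment). A plain dask reshape must preserve numpy element order, so it rechunks along the collapsed axes first; the blockwise `reshape_block` defined below skips that entirely:

```python
import dask.array

# Hypothetical (x, y, time) array, chunked along x and y.
darr = dask.array.ones((4, 4, 10), chunks=(2, 2, 10))

# A plain reshape must keep numpy element order, so dask merges chunks
# along the collapsed axes before reshaping:
print(darr.reshape(16, 10).chunks)  # fewer, larger chunks along axis 0

# The blockwise reshape below instead sends each (2, 2, 10) block straight
# to its own (4, 10) block, at the price of a chunking-dependent element
# order along the collapsed axis:
#   reshape_block(darr, ((4, 4, 4, 4), (10,))).chunks == ((4, 4, 4, 4), (10,))
```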

```python
def reshape_block(arr, output_chunks):
    """Blockwise reshape array to match output_chunks.

    This method applies a reshape operation blockwise, so it only makes sense if
    you do not care about the order of elements (along reshaped axes) after
    reshaping. The order of elements will depend on the chunking of ``arr``.
    It is substantially more efficient than a usual reshape, which makes sure
    array elements appear in a numpy-like order regardless of chunking.

    Based on https://github.com/dask/dask/issues/4855#issuecomment-497062002
    """
    from itertools import product

    import numpy as np
    from dask.array import Array
    from dask.base import tokenize
    from dask.highlevelgraph import HighLevelGraph

    name = "reshape-block-" + tokenize(arr)
    ichunks = tuple(range(len(chunks_v)) for chunks_v in arr.chunks)
    ochunks = tuple(range(len(chunks_v)) for chunks_v in output_chunks)

    # Map each input block to exactly one output block of the corresponding
    # chunk shape; no inter-block communication is needed.
    dsk = {
        (name, *ochunk): (np.reshape, (arr.name, *ichunk), oshape)
        for ichunk, ochunk, oshape in zip(
            product(*ichunks), product(*ochunks), product(*output_chunks)
        )
    }

    graph = HighLevelGraph.from_collections(name, dsk, dependencies=[arr])
    res = Array(graph, name, output_chunks, arr.dtype, meta=arr._meta)
    return res


def polyfit(da, dim, skipna=False):
    from itertools import product

    import numpy as np
    import xarray as xr

    da = da.transpose(..., dim)
    arr = da.data

    # Only bother reshaping for chunked multi-dimensional arrays.
    do_blockwise_reshape = arr.ndim > 1 and any(
        len(chunks) > 1 for chunks in arr.chunks
    )
    if do_blockwise_reshape:
        # Collapse all dimensions other than `dim` into one, blockwise.
        output_chunks = (tuple(map(np.prod, product(*arr.chunks[:-1]))),) + (
            arr.chunks[-1],
        )
        other_dims = tuple(dim_ for dim_ in da.dims if dim_ != dim)
        reshaped = xr.DataArray(
            reshape_block(arr, output_chunks), dims=("__reshaped__", dim)
        )
    else:
        reshaped = da

    fit = reshaped.polyfit(dim, 1, skipna=skipna)

    if do_blockwise_reshape:
        # Reshape back, blockwise, to the original non-fit dimensions.
        result = xr.Dataset()
        for var in fit:
            result[var] = (
                ("degree",) + other_dims,
                reshape_block(fit[var].data, ((2,),) + arr.chunks[:-1]),
            )

        for dim_ in other_dims:
            result[dim_] = da[dim_]
        result["degree"] = fit["degree"]
        return result
    else:
        return fit
```
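A quick way to sanity-check the wrapper (a hypothetical example, not from the original comment): build a chunked DataArray and compare against xarray's built-in polyfit. Note the reshaped intermediate drops the `time` coordinate, so an integer range coordinate keeps the two fits comparable:

```python
import dask.array
import numpy as np
import xarray as xr

da = xr.DataArray(
    dask.array.random.random((10, 20, 100), chunks=(5, 10, 100)),
    dims=("x", "y", "time"),
    coords={"x": np.arange(10), "y": np.arange(20), "time": np.arange(100)},
)

expected = da.polyfit("time", 1, skipna=False)
actual = polyfit(da, "time", skipna=False)
xr.testing.assert_allclose(expected, actual)
```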

Here's the graph with one chunk along time (like @jbusecke's example, and skipna=True): *(task-graph image in the original comment)*

And when the time dimension is chunked (and skipna=False): *(task-graph image in the original comment)*
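The graphs can be reproduced with dask's standard `visualize` (a sketch; assumes graphviz is installed and `da` is a chunked DataArray as in the example above):

```python
fit = polyfit(da, "time", skipna=False)
# Render the task graph for the fitted coefficients to a file.
fit["polyfit_coefficients"].data.visualize("polyfit-graph.png")
```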

Older version:

``` python
from itertools import product

import numpy as np
from dask.array import Array
from dask.array.utils import assert_eq
from dask.base import tokenize
from dask.highlevelgraph import HighLevelGraph


def reshape_block(arr, output_chunks):
    """Blockwise reshape array to match output_chunks.

    This method applies a reshape operation blockwise, so it only makes sense if
    you do not care about the order of elements (along reshaped axes) after
    reshaping. The order of elements will depend on the chunking of ``arr``.
    It is substantially more efficient than a usual reshape, which makes sure
    array elements appear in a numpy-like order regardless of chunking.

    Based on https://github.com/dask/dask/issues/4855#issuecomment-497062002
    """
    name = "reshape-block-" + tokenize(arr)
    ichunks = tuple(range(len(chunks_v)) for chunks_v in arr.chunks)
    ochunks = tuple(range(len(chunks_v)) for chunks_v in output_chunks)

    dsk = {
        (name, *ochunk): (np.reshape, (arr.name, *ichunk), oshape)
        for ichunk, ochunk, oshape in zip(
            product(*ichunks), product(*ochunks), product(*output_chunks)
        )
    }

    graph = HighLevelGraph.from_collections(name, dsk, dependencies=[arr])
    res = Array(graph, name, output_chunks, arr.dtype, meta=arr._meta)
    return res


# assumes we're collapsing axes (0, 1) and preserving axis 2
output_chunks = (tuple(map(np.prod, product(*arr.chunks[:2]))),) + (arr.chunks[-1],)
reshaped = reshape_block(arr, output_chunks)

# make sure that we roundtrip
actual = reshape_block(reshaped, arr.chunks)
assert_eq(actual, arr)  # True
```

``` python
def new_polyfit(da):
    arr = da.data
    output_chunks = (tuple(map(np.prod, product(*arr.chunks[:2]))),) + (
        arr.chunks[-1],
    )
    reshaped = xr.DataArray(reshape_block(arr, output_chunks), dims=("xy", "time"))
    fit = reshaped.polyfit("time", 1, skipna=True)

    result = xr.Dataset()
    for var in fit:
        result[var] = (
            ("degree", "x", "y"),
            reshape_block(fit[var].data, ((2,),) + arr.chunks[:-1]),
        )
    result["x"] = da["x"]
    result["y"] = da["y"]
    result["degree"] = fit["degree"]
    return result


poly = da.polyfit("time", 1, skipna=False)
xr.testing.assert_allclose(poly, new_polyfit(da))  # True!
```