home / github / issue_comments

Menu
  • GraphQL API
  • Search all tables

issue_comments: 1247351003

This data as json

html_url issue_url id node_id user created_at updated_at author_association body reactions performed_via_github_app issue
https://github.com/pydata/xarray/issues/7031#issuecomment-1247351003 https://api.github.com/repos/pydata/xarray/issues/7031 1247351003 IC_kwDOAMm_X85KWRDb 35968931 2022-09-14T22:10:01Z 2022-09-14T22:15:26Z MEMBER

I had another go and now I have this (the .sel method is just copied from PandasIndex.sel with minor changes):

```python from xarray.core.indexes import ( PandasIndex, is_scalar, as_scalar, get_indexer_nd, IndexSelResult, _query_slice, is_dict_like, normalize_label, )

class PeriodicBoundaryIndex(PandasIndex): """ An index representing any 1D periodic numberline.

Implementation subclasses a normal xarray PandasIndex object but intercepts indexer queries.
"""
period: float

def __init__(self, *args, **kwargs):
    # Forward every positional and keyword argument unchanged to the parent constructor.
    super().__init__(*args, **kwargs)
    # Length of one full cycle; hard-coded, so every instance currently gets 360.
    self.period = 360  # TODO work out where this input should be passed in instead of hard-coding

def _wrap_periodically(self, label_value):
    """Map a label onto the equivalent value inside one period of the index.

    The result lies in the half-open interval ``[lower, lower + period)``,
    where ``lower`` is the smallest label held by ``self.index``.

    Parameters
    ----------
    label_value
        Label to wrap, possibly outside the range covered by the index.

    Returns
    -------
    ``label_value`` shifted by a whole number of ``self.period``.
    """
    lower = self.index.min()
    # Measure the offset from the lower bound, not from ``index.max()``: an
    # index rarely spans a full period (e.g. 0, 90, 180, 270 with period 360),
    # and offsetting from the maximum would then shift in-range labels too.
    # When max - min equals the period both formulas give the same result.
    return lower + (label_value - lower) % self.period

def sel(
    self, labels: dict[Any, Any], method=None, tolerance=None
) -> IndexSelResult:
    """Remaps labels outside of the indexes' range back to integer indices inside the range.

    Only scalar labels are passed through ``_wrap_periodically``; slice,
    boolean-array and other array labels are looked up unchanged.

    Parameters
    ----------
    labels
        Mapping with exactly one entry: coordinate name -> label(s) to select.
    method
        ``None`` or a string, forwarded to the lookup helpers.
    tolerance
        Forwarded to the lookup helpers together with ``method``.

    Returns
    -------
    IndexSelResult
        Built from ``{self.dim: indexer}``.

    Raises
    ------
    TypeError
        If ``method`` is neither ``None`` nor a string.
    ValueError
        If the label is dict-like, or if ``method``/``tolerance`` is given
        for a scalar label on a ``pd.CategoricalIndex``.
    KeyError
        If a label is not found in the index.
    AssertionError
        If ``labels`` does not contain exactly one entry.
    """

    # NOTE(review): debug trace; prints on every call.
    print("sel called")

    # Imported inside the function rather than at module level
    # (presumably to avoid a circular import — TODO confirm).
    from xarray import DataArray
    from xarray import Variable

    if method is not None and not isinstance(method, str):
        raise TypeError("``method`` must be a string")

    # Exactly one coordinate is handled per call.
    assert len(labels) == 1
    coord_name, label = next(iter(labels.items()))

    if isinstance(label, slice):
        # Slice bounds are NOT wrapped periodically: the slice is resolved
        # against the index as given.
        print(label)
        indexer = _query_slice(self.index, label, coord_name, method, tolerance)
        print(indexer)
    elif is_dict_like(label):
        raise ValueError(
            "cannot use a dict-like object for selection on "
            "a dimension that does not have a MultiIndex"
        )
    else:
        label_array = normalize_label(label, dtype=self.coord_dtype)
        if label_array.ndim == 0:
            # Scalar label: the only branch where periodic wrapping is applied.
            label_array = self._wrap_periodically(label_array)
            label_value = as_scalar(label_array)
            # NOTE(review): `pd` and `np` are not imported in the snippet shown —
            # confirm they are in scope where this class is defined.
            if isinstance(self.index, pd.CategoricalIndex):
                if method is not None:
                    raise ValueError(
                        "'method' is not supported when indexing using a CategoricalIndex."
                    )
                if tolerance is not None:
                    raise ValueError(
                        "'tolerance' is not supported when indexing using a CategoricalIndex."
                    )
                indexer = self.index.get_loc(label_value)
            else:
                if method is not None:
                    # Inexact lookup using the wrapped label.
                    print(label_array)
                    indexer = get_indexer_nd(
                        self.index, label_array, method, tolerance
                    )
                    # A negative position is treated as "label not found".
                    if np.any(indexer < 0):
                        raise KeyError(
                            f"not all values found in index {coord_name!r}"
                        )
                else:
                    # Exact lookup; a miss is re-raised with a hint to pass `method`.
                    try:
                        print(label_value)
                        indexer = self.index.get_loc(label_value)
                    except KeyError as e:
                        raise KeyError(
                            f"not all values found in index {coord_name!r}. "
                            "Try setting the `method` keyword argument (example: method='nearest')."
                        ) from e

        elif label_array.dtype.kind == "b":
            # Boolean array: used directly as the indexer, no lookup and no wrapping.
            indexer = label_array
        else:
            # Non-scalar, non-boolean labels: looked up without periodic wrapping.
            indexer = get_indexer_nd(self.index, label_array, method, tolerance)
            if np.any(indexer < 0):
                raise KeyError(f"not all values found in index {coord_name!r}")

        # attach dimension names and/or coordinates to positional indexer
        if isinstance(label, Variable):
            indexer = Variable(label.dims, indexer)
        elif isinstance(label, DataArray):
            indexer = DataArray(indexer, coords=label._coords, dims=label.dims)

    return IndexSelResult({self.dim: indexer})

def isel(
    self, indexers: Mapping[Any, Union[int, slice, np.ndarray, Variable]]
) -> Union["PeriodicBoundaryIndex", None]:
    """Print a trace message, then defer to the parent class's ``isel``.

    Returns whatever the parent implementation returns for ``indexers``.
    """
    print("isel called")
    parent_result = super().isel(indexers=indexers)
    return parent_result

```

This works for integer indexing with sel!

python lon_coord = xr.DataArray(data=np.linspace(-180, 180, 19), dims="lon") da = xr.DataArray(data=np.random.randn(19), dims="lon", coords={"lon": lon_coord}) <xarray.DataArray (lon: 19)> array([-0.67423202, 0.14173693, -0.51427002, 1.25764101, 0.23863066, 0.05703135, -0.65350384, -0.74356356, 0.98524252, -0.94975665, 0.63314842, -0.7144752 , 0.47282375, 0.31555171, -0.13179154, -1.10255267, 0.88180541, 1.28461459, 1.61273741]) Coordinates: * lon (lon) float64 -180.0 -160.0 -140.0 -120.0 ... 140.0 160.0 180.0

```python world = da.drop_indexes("lon").set_xindex("lon", index_cls=PeriodicBoundaryIndex)

world.sel(lon=200, method="nearest") <xarray.DataArray ()> array(0.14173693) Coordinates: lon float64 -160.0 ``` Yay! :champagne:

Q: Best way to do this for slicing?

I want this to work: `world.sel(lon=slice(170, 190))`. Internally that means `PeriodicBoundaryIndex.sel` has to return an indexer that points to values at both the start and end of the array data. I'm not sure what the best way of doing this is. Originally I imagined returning two slices, but I don't think that's a valid argument to `Dataset.isel()`.

So I guess I have to turn my slice into a list of specific integer positions and pass that to .isel()? How do I do that? Is that going to be inefficient somehow?

I guess I also want to reorder the result before returning it, otherwise the two sides of the dateline won't be stitched together in the right order...

Q: Where should I pass in period?

If I want the period of the PeriodicBoundaryIndex to be a general parameter, independent of the data in the array, the attributes, or the values of the index labels, where would be the most sensible place to pass this in? `.set_xindex` only accepts a class, not an instance, and I can't use `.from_variables` because the period can't be deduced from the variables in general, so where can I feed it in?

Q: How to avoid just copying all of PandasIndex.sel's implementation?

I find myself copying the entire implementation of PandasIndex.sel just to insert one or two lines in predictable places.

Also, pointing me to the implementation of PandasIndex leads to me using lots of private functions from xarray.core.indexes, because I have to import them in order to copy-paste code from PandasIndex.

I wonder if these problems could be ameliorated by providing public entry methods in the Index superclass? I'm thinking about how in anytree (which I used to make the first prototype of datatree) there are these methods that do nothing by default but are intended to be overridden to insert functionality at key steps. The pattern is basically this:

```python

# library code

class NodeMixin:
    """Inherit from this to create your own TreeNode class with parent and children.

    The four hook methods below do nothing and return ``None`` by default.
    A subclass overrides the ones it needs in order to run its own code
    around detaching or attaching children.
    """

    def _pre_detach_children(self, children):
        """Method call before detaching `children`."""

    def _post_detach_children(self, children):
        """Method call after detaching `children`."""

    def _pre_attach_children(self, children):
        """Method call before attaching `children`."""

    def _post_attach_children(self, children):
        """Method call after attaching `children`."""

# user code

class MyHappyTreeNode(NodeMixin): def _pre_attach_children(self, children): """Celebrates the gift of children""" print("A child is born!") ```

What if we put similar methods on the PandasIndex superclass? Like

```python class PandasIndex: def _post_process_label_value(self, label_value: float) -> float: """Method call after determining scalar label value.""" return label_value

def sel(
    self, labels: dict[Any, Any], method=None, tolerance=None
) -> IndexSelResult:
    """Sketch of ``sel`` with one hook call added for scalar labels.

    The ``...`` placeholders stand for the parts of the existing
    implementation that are left out of this sketch.
    """
    # rest of the function as before
    ...

    if isinstance(label, slice):
        indexer = _query_slice(self.index, label, coord_name, method, tolerance)
    elif is_dict_like(label):
        raise ValueError(
            "cannot use a dict-like object for selection on "
            "a dimension that does not have a MultiIndex"
        )
    else:
        label_array = normalize_label(label, dtype=self.coord_dtype)
        if label_array.ndim == 0:
            label_value = as_scalar(label_array)
            # Proposed extension point: a subclass overrides the hook to adjust
            # the scalar label; the code that follows is elided here.
            label_value = self._post_process_label_value(label_value)  # new bit
            ...
    # rest of the function as before

```

Then in my case I would not have had to copy so much of the implementation, I could have simply done

```python class PeriodicBoundaryIndex(PandasIndex): period: float

def __init__(self, *args, **kwargs):
    # Forward every positional and keyword argument unchanged to the parent constructor.
    super().__init__(*args, **kwargs)
    # Length of one full cycle; hard-coded, so every instance currently gets 360.
    self.period = 360  # TODO work out where this input should be passed in instead of hard-coding

def _wrap_periodically(self, label_value):
    """Map a label onto the equivalent value inside one period of the index.

    The result lies in the half-open interval ``[lower, lower + period)``,
    where ``lower`` is the smallest label held by ``self.index``.

    Parameters
    ----------
    label_value
        Label to wrap, possibly outside the range covered by the index.

    Returns
    -------
    ``label_value`` shifted by a whole number of ``self.period``.
    """
    lower = self.index.min()
    # Measure the offset from the lower bound, not from ``index.max()``: an
    # index rarely spans a full period (e.g. 0, 90, 180, 270 with period 360),
    # and offsetting from the maximum would then shift in-range labels too.
    # When max - min equals the period both formulas give the same result.
    return lower + (label_value - lower) % self.period

def _post_process_label_value(self, label_value):
    """Hook override: return the scalar label after periodic wrapping."""
    wrapped_value = self._wrap_periodically(label_value)
    return wrapped_value

```

Maybe this is a bad idea / doesn't make sense but I thought I would suggest it anyway.

{
    "total_count": 0,
    "+1": 0,
    "-1": 0,
    "laugh": 0,
    "hooray": 0,
    "confused": 0,
    "heart": 0,
    "rocket": 0,
    "eyes": 0
}
  1372035441
Powered by Datasette · Queries took 0.618ms · About: xarray-datasette