home / github / issue_comments

Menu
  • GraphQL API
  • Search all tables

issue_comments: 399584169

This data as json

html_url issue_url id node_id user created_at updated_at author_association body reactions performed_via_github_app issue
https://github.com/pydata/xarray/issues/2217#issuecomment-399584169 https://api.github.com/repos/pydata/xarray/issues/2217 399584169 MDEyOklzc3VlQ29tbWVudDM5OTU4NDE2OQ== 291576 2018-06-22T21:15:06Z 2018-06-22T21:15:06Z CONTRIBUTOR

Actually, I disagree. Pandas's set-operation methods are mostly index-based. For union and intersection, they have an optimization that drops down into C code when the Indexes are monotonic, but everywhere else it all works off of the results from get_indexer(). I have written a quick toy demo that seems to work. Note: I didn't know how to properly write a constructor for a subclassed Index, so I set the tolerance attribute after construction just for the purposes of this demo.

```python
from __future__ import print_function
import warnings
from pandas import Index
import numpy as np

from pandas.indexes.base import is_object_dtype, algos, is_dtype_equal
from pandas.indexes.base import _ensure_index, _concat, _values_from_object, _unsortable_types
from pandas.indexes.numeric import Float64Index

def _choose_tolerance(this, that, tolerance): if tolerance is None: tolerance = max(this.tolerance, getattr(that, 'tolerance', 0.0)) return tolerance

class ImpreciseIndex(Float64Index):
    """Demo ``Float64Index`` subclass that matches labels within a tolerance.

    The set operations (``union``, ``intersection``), ``equals`` and the
    lookup methods (``get_loc``, ``get_indexer``) accept a ``tolerance``
    keyword.  When it is omitted, the tolerance is taken from the
    ``tolerance`` attribute, which is assigned after construction.
    """

    def astype(self, dtype, copy=True):
        """Cast the values to ``dtype`` and pass them to ``ImpreciseIndex``."""
        return ImpreciseIndex(self.values.astype(dtype=dtype, copy=copy),
                              name=self.name, dtype=dtype)

    @property
    def tolerance(self):
        """Default matching tolerance; 0.0 until one has been assigned."""
        # Freshly built instances (e.g. the result of ``astype``) have no
        # ``_tolerance`` yet; fall back to 0.0 instead of raising
        # AttributeError.
        return getattr(self, '_tolerance', 0.0)

    @tolerance.setter
    def tolerance(self, tolerance):
        self._tolerance = self._convert_tolerance(tolerance)

    def _resolve_lookup(self, method, tolerance):
        """Fill in the default tolerance and lookup method for a lookup.

        Returns the ``(method, tolerance)`` pair to forward to the base
        class.  A positive tolerance with no method selects ``'nearest'``.
        """
        if tolerance is None:
            tolerance = self.tolerance
        if method is None:
            if tolerance > 0:
                method = 'nearest'
            else:
                # NOTE(review): the base-class lookups appear to reject a
                # tolerance when no method is given (confirm against the
                # pandas version in use), so a non-positive tolerance is
                # dropped and the lookup is exact.
                tolerance = None
        return method, tolerance

    def union(self, other, tolerance=None):
        """Return the union of ``self`` and ``other``.

        Labels of ``other`` that ``get_indexer`` matches to a label of
        ``self`` (using ``tolerance``) are treated as already present.

        Parameters
        ----------
        other : Index or array-like
        tolerance : number, optional
            Defaults to the value chosen by ``_choose_tolerance``.
        """
        self._assert_can_do_setop(other)
        other = _ensure_index(other)

        # Resolve the tolerance once, before any recursion, so that the
        # dtype-mismatch branch below passes it on explicitly instead of
        # re-deriving it from a freshly cast index.
        tolerance = _choose_tolerance(self, other, tolerance)

        if len(other) == 0 or self.equals(other, tolerance=tolerance):
            return self._get_consensus_name(other)

        if len(self) == 0:
            return other._get_consensus_name(self)

        if not is_dtype_equal(self.dtype, other.dtype):
            this = self.astype('O')
            other = other.astype('O')
            return this.union(other, tolerance=tolerance)

        # Positions in ``other`` that have no match in ``self``.
        indexer = self.get_indexer(other, tolerance=tolerance)
        indexer, = (indexer == -1).nonzero()

        if len(indexer) > 0:
            other_diff = algos.take_nd(other._values, indexer,
                                       allow_fill=False)
            result = _concat._concat_compat((self._values, other_diff))

            try:
                # Probe whether the two value sets are comparable at all.
                self._values[0] < other_diff[0]
            except TypeError as e:
                warnings.warn("%s, sort order is undefined for "
                              "incomparable objects" % e, RuntimeWarning,
                              stacklevel=3)
            else:
                types = frozenset((self.inferred_type,
                                   other.inferred_type))
                if not types & _unsortable_types:
                    result.sort()
        else:
            # Everything in ``other`` matched: the result is ``self``'s
            # values, sorted when possible.
            result = self._values

            try:
                result = np.sort(result)
            except TypeError as e:
                warnings.warn("%s, sort order is undefined for "
                              "incomparable objects" % e, RuntimeWarning,
                              stacklevel=3)

        # for subclasses
        return self._wrap_union_result(other, result)

    def equals(self, other, tolerance=None):
        """Return True if ``other`` is an Index equal to ``self`` within
        ``tolerance``.

        Two indexes are equal when they have the same length and every
        element-wise absolute difference is at most ``tolerance``.
        """
        if self.is_(other):
            return True

        if not isinstance(other, Index):
            return False

        if is_object_dtype(self) and not is_object_dtype(other):
            # if other is not object, use other's logic for coercion
            if isinstance(other, ImpreciseIndex):
                return other.equals(self, tolerance=tolerance)
            return other.equals(self)

        if len(self) != len(other):
            return False

        tolerance = _choose_tolerance(self, other, tolerance)
        try:
            diff = np.abs(_values_from_object(self) -
                          _values_from_object(other))
        except TypeError:
            # Values that cannot be subtracted cannot be equal.
            return False
        # ``<=`` so that a tolerance of 0 still means exact equality.
        return bool(np.all(diff <= tolerance))

    def intersection(self, other, tolerance=None):
        """Return the labels of ``self`` that match a label of ``other``.

        Matching is done by ``get_indexer`` with ``tolerance``; the
        returned labels are taken from ``self``.  The result keeps
        ``self``'s name only when both operands share it.
        """
        self._assert_can_do_setop(other)
        other = _ensure_index(other)

        # Resolved up front for the same reason as in ``union``.
        tolerance = _choose_tolerance(self, other, tolerance)

        if self.equals(other, tolerance=tolerance):
            return self._get_consensus_name(other)

        if not is_dtype_equal(self.dtype, other.dtype):
            this = self.astype('O')
            other = other.astype('O')
            return this.intersection(other, tolerance=tolerance)

        try:
            indexer = self.get_indexer(other._values, tolerance=tolerance)
            indexer = indexer.take((indexer != -1).nonzero()[0])
        except Exception:
            # duplicates
            # FIXME: get_indexer_non_unique() doesn't take a tolerance argument
            indexer = Index(self._values).get_indexer_non_unique(
                other._values)[0].unique()
            indexer = indexer[indexer != -1]

        taken = self.take(indexer)
        if self.name != other.name:
            taken.name = None
        return taken

    # TODO: Do I need to re-implement _get_unique_index()?

    def get_loc(self, key, method=None, tolerance=None):
        """Locate ``key``, defaulting to a nearest match within the
        index's tolerance."""
        method, tolerance = self._resolve_lookup(method, tolerance)
        return super(ImpreciseIndex, self).get_loc(key, method, tolerance)

    def get_indexer(self, target, method=None, limit=None, tolerance=None):
        """Compute an indexer for ``target``, defaulting to nearest
        matches within the index's tolerance."""
        method, tolerance = self._resolve_lookup(method, tolerance)
        return super(ImpreciseIndex, self).get_indexer(target, method,
                                                       limit, tolerance)

if __name__ == '__main__':
    a = ImpreciseIndex([0.1, 0.2, 0.3, 0.4])
    a.tolerance = 0.01
    b = ImpreciseIndex([0.301, 0.401, 0.501, 0.601])
    b.tolerance = 0.025
    print(a, b)
    print("a | b :", a.union(b))
    print("a & b :", a.intersection(b))
    print("a.get_indexer(b):", a.get_indexer(b))
    print("b.get_indexer(a):", b.get_indexer(a))
```

Run this and get the following results:

```
ImpreciseIndex([0.1, 0.2, 0.3, 0.4], dtype='float64') ImpreciseIndex([0.301, 0.401, 0.501, 0.601], dtype='float64')
a | b : ImpreciseIndex([0.1, 0.2, 0.3, 0.4, 0.501, 0.601], dtype='float64')
a & b : ImpreciseIndex([0.3, 0.4], dtype='float64')
a.get_indexer(b): [ 2 3 -1 -1]
b.get_indexer(a): [-1 -1 0 1]
```

This is mostly lifted from the Index base class methods, with the monotonic optimization path taken out and the tolerance argument supplied to the respective calls to get_indexer. The tolerance for a given operation is chosen as follows: unless one is provided as a keyword argument, the larger of the two objects' tolerances is used (with a fallback of 0.0 if the other object isn't an ImpreciseIndex).

{
    "total_count": 0,
    "+1": 0,
    "-1": 0,
    "laugh": 0,
    "hooray": 0,
    "confused": 0,
    "heart": 0,
    "rocket": 0,
    "eyes": 0
}
  329575874
Powered by Datasette · Queries took 0.723ms · About: xarray-datasette