issue_comments: 399584169

This data as json

html_url issue_url id node_id user created_at updated_at author_association body reactions performed_via_github_app issue

html_url	issue_url	id	node_id	user	created_at	updated_at	author_association	body	reactions	performed_via_github_app	issue
https://github.com/pydata/xarray/issues/2217#issuecomment-399584169	https://api.github.com/repos/pydata/xarray/issues/2217	399584169	MDEyOklzc3VlQ29tbWVudDM5OTU4NDE2OQ==	291576	2018-06-22T21:15:06Z	2018-06-22T21:15:06Z	CONTRIBUTOR	Actually, I disagree. Pandas's set operations methods are mostly index-based. For union and intersection, they have an optimization that dives down into some c-code when the Indexes are monotonic, but everywhere else, it all works off of results from `get_indexer()`. I have made a quick toy demo code that seems to work. Note, I didn't know how to properly make a constructor for a subclassed Index, so I added the `tolerance` attribute after construction just for the purposes of this demo. ``` python from future import print_function import warnings from pandas import Index import numpy as np from pandas.indexes.base import is_object_dtype, algos, is_dtype_equal from pandas.indexes.base import _ensure_index, _concat, _values_from_object, _unsortable_types from pandas.indexes.numeric import Float64Index def _choose_tolerance(this, that, tolerance): if tolerance is None: tolerance = max(this.tolerance, getattr(that, 'tolerance', 0.0)) return tolerance class ImpreciseIndex(Float64Index): def astype(self, dtype, copy=True): return ImpreciseIndex(self.values.astype(dtype=dtype, copy=copy), name=self.name, dtype=dtype) @property def tolerance(self): return self._tolerance @tolerance.setter def tolerance(self, tolerance): self._tolerance = self._convert_tolerance(tolerance) def union(self, other, tolerance=None): self._assert_can_do_setop(other) other = _ensure_index(other) if len(other) == 0 or self.equals(other, tolerance=tolerance): return self._get_consensus_name(other) if len(self) == 0: return other._get_consensus_name(self) if not is_dtype_equal(self.dtype, other.dtype): this = self.astype('O') other = other.astype('O') return this.union(other, tolerance=tolerance) tolerance = _choose_tolerance(self, other, tolerance) indexer = 
self.get_indexer(other, tolerance=tolerance) indexer, = (indexer == -1).nonzero() if len(indexer) > 0: other_diff = algos.take_nd(other._values, indexer, allow_fill=False) result = _concat._concat_compat((self._values, other_diff)) try: self._values[0] < other_diff[0] except TypeError as e: warnings.warn("%s, sort order is undefined for " "incomparable objects" % e, RuntimeWarning, stacklevel=3) else: types = frozenset((self.inferred_type, other.inferred_type)) if not types & _unsortable_types: result.sort() else: result = self._values try: result = np.sort(result) except TypeError as e: warnings.warn("%s, sort order is undefined for " "incomparable objects" % e, RuntimeWarning, stacklevel=3) # for subclasses return self._wrap_union_result(other, result) def equals(self, other, tolerance=None): if self.is_(other): return True if not isinstance(other, Index): return False if is_object_dtype(self) and not is_object_dtype(other): # if other is not object, use other's logic for coercion if isinstance(other, ImpreciseIndex): return other.equals(self, tolerance=tolerance) else: return other.equals(self) if len(self) != len(other): return False tolerance = _choose_tolerance(self, other, tolerance) diff = np.abs(_values_from_object(self) - _values_from_object(other)) return np.all(diff < tolerance) def intersection(self, other, tolerance=None): self._assert_can_do_setop(other) other = _ensure_index(other) if self.equals(other, tolerance=tolerance): return self._get_consensus_name(other) if not is_dtype_equal(self.dtype, other.dtype): this = self.astype('O') other = other.astype('O') return this.intersection(other, tolerance=tolerance) tolerance = _choose_tolerance(self, other, tolerance) try: indexer = self.get_indexer(other._values, tolerance=tolerance) indexer = indexer.take((indexer != -1).nonzero()[0]) except: # duplicates # FIXME: get_indexer_non_unique() doesn't take a tolerance argument indexer = Index(self._values).get_indexer_non_unique( other._values)[0].unique() 
indexer = indexer[indexer != -1] taken = self.take(indexer) if self.name != other.name: taken.name = None return taken # TODO: Do I need to re-implement _get_unique_index()? def get_loc(self, key, method=None, tolerance=None): if tolerance is None: tolerance = self.tolerance if tolerance > 0 and method is None: method = 'nearest' return super(ImpreciseIndex, self).get_loc(key, method, tolerance) def get_indexer(self, target, method=None, limit=None, tolerance=None): if tolerance is None: tolerance = self.tolerance if tolerance > 0 and method is None: method = 'nearest' return super(ImpreciseIndex, self).get_indexer(target, method, limit, tolerance) if name == 'main': a = ImpreciseIndex([0.1, 0.2, 0.3, 0.4]) a.tolerance = 0.01 b = ImpreciseIndex([0.301, 0.401, 0.501, 0.601]) b.tolerance = 0.025 print(a, b) print("a \| b :", a.union(b)) print("a & b :", a.intersection(b)) print("a.get_indexer(b):", a.get_indexer(b)) print("b.get_indexer(a):", b.get_indexer(a)) ``` Run this and get the following results: `ImpreciseIndex([0.1, 0.2, 0.3, 0.4], dtype='float64') ImpreciseIndex([0.301, 0.401, 0.501, 0.601], dtype='float64') a \| b : ImpreciseIndex([0.1, 0.2, 0.3, 0.4, 0.501, 0.601], dtype='float64') a & b : ImpreciseIndex([0.3, 0.4], dtype='float64') a.get_indexer(b): [ 2 3 -1 -1] b.get_indexer(a): [-1 -1 0 1]` This is mostly lifted from the `Index` base class methods, just with me taking out the monotonic optimization path, and supplying the tolerance argument to the respective calls to `get_indexer`. The choice of tolerance for a given operation is that unless provided as a keyword argument, then use the larger tolerance of the two objects being compared (with a failback if the other isn't an ImpreciseIndex).	{ "total_count": 0, "+1": 0, "-1": 0, "laugh": 0, "hooray": 0, "confused": 0, "heart": 0, "rocket": 0, "eyes": 0 }		329575874

https://github.com/pydata/xarray/issues/2217#issuecomment-399584169

https://api.github.com/repos/pydata/xarray/issues/2217

399584169

MDEyOklzc3VlQ29tbWVudDM5OTU4NDE2OQ==

291576

2018-06-22T21:15:06Z

CONTRIBUTOR

Actually, I disagree. Pandas's set-operation methods are mostly index-based. For union and intersection, they have an optimization that dives down into some C code when the Indexes are monotonic, but everywhere else, it all works off of the results from `get_indexer()`. I have put together a quick toy demo that seems to work. Note, I didn't know how to properly make a constructor for a subclassed Index, so I added the `tolerance` attribute after construction just for the purposes of this demo.

```python
from __future__ import print_function
import warnings
from pandas import Index
import numpy as np

from pandas.indexes.base import is_object_dtype, algos, is_dtype_equal
from pandas.indexes.base import (_ensure_index, _concat,
                                 _values_from_object, _unsortable_types)
from pandas.indexes.numeric import Float64Index

def _choose_tolerance(this, that, tolerance):
    """Pick the tolerance to use for an operation between two indexes.

    Parameters
    ----------
    this : object with a ``tolerance`` attribute
    that : any object; its ``tolerance`` attribute is used when present
    tolerance : explicit tolerance, or None to derive one from the operands

    Returns
    -------
    ``tolerance`` unchanged when it is not None; otherwise the larger of
    ``this.tolerance`` and ``that.tolerance``, where a ``that`` without a
    ``tolerance`` attribute counts as 0.0.
    """
    if tolerance is None:
        tolerance = max(this.tolerance, getattr(that, 'tolerance', 0.0))
    return tolerance

class ImpreciseIndex(Float64Index):
    """Float64Index variant whose lookups and set operations accept a tolerance.

    Each instance carries a ``tolerance`` that is assigned after construction
    through the ``tolerance`` property.  ``get_loc`` and ``get_indexer`` fall
    back to that value when no tolerance is passed, and ``union``,
    ``intersection`` and ``equals`` use ``_choose_tolerance`` to decide which
    tolerance applies between two indexes.
    """

    def astype(self, dtype, copy=True):
        """Return an ImpreciseIndex built from the values cast to ``dtype``.

        Only the values, name and dtype are handed to the constructor here;
        the tolerance is not passed along by this call.
        """
        return ImpreciseIndex(self.values.astype(dtype=dtype, copy=copy),
                              name=self.name, dtype=dtype)

    @property
    def tolerance(self):
        """The tolerance stored on this index (must have been assigned)."""
        return self._tolerance

    @tolerance.setter
    def tolerance(self, tolerance):
        # Store whatever ``_convert_tolerance`` makes of the supplied value.
        self._tolerance = self._convert_tolerance(tolerance)

    def union(self, other, tolerance=None):
        """Form the union of ``self`` and ``other`` within a tolerance.

        Labels of ``other`` that ``get_indexer`` matches to a label of
        ``self`` (given the tolerance) are treated as already present; only
        the unmatched labels of ``other`` are appended.

        Parameters
        ----------
        other : Index or array-like (passed through ``_ensure_index``)
        tolerance : optional; when None, ``_choose_tolerance`` picks one

        Returns
        -------
        The result of ``self._wrap_union_result(other, result)``, or one of
        the operands (via ``_get_consensus_name``) on the early-exit paths.
        """
        self._assert_can_do_setop(other)
        other = _ensure_index(other)

        # Nothing to add: other is empty or already equal within tolerance.
        if len(other) == 0 or self.equals(other, tolerance=tolerance):
            return self._get_consensus_name(other)

        if len(self) == 0:
            return other._get_consensus_name(self)

        # Mixed dtypes: redo the operation on object-dtype versions.
        if not is_dtype_equal(self.dtype, other.dtype):
            this = self.astype('O')
            other = other.astype('O')
            return this.union(other, tolerance=tolerance)

        tolerance = _choose_tolerance(self, other, tolerance)

        # Positions in ``other`` whose label found no match in ``self``
        # (get_indexer reports those as -1).
        indexer = self.get_indexer(other, tolerance=tolerance)
        indexer, = (indexer == -1).nonzero()

        if len(indexer) > 0:
            other_diff = algos.take_nd(other._values, indexer,
                                       allow_fill=False)
            result = _concat._concat_compat((self._values, other_diff))

            try:
                # Probe only: raises TypeError if the values are incomparable.
                self._values[0] < other_diff[0]
            except TypeError as e:
                warnings.warn("%s, sort order is undefined for "
                              "incomparable objects" % e, RuntimeWarning,
                              stacklevel=3)
            else:
                types = frozenset((self.inferred_type,
                                   other.inferred_type))
                if not types & _unsortable_types:
                    result.sort()
        else:
            # Every label of ``other`` matched: the union is just ``self``.
            result = self._values

            try:
                result = np.sort(result)
            except TypeError as e:
                warnings.warn("%s, sort order is undefined for "
                              "incomparable objects" % e, RuntimeWarning,
                              stacklevel=3)

        # for subclasses
        return self._wrap_union_result(other, result)

    def equals(self, other, tolerance=None):
        """Tell whether ``other`` matches ``self`` element-wise within tolerance.

        Returns True when ``other`` is the same object (``self.is_``), False
        when it is not an ``Index`` or has a different length, and otherwise
        whether every absolute element-wise difference is at most the chosen
        tolerance.  The comparison is inclusive (``<=``) so that it agrees
        with the ``abs(index[indexer] - target) <= tolerance`` rule that
        pandas documents for ``get_indexer``, and so that identical values
        compare equal when the tolerance is 0.
        """
        if self.is_(other):
            return True

        if not isinstance(other, Index):
            return False

        if is_object_dtype(self) and not is_object_dtype(other):
            # if other is not object, use other's logic for coercion
            if isinstance(other, ImpreciseIndex):
                return other.equals(self, tolerance=tolerance)
            else:
                return other.equals(self)

        if len(self) != len(other):
            return False

        tolerance = _choose_tolerance(self, other, tolerance)
        diff = np.abs(_values_from_object(self) -
                      _values_from_object(other))
        # bool() keeps the return type in line with the True/False above.
        return bool(np.all(diff <= tolerance))

    def intersection(self, other, tolerance=None):
        """Form the intersection of ``self`` and ``other`` within a tolerance.

        The result is taken from ``self`` at the positions that
        ``get_indexer`` matched for the labels of ``other``.  Its name is
        cleared when the two operands have different names.

        Parameters
        ----------
        other : Index or array-like (passed through ``_ensure_index``)
        tolerance : optional; when None, ``_choose_tolerance`` picks one
        """
        self._assert_can_do_setop(other)
        other = _ensure_index(other)

        if self.equals(other, tolerance=tolerance):
            return self._get_consensus_name(other)

        # Mixed dtypes: redo the operation on object-dtype versions.
        if not is_dtype_equal(self.dtype, other.dtype):
            this = self.astype('O')
            other = other.astype('O')
            return this.intersection(other, tolerance=tolerance)

        tolerance = _choose_tolerance(self, other, tolerance)
        try:
            indexer = self.get_indexer(other._values, tolerance=tolerance)
            # Keep only the positions that actually matched (drop the -1s).
            indexer = indexer.take((indexer != -1).nonzero()[0])
        except Exception:
            # duplicates
            # FIXME: get_indexer_non_unique() doesn't take a tolerance argument
            # (so this fallback path matches labels exactly).
            # ``except Exception`` rather than a bare ``except`` so that
            # KeyboardInterrupt/SystemExit are not swallowed here.
            indexer = Index(self._values).get_indexer_non_unique(
                other._values)[0].unique()
            indexer = indexer[indexer != -1]

        taken = self.take(indexer)
        if self.name != other.name:
            taken.name = None
        return taken

    # TODO: Do I need to re-implement _get_unique_index()?

    def get_loc(self, key, method=None, tolerance=None):
        """Look up ``key`` via the parent ``get_loc``, defaulting the tolerance.

        When ``tolerance`` is None the index's own tolerance is used, and
        when that tolerance is positive and no ``method`` was given, the
        lookup switches to ``'nearest'``.
        """
        if tolerance is None:
            tolerance = self.tolerance
        if tolerance > 0 and method is None:
            method = 'nearest'
        return super(ImpreciseIndex, self).get_loc(key, method, tolerance)

    def get_indexer(self, target, method=None, limit=None, tolerance=None):
        """Compute an indexer via the parent ``get_indexer``, defaulting the tolerance.

        When ``tolerance`` is None the index's own tolerance is used, and
        when that tolerance is positive and no ``method`` was given, the
        lookup switches to ``'nearest'``.
        """
        if tolerance is None:
            tolerance = self.tolerance
        if tolerance > 0 and method is None:
            method = 'nearest'
        return super(ImpreciseIndex, self).get_indexer(target, method, limit,
                                                       tolerance)

if __name__ == '__main__':
    a = ImpreciseIndex([0.1, 0.2, 0.3, 0.4])
    a.tolerance = 0.01
    b = ImpreciseIndex([0.301, 0.401, 0.501, 0.601])
    b.tolerance = 0.025
    print(a, b)
    print("a | b :", a.union(b))
    print("a & b :", a.intersection(b))
    print("a.get_indexer(b):", a.get_indexer(b))
    print("b.get_indexer(a):", b.get_indexer(a))
```

Run this and get the following results: ImpreciseIndex([0.1, 0.2, 0.3, 0.4], dtype='float64') ImpreciseIndex([0.301, 0.401, 0.501, 0.601], dtype='float64') a | b : ImpreciseIndex([0.1, 0.2, 0.3, 0.4, 0.501, 0.601], dtype='float64') a & b : ImpreciseIndex([0.3, 0.4], dtype='float64') a.get_indexer(b): [ 2 3 -1 -1] b.get_indexer(a): [-1 -1 0 1]

This is mostly lifted from the `Index` base class methods, just with me taking out the monotonic optimization path and supplying the tolerance argument to the respective calls to `get_indexer`. The tolerance for a given operation is chosen as follows: unless one is provided as a keyword argument, the larger of the two objects' tolerances is used (with a fallback of 0.0 if the other object isn't an ImpreciseIndex).

{
    "total_count": 0,
    "+1": 0,
    "-1": 0,
    "laugh": 0,
    "hooray": 0,
    "confused": 0,
    "heart": 0,
    "rocket": 0,
    "eyes": 0
}

329575874