issue_comments: 451052107

html_url: https://github.com/pydata/xarray/issues/1115#issuecomment-451052107
issue_url: https://api.github.com/repos/pydata/xarray/issues/1115
id: 451052107
user: 6334793
created_at: 2019-01-03T04:10:35Z
updated_at: 2019-01-03T04:14:54Z
author_association: NONE

Okay, here's what I have come up with. I have tested it against two 1-D DataArrays, two N-D DataArrays, and a 1-D DataArray paired with an N-D DataArray, in all cases with misaligned coordinates and missing values.

Before going forward:

1. What do you think of it? Any improvements?
2. Steps 1 and 2 (broadcasting and ignoring common missing values) are identical in both `cov()` and `corr()`. Is there a better way to reduce the duplication while still keeping both functions standalone?

```
import numpy as np
import xarray as xr


def cov(self, other, dim=None):
    """Compute covariance between two DataArray objects along a shared dimension.

    Parameters
    ----------
    other : DataArray
        The other array with which the covariance will be computed
    dim : str, optional
        The dimension along which the covariance will be computed

    Returns
    -------
    covariance : DataArray
    """
    # 1. Broadcast the two arrays
    self, other = xr.broadcast(self, other)

    # 2. Ignore the nans
    valid_values = self.notnull() & other.notnull()
    self = self.where(valid_values, drop=True)
    other = other.where(valid_values, drop=True)
    valid_count = valid_values.sum(dim)

    # 3. Demean both arrays along the given dim
    demeaned_self = self - self.mean(dim=dim)
    demeaned_other = other - other.mean(dim=dim)

    # 4. Compute covariance along the given dim
    if dim:
        axis = self.get_axis_num(dim=dim)
    else:
        axis = None
    cov = np.sum(demeaned_self * demeaned_other, axis=axis) / valid_count

    return cov

def corr(self, other, dim=None):
    """Compute correlation between two DataArray objects along a shared dimension.

    Parameters
    ----------
    other : DataArray
        The other array with which the correlation will be computed
    dim : str, optional
        The dimension along which the correlation will be computed

    Returns
    -------
    correlation : DataArray
    """
    # 1. Broadcast the two arrays
    self, other = xr.broadcast(self, other)

    # 2. Ignore the nans
    valid_values = self.notnull() & other.notnull()
    self = self.where(valid_values, drop=True)
    other = other.where(valid_values, drop=True)

    # 3. Compute correlation based on standard deviations and cov()
    self_std = self.std(dim=dim)
    other_std = other.std(dim=dim)

    return cov(self, other, dim=dim) / (self_std * other_std)

```
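
Regarding question 2, one idea (just a rough sketch; the `_broadcast_and_mask` name and its return values are made up here, not part of the proposal above) would be to factor steps 1 and 2 into a small shared helper that both functions call:

```
import xarray as xr


def _broadcast_and_mask(da_a, da_b):
    """Broadcast two DataArrays and keep only the values present in both."""
    # Hypothetical helper; mirrors steps 1 and 2 of cov() and corr() above.
    da_a, da_b = xr.broadcast(da_a, da_b)
    valid_values = da_a.notnull() & da_b.notnull()
    da_a = da_a.where(valid_values, drop=True)
    da_b = da_b.where(valid_values, drop=True)
    return da_a, da_b, valid_values


# cov() and corr() would then both start with:
#     self, other, valid_values = _broadcast_and_mask(self, other)
```

That would keep both functions standalone while sharing the alignment and masking logic.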

For testing:

```
# self: Load demo data and trim its size
ds = xr.tutorial.load_dataset('air_temperature')
air = ds.air[:18, ...]

# other: select misaligned data, and smooth it to dampen the correlation with self
air_smooth = ds.air[2:20, ...].rolling(time=3, center=True).mean(dim='time')

# A handy function to select an example grid point
def select_pts(da):
    return da.sel(lat=45, lon=250)

# Test #1: Misaligned 1-D DataArrays with missing values
ts1 = select_pts(air.copy())
ts2 = select_pts(air_smooth.copy())

def pd_corr(ts1,ts2):
    """Ensure the ts are aligned and missing values ignored"""
    # ts1,ts2 = xr.align(ts1,ts2)
    valid_values = ts1.notnull() & ts2.notnull()

    ts1  = ts1.where(valid_values, drop = True)
    ts2  = ts2.where(valid_values, drop = True)

    return ts1.to_series().corr(ts2.to_series())

expected = pd_corr(ts1, ts2)
actual   = corr(ts1,ts2)
np.allclose(expected, actual)

# Test #2: Misaligned N-D DataArrays with missing values
actual_ND = corr(air, air_smooth, dim='time')
actual = select_pts(actual_ND)
np.allclose(expected, actual)

# Test #3: One 1-D DataArray and another N-D DataArray; misaligned and having missing values
actual_ND = corr(air_smooth, ts1, dim='time')
actual    = select_pts(actual_ND)
np.allclose(actual, expected)

```
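
As an extra sanity check on `cov()` itself (hypothetical, not part of the tests above), the 1-D case could also be compared directly against `np.cov`, aligning and dropping missing values the same way `pd_corr()` does:

```
# Hypothetical extra check: compare cov() against np.cov on the 1-D case.
# np.cov divides by N-1 by default, while cov() above divides by the full
# valid count, so ddof=0 is passed here to match.
valid = ts1.notnull() & ts2.notnull()
a = ts1.where(valid, drop=True).values
b = ts2.where(valid, drop=True).values

expected_cov = np.cov(a, b, ddof=0)[0, 1]
actual_cov = cov(ts1, ts2)
np.allclose(expected_cov, actual_cov)
```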
