id,node_id,number,title,user,state,locked,assignee,milestone,comments,created_at,updated_at,closed_at,author_association,active_lock_reason,draft,pull_request,body,reactions,performed_via_github_app,state_reason,repo,type
466994138,MDU6SXNzdWU0NjY5OTQxMzg=,3096,Support parallel writes to zarr store,18643609,closed,0,,,9,2019-07-11T16:31:25Z,2022-04-08T04:43:15Z,2022-04-08T04:43:14Z,NONE,,,,"#### MCVE Code Sample
```python
import multiprocessing
import xarray as xr
import numpy as np
from s3fs import S3FileSystem, S3Map
from time import sleep
from main import set_aws_credentials  # credentials helper from the author's own script

set_aws_credentials()

def download_file(lead_time):
    return 'path_to_your_file'

def make_xarray_dataset(file_path, lead_time):
    var1 = np.random.rand(1, 721, 1440, 22)
    var2 = np.random.rand(1, 721, 1440, 22)
    lat = np.linspace(-90, 90, 721)
    lon = np.linspace(0, 360, 1440)
    height = range(22)
    ds = xr.Dataset({'var1': (['lead_time', 'lat', 'lon', 'height'], var1),
                     'var2': (['lead_time', 'lat', 'lon', 'height'], var2)},
                    coords={'lat': lat,
                            'lon': lon,
                            'height': height,
                            'lead_time': [lead_time]})
    return ds

def upload_to_s3(dataset, append):
    s3 = S3FileSystem()
    s3map = S3Map('S3_path_to_your_zarr', s3=s3)
    if append:
        # We are appending to an already existing dataset
        dataset.to_zarr(store=s3map, mode='a', append_dim='lead_time')
    else:
        dataset.to_zarr(store=s3map, mode='w')

def lead_time_worker(lead_time, append=True):
    file_path = download_file(lead_time)
    dataset = make_xarray_dataset(file_path, lead_time)
    upload_to_s3(dataset, append=append)  # pass the flag through to the writer
    return 0

if __name__ == '__main__':
    lead_times = range(10)
    first_lead_time = True
    processes = []
    for lead_time in lead_times:
        if first_lead_time:
            # The first write creates the store; run it alone and wait for it
            process = multiprocessing.Process(target=lead_time_worker,
                                              args=(lead_time, False))
            process.start()
            process.join()
            first_lead_time = False
        else:
            process = multiprocessing.Process(target=lead_time_worker,
                                              args=(lead_time,))
            process.start()
            processes.append(process)
            sleep(5)  # stagger the processes so they don't all begin at the same time
    for p in processes:
        p.join()
```
will raise
> ValueError: conflicting sizes for dimension 'lead_time': length X on 'var1' and length Y on 'var2'
```
Traceback (most recent call last):
File ""main.py"", line 200, in lead_time_worker
upload_to_s3(dataset, cloud_zarr_path, append=True)
File ""main.py"", line 167, in upload_to_gcloud
ds.to_zarr(store=s3map, mode='a', append_dim='lead_time')
File ""/home/ubuntu/anaconda3/envs/GFS-data-retrieval/lib/python3.7/site-packages/xarray/core/dataset.py"", line 1414, in to_zarr
consolidated=consolidated, append_dim=append_dim)
File ""/home/ubuntu/anaconda3/envs/GFS-data-retrieval/lib/python3.7/site-packages/xarray/backends/api.py"", line 1101, in to_zarr
dump_to_store(dataset, zstore, writer, encoding=encoding)
File ""/home/ubuntu/anaconda3/envs/GFS-data-retrieval/lib/python3.7/site-packages/xarray/backends/api.py"", line 929, in dump_to_store
unlimited_dims=unlimited_dims)
File ""/home/ubuntu/anaconda3/envs/GFS-data-retrieval/lib/python3.7/site-packages/xarray/backends/zarr.py"", line 354, in store
ds = open_zarr(self.ds.store, chunks=None)
File ""/home/ubuntu/anaconda3/envs/GFS-data-retrieval/lib/python3.7/site-packages/xarray/backends/zarr.py"", line 557, in open_zarr
ds = maybe_decode_store(zarr_store)
File ""/home/ubuntu/anaconda3/envs/GFS-data-retrieval/lib/python3.7/site-packages/xarray/backends/zarr.py"", line 545, in maybe_decode_store
drop_variables=drop_variables)
File ""/home/ubuntu/anaconda3/envs/GFS-data-retrieval/lib/python3.7/site-packages/xarray/conventions.py"", line 527, in decode_cf
ds = Dataset(vars, attrs=attrs)
File ""/home/ubuntu/anaconda3/envs/GFS-data-retrieval/lib/python3.7/site-packages/xarray/core/dataset.py"", line 423, in __init__
self._set_init_vars_and_dims(data_vars, coords, compat)
File ""/home/ubuntu/anaconda3/envs/GFS-data-retrieval/lib/python3.7/site-packages/xarray/core/dataset.py"", line 445, in _set_init_vars_and_dims
data_vars, coords, compat=compat)
File ""/home/ubuntu/anaconda3/envs/GFS-data-retrieval/lib/python3.7/site-packages/xarray/core/merge.py"", line 379, in merge_data_and_coords
indexes=indexes)
File ""/home/ubuntu/anaconda3/envs/GFS-data-retrieval/lib/python3.7/site-packages/xarray/core/merge.py"", line 460, in merge_core
dims = calculate_dimensions(variables)
File ""/home/ubuntu/anaconda3/envs/GFS-data-retrieval/lib/python3.7/site-packages/xarray/core/dataset.py"", line 125, in calculate_dimensions
(dim, size, k, dims[dim], last_used[dim]))
ValueError: conflicting sizes for dimension 'lead_time': length X on 'var1' and length Y on 'var2'
```
#### Problem Description
First of all, thanks a lot to the community for PR #2706, which I had really been looking forward to. I have already tried the new append functionality, but ran into problems using it in parallel.
I want to upload a very large zarr store (global numerical weather prediction output from the GFS model, which you can check out [here](https://ftp.ncep.noaa.gov/data/nccf/com/gfs/prod/)) to an S3 bucket. Each source file contains the data for one lead time (the length of time between the issuance of a forecast and the occurrence of the phenomena it predicts), and I want to concatenate them all. To speed this up, I tried running one process per lead time, with all of them appending to the same store using [Dataset.to_zarr()](https://github.com/pydata/xarray/blob/8f0d9e5c9909c93a90306ed7cb5a80c1c2e1c97d/xarray/core/dataset.py#L1400) with `mode='a'`.
However, when I do that, I get the error described above. Because the processes append simultaneously, the store is not necessarily consistent when a new process tries to append: some variables already contain the values for a given lead time while others do not, since the writing process has not finished yet, which leads [calculate_dimensions()](https://github.com/pydata/xarray/blob/8f0d9e5c9909c93a90306ed7cb5a80c1c2e1c97d/xarray/core/dataset.py#L113) to raise this error.
Is there a way I have missed to work around this, perhaps simply with a synchronizer? If not, do you think it would be possible (and reasonable) to add a parameter that bypasses this check on the append dimension, in an 'eventually consistent' fashion?
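For what it's worth, a zarr synchronizer alone would likely not be enough here: it guards concurrent writes to individual chunks, not the dataset-level dimension check that fails above. One race-free alternative is the `region` keyword that later xarray releases (0.16.1+) added to `to_zarr`: initialize the full store once with `compute=False`, then let each process overwrite only its own pre-allocated slab. Below is a minimal local sketch of that pattern; the path `example.zarr`, the pool size, and the random data are placeholders standing in for the S3 store and the real GFS fields above.
```python
import multiprocessing

import dask.array as da
import numpy as np
import xarray as xr

N_LEAD_TIMES = 10

def make_template():
    # Dask-backed template spanning the full lead_time axis; with
    # compute=False below, only metadata and coordinates get written.
    data = da.zeros((N_LEAD_TIMES, 721, 1440, 22), chunks=(1, 721, 1440, 22))
    return xr.Dataset(
        {'var1': (['lead_time', 'lat', 'lon', 'height'], data),
         'var2': (['lead_time', 'lat', 'lon', 'height'], data)},
        coords={'lead_time': np.arange(N_LEAD_TIMES),
                'lat': np.linspace(-90, 90, 721),
                'lon': np.linspace(0, 360, 1440),
                'height': np.arange(22)})

def write_one(lead_time):
    # Stand-in for make_xarray_dataset() above; only the 'lead_time'
    # coordinate is carried (its values match the template), so nothing
    # needs to be dropped for the region write.
    ds = xr.Dataset(
        {'var1': (['lead_time', 'lat', 'lon', 'height'],
                  np.random.rand(1, 721, 1440, 22)),
         'var2': (['lead_time', 'lat', 'lon', 'height'],
                  np.random.rand(1, 721, 1440, 22))},
        coords={'lead_time': [lead_time]})
    # Each process overwrites exactly one pre-allocated slab; there is
    # no append, hence no metadata race.
    ds.to_zarr('example.zarr',
               region={'lead_time': slice(lead_time, lead_time + 1)})

if __name__ == '__main__':
    # Write the store layout once, up front, from a single process.
    make_template().to_zarr('example.zarr', mode='w', compute=False)
    with multiprocessing.Pool(4) as pool:
        pool.map(write_one, range(N_LEAD_TIMES))
```
Because each region covers exactly one chunk along `lead_time`, no two processes ever touch the same chunk, and no synchronizer is needed.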
#### Output of ``xr.show_versions()``
INSTALLED VERSIONS
------------------
commit: None
python: 3.7.3 (default, Mar 27 2019, 22:11:17)
[GCC 7.3.0]
python-bits: 64
OS: Linux
OS-release: 4.15.0-1032-aws
machine: x86_64
processor: x86_64
byteorder: little
LC_ALL: None
LANG: C.UTF-8
LOCALE: en_US.UTF-8
libhdf5: None
libnetcdf: None
xarray: 0.12.2
pandas: 0.24.2
numpy: 1.16.4
scipy: None
netCDF4: None
pydap: None
h5netcdf: None
h5py: None
Nio: 1.5.5
zarr: 2.3.2
cftime: None
nc_time_axis: None
PseudonetCDF: None
rasterio: None
cfgrib: None
iris: None
bottleneck: None
dask: 2.0.0
distributed: 2.0.1
matplotlib: None
cartopy: None
seaborn: None
numbagg: None
setuptools: 41.0.1
pip: 19.1.1
conda: None
pytest: None
IPython: None
sphinx: None
","{""url"": ""https://api.github.com/repos/pydata/xarray/issues/3096/reactions"", ""total_count"": 1, ""+1"": 0, ""-1"": 0, ""laugh"": 0, ""hooray"": 0, ""confused"": 0, ""heart"": 1, ""rocket"": 0, ""eyes"": 0}",,completed,13221727,issue
474582412,MDU6SXNzdWU0NzQ1ODI0MTI=,3170,Dataset.to_zarr() with mode='a' does not work with groups,18643609,closed,0,,,0,2019-07-30T13:22:58Z,2020-03-02T12:19:16Z,2020-03-02T12:19:16Z,NONE,,,,"#### MCVE Code Sample
```python
import xarray as xr
import numpy as np
from s3fs import S3FileSystem, S3Map

s3 = S3FileSystem()
bucket_name = 'your-bucket-name'
s3_path = bucket_name + '/some_path.zarr'  # path inside the bucket
store = S3Map(s3_path, s3=s3)

for i in range(6):
    if i % 2 == 0:
        group = 'Group1'
    else:
        group = 'Group2'
    lead_time = i // 2
    var1 = np.random.rand(1)
    ds = xr.Dataset({'var1': (['lead_time'], var1)},
                    coords={'lead_time': [lead_time]})
    ds.to_zarr(store=store, mode='a', append_dim='lead_time', group=group)
```
#### Output
This code returns the following error:
```
Traceback (most recent call last):
File ""/home/vincent/anaconda3/envs/hanover_backend/lib/python3.7/site-packages/xarray/core/dataset.py"", line 1019, in _construct_dataarray
variable = self._variables[name]
KeyError: 'var1'
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File ""/home/vincent/Documents/Greenlytics/SiteForecast/debugging-script.py"", line 201, in
ds.to_zarr(store=store, mode='a', append_dim='lead_time', group=group)
File ""/home/vincent/anaconda3/envs/hanover_backend/lib/python3.7/site-packages/xarray/core/dataset.py"", line 1433, in to_zarr
consolidated=consolidated, append_dim=append_dim)
File ""/home/vincent/anaconda3/envs/hanover_backend/lib/python3.7/site-packages/xarray/backends/api.py"", line 1101, in to_zarr
dump_to_store(dataset, zstore, writer, encoding=encoding)
File ""/home/vincent/anaconda3/envs/hanover_backend/lib/python3.7/site-packages/xarray/backends/api.py"", line 929, in dump_to_store
unlimited_dims=unlimited_dims)
File ""/home/vincent/anaconda3/envs/hanover_backend/lib/python3.7/site-packages/xarray/backends/zarr.py"", line 358, in store
variables_with_encoding[vn].encoding = ds[vn].encoding
File ""/home/vincent/anaconda3/envs/hanover_backend/lib/python3.7/site-packages/xarray/core/dataset.py"", line 1103, in __getitem__
return self._construct_dataarray(key)
File ""/home/vincent/anaconda3/envs/hanover_backend/lib/python3.7/site-packages/xarray/core/dataset.py"", line 1022, in _construct_dataarray
self._variables, name, self._level_coords, self.dims)
File ""/home/vincent/anaconda3/envs/hanover_backend/lib/python3.7/site-packages/xarray/core/dataset.py"", line 91, in _get_virtual_variable
ref_var = variables[ref_name]
KeyError: 'var1'
```
The KeyError can occur on a variable name as well as on a dimension name, depending on the run.
#### Problem Description
I am trying to use the append mode introduced in PR #2706 with zarr groups. As the traceback above shows, this raises a KeyError. Is this a bug, or a feature that is not supported (yet)?
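In the meantime, a possible workaround (just a sketch, not tested here; the bucket path is a placeholder as above) is to avoid the `group=` keyword entirely and root a separate mapper at each group's own prefix, so that every append targets an independent, ungrouped store:
```python
import numpy as np
import xarray as xr
from s3fs import S3FileSystem, S3Map

s3 = S3FileSystem()
base = 'your-bucket-name/some_path.zarr'  # placeholder, as in the MCVE

for i in range(6):
    group = 'Group1' if i % 2 == 0 else 'Group2'
    # Root the mapper at the group's own prefix, so to_zarr sees a
    # plain, ungrouped store and the group= code path is never taken.
    store = S3Map(base + '/' + group, s3=s3)
    ds = xr.Dataset({'var1': (['lead_time'], np.random.rand(1))},
                    coords={'lead_time': [i // 2]})
    if i < 2:
        ds.to_zarr(store=store, mode='w')  # first write creates each group
    else:
        ds.to_zarr(store=store, mode='a', append_dim='lead_time')
```
The on-disk layout ends up nested the same way, although each group then has to be opened via its own path (e.g. `xr.open_zarr(S3Map(base + '/Group1', s3=s3))`) rather than with the `group=` argument.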
#### Output of ``xr.show_versions()``
INSTALLED VERSIONS
------------------
commit: None
python: 3.7.1 (default, Dec 14 2018, 19:28:38)
[GCC 7.3.0]
python-bits: 64
OS: Linux
OS-release: 4.18.0-20-generic
machine: x86_64
processor: x86_64
byteorder: little
LC_ALL: None
LANG: en_US.UTF-8
LOCALE: en_US.UTF-8
libhdf5: 1.10.2
libnetcdf: 4.6.3
xarray: 0.12.3
pandas: 0.24.2
numpy: 1.16.2
scipy: 1.2.1
netCDF4: 1.5.1.2
pydap: None
h5netcdf: None
h5py: None
Nio: None
zarr: 2.3.1
cftime: 1.0.3.4
nc_time_axis: None
PseudoNetCDF: None
rasterio: None
cfgrib: 0.9.6.1.post1
iris: None
bottleneck: None
dask: 1.1.5
distributed: None
matplotlib: 3.0.3
cartopy: None
seaborn: None
numbagg: None
setuptools: 40.8.0
pip: 19.0.3
conda: None
pytest: None
IPython: 7.5.0
sphinx: None
","{""url"": ""https://api.github.com/repos/pydata/xarray/issues/3170/reactions"", ""total_count"": 0, ""+1"": 0, ""-1"": 0, ""laugh"": 0, ""hooray"": 0, ""confused"": 0, ""heart"": 0, ""rocket"": 0, ""eyes"": 0}",,completed,13221727,issue