id,node_id,number,title,user,state,locked,assignee,milestone,comments,created_at,updated_at,closed_at,author_association,active_lock_reason,draft,pull_request,body,reactions,performed_via_github_app,state_reason,repo,type
466994138,MDU6SXNzdWU0NjY5OTQxMzg=,3096,Support parallel writes to zarr store,18643609,closed,0,,,9,2019-07-11T16:31:25Z,2022-04-08T04:43:15Z,2022-04-08T04:43:14Z,NONE,,,,"#### MCVE Code Sample

```python
import multiprocessing
from time import sleep

import numpy as np
import xarray as xr
from s3fs import S3FileSystem, S3Map

from main import set_aws_credentials  # local helper that sets AWS credentials

set_aws_credentials()


def download_file(lead_time):
    return 'path_to_your_file'


def make_xarray_dataset(file_path, lead_time):
    var1 = np.random.rand(1, 721, 1440, 22)
    var2 = np.random.rand(1, 721, 1440, 22)
    lat = np.linspace(-90, 90, 721)
    lon = np.linspace(0, 360, 1440)
    height = range(22)
    ds = xr.Dataset({'var1': (['lead_time', 'lat', 'lon', 'height'], var1),
                     'var2': (['lead_time', 'lat', 'lon', 'height'], var2)},
                    coords={'lat': lat,
                            'lon': lon,
                            'height': height,
                            'lead_time': [lead_time]})
    return ds


def upload_to_s3(dataset, append):
    s3 = S3FileSystem()
    s3map = S3Map('S3_path_to_your_zarr', s3=s3)
    if append:
        # Append to an already existing dataset
        dataset.to_zarr(store=s3map, mode='a', append_dim='lead_time')
    else:
        dataset.to_zarr(store=s3map, mode='w')


def lead_time_worker(lead_time, append=True):
    file_path = download_file(lead_time)
    dataset = make_xarray_dataset(file_path, lead_time)
    upload_to_s3(dataset, append=append)
    return 0


if __name__ == '__main__':
    lead_times = range(10)
    first_lead_time = True
    processes = []
    for lead_time in lead_times:
        if first_lead_time:
            # The initial write must finish before any append starts
            process = multiprocessing.Process(target=lead_time_worker,
                                              args=(lead_time, False))
            process.start()
            process.join()
            first_lead_time = False
        else:
            process = multiprocessing.Process(target=lead_time_worker,
                                              args=(lead_time,))
            process.start()
            processes.append(process)
            sleep(5)  # Stagger the processes so they don't begin at the same time
    for p in processes:
        p.join()
```

will raise

> ValueError: conflicting sizes for dimension 'lead_time': length X on 'var1' and length Y on 'var2'
<details>

```
Traceback (most recent call last):
  File ""main.py"", line 200, in lead_time_worker
    upload_to_s3(dataset, cloud_zarr_path, append=True)
  File ""main.py"", line 167, in upload_to_gcloud
    ds.to_zarr(store=s3map, mode='a', append_dim='lead_time')
  File ""/home/ubuntu/anaconda3/envs/GFS-data-retrieval/lib/python3.7/site-packages/xarray/core/dataset.py"", line 1414, in to_zarr
    consolidated=consolidated, append_dim=append_dim)
  File ""/home/ubuntu/anaconda3/envs/GFS-data-retrieval/lib/python3.7/site-packages/xarray/backends/api.py"", line 1101, in to_zarr
    dump_to_store(dataset, zstore, writer, encoding=encoding)
  File ""/home/ubuntu/anaconda3/envs/GFS-data-retrieval/lib/python3.7/site-packages/xarray/backends/api.py"", line 929, in dump_to_store
    unlimited_dims=unlimited_dims)
  File ""/home/ubuntu/anaconda3/envs/GFS-data-retrieval/lib/python3.7/site-packages/xarray/backends/zarr.py"", line 354, in store
    ds = open_zarr(self.ds.store, chunks=None)
  File ""/home/ubuntu/anaconda3/envs/GFS-data-retrieval/lib/python3.7/site-packages/xarray/backends/zarr.py"", line 557, in open_zarr
    ds = maybe_decode_store(zarr_store)
  File ""/home/ubuntu/anaconda3/envs/GFS-data-retrieval/lib/python3.7/site-packages/xarray/backends/zarr.py"", line 545, in maybe_decode_store
    drop_variables=drop_variables)
  File ""/home/ubuntu/anaconda3/envs/GFS-data-retrieval/lib/python3.7/site-packages/xarray/conventions.py"", line 527, in decode_cf
    ds = Dataset(vars, attrs=attrs)
  File ""/home/ubuntu/anaconda3/envs/GFS-data-retrieval/lib/python3.7/site-packages/xarray/core/dataset.py"", line 423, in __init__
    self._set_init_vars_and_dims(data_vars, coords, compat)
  File ""/home/ubuntu/anaconda3/envs/GFS-data-retrieval/lib/python3.7/site-packages/xarray/core/dataset.py"", line 445, in _set_init_vars_and_dims
    data_vars, coords, compat=compat)
  File ""/home/ubuntu/anaconda3/envs/GFS-data-retrieval/lib/python3.7/site-packages/xarray/core/merge.py"", line 379, in merge_data_and_coords
    indexes=indexes)
  File ""/home/ubuntu/anaconda3/envs/GFS-data-retrieval/lib/python3.7/site-packages/xarray/core/merge.py"", line 460, in merge_core
    dims = calculate_dimensions(variables)
  File ""/home/ubuntu/anaconda3/envs/GFS-data-retrieval/lib/python3.7/site-packages/xarray/core/dataset.py"", line 125, in calculate_dimensions
    (dim, size, k, dims[dim], last_used[dim]))
ValueError: conflicting sizes for dimension 'lead_time': length X on 'var1' and length Y on 'var2'
```
</details>
#### Problem Description

First of all, thanks a lot to the community for PR #2706; I was really looking forward to it. I have already tried the new append parameter, and ran into problems when using it in parallel.

I want to upload a very big zarr (a global numerical weather prediction, the output of the GFS model, which you can check out [here](https://ftp.ncep.noaa.gov/data/nccf/com/gfs/prod/)) to an S3 bucket. Each source file contains the data for one lead time (the length of time between the issuance of a forecast and the occurrence of the phenomena that were predicted), and I want to concatenate them all. To speed this up, I tried to run one process per lead time, with all of them appending to the same data store using [Dataset.to_zarr()](https://github.com/pydata/xarray/blob/8f0d9e5c9909c93a90306ed7cb5a80c1c2e1c97d/xarray/core/dataset.py#L1400) with `append=True`. However, when doing that, I get the error described above. Because the processes append simultaneously, the store is not necessarily in a consistent state when a new process tries to append: some variables already have the values of one lead time and some do not, since the other process has not finished, which leads [calculate_dimensions()](https://github.com/pydata/xarray/blob/8f0d9e5c9909c93a90306ed7cb5a80c1c2e1c97d/xarray/core/dataset.py#L113) to raise this error.

I wonder if there is a way I haven't found to work around this simply with a synchronizer? If not, do you think it would be possible (and reasonable) to implement a parameter allowing this check on the append dimension to be bypassed, in an 'eventually consistent' approach?
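For later readers: xarray 0.16.2 and newer grew a `region` argument on `to_zarr()` that sidesteps concurrent appends entirely; each process writes only its own pre-allocated slab, so no process ever reads or extends shared dimension metadata. Below is a minimal sketch of that pattern under stated assumptions: the local path 'example.zarr' stands in for the S3 mapper above, and the grid is shrunk from the MCVE so it runs quickly.

```python
import multiprocessing

import numpy as np
import xarray as xr

N_LEAD_TIMES = 10
DIMS = ['lead_time', 'lat', 'lon', 'height']
SHAPE = (1, 73, 144, 22)  # one lead time; grid shrunk from the MCVE


def region_worker(i):
    # Each worker fills in only its own slab along lead_time, so there
    # is no race on the dimension metadata that concurrent appends hit.
    ds = xr.Dataset({'var1': (DIMS, np.random.rand(*SHAPE)),
                     'var2': (DIMS, np.random.rand(*SHAPE))},
                    coords={'lead_time': [i]})
    ds.to_zarr('example.zarr', region={'lead_time': slice(i, i + 1)})


if __name__ == '__main__':
    # Lay out the full-size store once up front. One zarr chunk per
    # lead time keeps the parallel writers from sharing any chunk.
    # (With dask-backed arrays, compute=False here would write only
    # the metadata instead of placeholder zeros.)
    full_shape = (N_LEAD_TIMES,) + SHAPE[1:]
    template = xr.Dataset({'var1': (DIMS, np.zeros(full_shape)),
                           'var2': (DIMS, np.zeros(full_shape))},
                          coords={'lead_time': range(N_LEAD_TIMES),
                                  'lat': np.linspace(-90, 90, 73),
                                  'lon': np.linspace(0, 360, 144),
                                  'height': range(22)})
    chunking = {'chunks': SHAPE}
    template.to_zarr('example.zarr', mode='w',
                     encoding={'var1': chunking, 'var2': chunking})
    processes = [multiprocessing.Process(target=region_worker, args=(i,))
                 for i in range(N_LEAD_TIMES)]
    for p in processes:
        p.start()
    for p in processes:
        p.join()
```

A chunk-level zarr synchronizer alone is unlikely to help with the original error, since the conflict is between whole multi-variable append operations rather than individual chunk writes; short of region writes, serializing the appends (e.g. with a `multiprocessing.Lock` around `to_zarr`) avoids the race at the cost of the parallelism.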
#### Output of ``xr.show_versions()``

<details>

INSTALLED VERSIONS
------------------
commit: None
python: 3.7.3 (default, Mar 27 2019, 22:11:17) [GCC 7.3.0]
python-bits: 64
OS: Linux
OS-release: 4.15.0-1032-aws
machine: x86_64
processor: x86_64
byteorder: little
LC_ALL: None
LANG: C.UTF-8
LOCALE: en_US.UTF-8
libhdf5: None
libnetcdf: None

xarray: 0.12.2
pandas: 0.24.2
numpy: 1.16.4
scipy: None
netCDF4: None
pydap: None
h5netcdf: None
h5py: None
Nio: 1.5.5
zarr: 2.3.2
cftime: None
nc_time_axis: None
PseudonetCDF: None
rasterio: None
cfgrib: None
iris: None
bottleneck: None
dask: 2.0.0
distributed: 2.0.1
matplotlib: None
cartopy: None
seaborn: None
numbagg: None
setuptools: 41.0.1
pip: 19.1.1
conda: None
pytest: None
IPython: None
sphinx: None
","{""url"": ""https://api.github.com/repos/pydata/xarray/issues/3096/reactions"", ""total_count"": 1, ""+1"": 0, ""-1"": 0, ""laugh"": 0, ""hooray"": 0, ""confused"": 0, ""heart"": 1, ""rocket"": 0, ""eyes"": 0}",,completed,13221727,issue 474582412,MDU6SXNzdWU0NzQ1ODI0MTI=,3170,Dataset.to_zarr() with mode='a' does not work with groups,18643609,closed,0,,,0,2019-07-30T13:22:58Z,2020-03-02T12:19:16Z,2020-03-02T12:19:16Z,NONE,,,,"#### MCVE Code Sample ```python import xarray as xr import numpy as np from s3fs import S3FileSystem, S3Map s3 = S3FileSystem() bucket_name = 'your-bucket-name' s3_path = bucket_name + 'some_path.zarr' store = S3Map(s3_path, s3=s3) for i in range(6): if i%2 == 0: group = 'Group1' else: group = 'Group2' lead_time = i//2 var1 = np.random.rand(1) ds = xr.Dataset({'var1': (['lead_time'], var1)}, coords={'lead_time': [lead_time]}) ds.to_zarr(store=store, mode='a', append_dim='lead_time', group=group) ``` #### Output This code returns the following error:
```
Traceback (most recent call last):
  File ""/home/vincent/anaconda3/envs/hanover_backend/lib/python3.7/site-packages/xarray/core/dataset.py"", line 1019, in _construct_dataarray
    variable = self._variables[name]
KeyError: 'var1'

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File ""/home/vincent/Documents/Greenlytics/SiteForecast/debugging-script.py"", line 201, in <module>
    ds.to_zarr(store=store, mode='a', append_dim='lead_time', group=group)
  File ""/home/vincent/anaconda3/envs/hanover_backend/lib/python3.7/site-packages/xarray/core/dataset.py"", line 1433, in to_zarr
    consolidated=consolidated, append_dim=append_dim)
  File ""/home/vincent/anaconda3/envs/hanover_backend/lib/python3.7/site-packages/xarray/backends/api.py"", line 1101, in to_zarr
    dump_to_store(dataset, zstore, writer, encoding=encoding)
  File ""/home/vincent/anaconda3/envs/hanover_backend/lib/python3.7/site-packages/xarray/backends/api.py"", line 929, in dump_to_store
    unlimited_dims=unlimited_dims)
  File ""/home/vincent/anaconda3/envs/hanover_backend/lib/python3.7/site-packages/xarray/backends/zarr.py"", line 358, in store
    variables_with_encoding[vn].encoding = ds[vn].encoding
  File ""/home/vincent/anaconda3/envs/hanover_backend/lib/python3.7/site-packages/xarray/core/dataset.py"", line 1103, in __getitem__
    return self._construct_dataarray(key)
  File ""/home/vincent/anaconda3/envs/hanover_backend/lib/python3.7/site-packages/xarray/core/dataset.py"", line 1022, in _construct_dataarray
    self._variables, name, self._level_coords, self.dims)
  File ""/home/vincent/anaconda3/envs/hanover_backend/lib/python3.7/site-packages/xarray/core/dataset.py"", line 91, in _get_virtual_variable
    ref_var = variables[ref_name]
KeyError: 'var1'
```
</details>
The KeyError can happen on a variable name as well as on a dimension name, it depends on the runs. #### Problem Description I am trying to use the append mode introduced in the PR #2706 on zarr groups. This raises a KeyError as you can see in the trace above. Is it a bug or a feature that is not supported (yet)? #### Output of ``xr.show_versions()``
#### Output of ``xr.show_versions()``

<details>

INSTALLED VERSIONS
------------------
commit: None
python: 3.7.1 (default, Dec 14 2018, 19:28:38) [GCC 7.3.0]
python-bits: 64
OS: Linux
OS-release: 4.18.0-20-generic
machine: x86_64
processor: x86_64
byteorder: little
LC_ALL: None
LANG: en_US.UTF-8
LOCALE: en_US.UTF-8
libhdf5: 1.10.2
libnetcdf: 4.6.3

xarray: 0.12.3
pandas: 0.24.2
numpy: 1.16.2
scipy: 1.2.1
netCDF4: 1.5.1.2
pydap: None
h5netcdf: None
h5py: None
Nio: None
zarr: 2.3.1
cftime: 1.0.3.4
nc_time_axis: None
PseudoNetCDF: None
rasterio: None
cfgrib: 0.9.6.1.post1
iris: None
bottleneck: None
dask: 1.1.5
distributed: None
matplotlib: 3.0.3
cartopy: None
seaborn: None
numbagg: None
setuptools: 40.8.0
pip: 19.0.3
conda: None
pytest: None
IPython: 7.5.0
sphinx: None
","{""url"": ""https://api.github.com/repos/pydata/xarray/issues/3170/reactions"", ""total_count"": 0, ""+1"": 0, ""-1"": 0, ""laugh"": 0, ""hooray"": 0, ""confused"": 0, ""heart"": 0, ""rocket"": 0, ""eyes"": 0}",,completed,13221727,issue