html_url,issue_url,id,node_id,user,created_at,updated_at,author_association,body,reactions,performed_via_github_app,issue
https://github.com/pydata/xarray/issues/5706#issuecomment-1545346823,https://api.github.com/repos/pydata/xarray/issues/5706,1545346823,IC_kwDOAMm_X85cHB8H,5821660,2023-05-12T08:06:06Z,2023-05-12T08:06:06Z,MEMBER,This is resolved in recent `netcdf-c`/`netcdf4-python` and works with recent Xarray.,"{""total_count"": 1, ""+1"": 0, ""-1"": 0, ""laugh"": 0, ""hooray"": 1, ""confused"": 0, ""heart"": 0, ""rocket"": 0, ""eyes"": 0}",,970619131
https://github.com/pydata/xarray/issues/5706#issuecomment-1170812062,https://api.github.com/repos/pydata/xarray/issues/5706,1170812062,IC_kwDOAMm_X85FySye,5821660,2022-06-30T06:17:49Z,2022-06-30T06:17:49Z,MEMBER,Problem source identified in netcdf-c: https://github.com/Unidata/netcdf-c/issues/2159,"{""total_count"": 0, ""+1"": 0, ""-1"": 0, ""laugh"": 0, ""hooray"": 0, ""confused"": 0, ""heart"": 0, ""rocket"": 0, ""eyes"": 0}",,970619131
https://github.com/pydata/xarray/issues/5706#issuecomment-1012189867,https://api.github.com/repos/pydata/xarray/issues/5706,1012189867,IC_kwDOAMm_X848VMqr,5821660,2022-01-13T14:31:31Z,2022-01-13T14:31:31Z,MEMBER,"@scottstanie I'll check my h5py/hdf5 settings. But I doubt that might be the difference. I've experienced that the trailing garbage is changing from run to run, sometimes disappearing. ","{""total_count"": 1, ""+1"": 1, ""-1"": 0, ""laugh"": 0, ""hooray"": 0, ""confused"": 0, ""heart"": 0, ""rocket"": 0, ""eyes"": 0}",,970619131
https://github.com/pydata/xarray/issues/5706#issuecomment-1012003403,https://api.github.com/repos/pydata/xarray/issues/5706,1012003403,IC_kwDOAMm_X848UfJL,5821660,2022-01-13T10:31:25Z,2022-01-13T10:31:25Z,MEMBER,"@scottstanie Here is the output of ncdump:

```
netcdf test_str_list {
dimensions:
	phony_dim_0 = 2 ;
	phony_dim_1 = 2 ;
variables:
	string pairs(phony_dim_0, phony_dim_1) ;
data:

 pairs =
  ""2020010120200201�\f\033��U"", NIL,
  ""2020010120200301 "", NIL ;
}
```

You see the trailing garbage. This is obviously a problem with netcdf-c/netcdf4-python, as it is not there with pure hdf5 (h5py/h5netcdf).

But, there is a difference with Attributes and Datasets:

```python
import h5py
import xarray as xr

with h5py.File(""test_str_list_attr.h5"", ""w"") as hf:
    sid = h5py.h5s.create_simple((2, 2), (2, 2))
    tid1 = h5py.h5t.TypeID.copy(h5py.h5t.C_S1)
    tid1.set_size(8)
    tid1.set_strpad(h5py.h5t.STR_NULLPAD)
    
    tid2 = h5py.h5t.TypeID.copy(h5py.h5t.C_S1)
    tid2.set_size(9)
    tid2.set_strpad(h5py.h5t.STR_NULLTERM)
    
    blob = np.array([[""20200101"", ""20200201""], [""20200101"", ""20200301""]]).astype(""S"")
    
    # Attributes
    aid = h5py.h5a.create(hf.id, b""NULLPAD"", tid1, sid)
    ret = aid.write(blob)
    
    aid = h5py.h5a.create(hf.id, b""NULLTERM"", tid2, sid)
    ret = aid.write(blob)
    
    hf.attrs[""numpy_S""] = blob
    hf.attrs[""numpy_O""] = blob.astype(""O"")
    
    
!h5dump test_str_list_attr.h5
!ncdump test_str_list_attr.h5

with xr.load_dataset(""test_str_list_attr.h5"", engine=""h5netcdf"", phony_dims=""sort"") as ds:
    display(ds)
with xr.load_dataset(""test_str_list_attr.h5"", engine=""netcdf4"") as ds:
    display(ds)
with nc.Dataset(""test_str_list_attr.h5"") as ds:
    display(ds)
    display(ds.NULLTERM)
    display(ds.NULLPAD)
    display(ds.numpy_O)
    display(ds.numpy_S)
```

<br/>
Output:
<details>

```
HDF5 ""test_str_list_attr.h5"" {
GROUP ""/"" {
   ATTRIBUTE ""NULLPAD"" {
      DATATYPE  H5T_STRING {
         STRSIZE 8;
         STRPAD H5T_STR_NULLPAD;
         CSET H5T_CSET_ASCII;
         CTYPE H5T_C_S1;
      }
      DATASPACE  SIMPLE { ( 2, 2 ) / ( 2, 2 ) }
      DATA {
      (0,0): ""20200101"", ""20200201"",
      (1,0): ""20200101"", ""20200301""
      }
   }
   ATTRIBUTE ""NULLTERM"" {
      DATATYPE  H5T_STRING {
         STRSIZE 9;
         STRPAD H5T_STR_NULLTERM;
         CSET H5T_CSET_ASCII;
         CTYPE H5T_C_S1;
      }
      DATASPACE  SIMPLE { ( 2, 2 ) / ( 2, 2 ) }
      DATA {
      (0,0): ""20200101"", ""20200201"",
      (1,0): ""20200101"", ""20200301""
      }
   }
   ATTRIBUTE ""numpy_O"" {
      DATATYPE  H5T_STRING {
         STRSIZE H5T_VARIABLE;
         STRPAD H5T_STR_NULLTERM;
         CSET H5T_CSET_ASCII;
         CTYPE H5T_C_S1;
      }
      DATASPACE  SIMPLE { ( 2, 2 ) / ( 2, 2 ) }
      DATA {
      (0,0): ""20200101"", ""20200201"",
      (1,0): ""20200101"", ""20200301""
      }
   }
   ATTRIBUTE ""numpy_S"" {
      DATATYPE  H5T_STRING {
         STRSIZE 8;
         STRPAD H5T_STR_NULLPAD;
         CSET H5T_CSET_ASCII;
         CTYPE H5T_C_S1;
      }
      DATASPACE  SIMPLE { ( 2, 2 ) / ( 2, 2 ) }
      DATA {
      (0,0): ""20200101"", ""20200201"",
      (1,0): ""20200101"", ""20200301""
      }
   }
}
}
netcdf test_str_list_attr {

// global attributes:
		string :NULLPAD = ""20200101"", ""20200201"", ""20200101"", ""20200301"" ;
		string :NULLTERM = ""20200101"", ""20200201"", ""20200101"", ""20200301"" ;
		string :numpy_S = ""20200101"", ""20200201@�s}�U"", ""20200101"", ""20200301�6t}�U"" ;
		string :numpy_O = ""20200101"", ""20200201"", ""20200101"", ""20200301"" ;
}
<xarray.Dataset>
Dimensions:  ()
Data variables:
    *empty*
Attributes:
    NULLPAD:   [[b'20200101' b'20200201']\n [b'20200101' b'20200301']]
    NULLTERM:  [[b'20200101' b'20200201']\n [b'20200101' b'20200301']]
    numpy_O:   [['20200101' '20200201']\n ['20200101' '20200301']]
    numpy_S:   [[b'20200101' b'20200201']\n [b'20200101' b'20200301']]
<xarray.Dataset>
Dimensions:  ()
Data variables:
    *empty*
Attributes:
    NULLPAD:   ['20200101', '20200201', '20200101', '20200301']
    NULLTERM:  ['20200101', '20200201', '20200101', '20200301']
    numpy_S:   ['20200101', '20200201', '20200101p��i�U', '20200301']
    numpy_O:   ['20200101', '20200201', '20200101', '20200301']
<class 'netCDF4._netCDF4.Dataset'>
root group (NETCDF4 data model, file format HDF5):
    NULLPAD: ['20200101', '20200201', '20200101', '20200301']
    NULLTERM: ['20200101', '20200201', '20200101', '20200301']
    numpy_S: ['20200101', '20200201', '20200101', '20200301']
    numpy_O: ['20200101', '20200201', '20200101', '20200301']
    dimensions(sizes): 
    variables(dimensions): 
    groups: 
['20200101', '20200201', '20200101', '20200301']
['20200101', '20200201', '20200101', '20200301']
['20200101', '20200201', '20200101', '20200301']
['20200101', '20200201', '20200101', '20200301']
```
</details>


It's clearly seen that the Datasets are correct in the hdf5 dump, but somehow netcdf-c has issues with the string NULLPAD/NULLTERM. But at least there is no segfault with attributes. Other than with Datasets/Variables:

```python
import h5py
import xarray as xr

with h5py.File(""test_str_list_ds.h5"", ""w"") as hf:
    blob = np.array([[""20200101"", ""20200201""], [""20200101"", ""20200301""]]).astype(""S"")
    
    # Datasets
    sid = h5py.h5s.create_simple((2, 2), (2, 2))
    
    tid3 = h5py.h5t.TypeID.copy(h5py.h5t.C_S1)
    tid3.set_size(8)
    tid3.set_strpad(h5py.h5t.STR_NULLPAD)
    
    tid4 = h5py.h5t.TypeID.copy(h5py.h5t.C_S1)
    tid4.set_size(9)
    tid4.set_strpad(h5py.h5t.STR_NULLTERM)
    
    aid = h5py.h5d.create(hf.id, b""NULLPAD"", tid3, sid)
    ret = aid.write(sid, h5py.h5s.ALL, blob)
    
    aid = h5py.h5d.create(hf.id, b""NULLTERM"", tid4, sid)
    ret = aid.write(sid, h5py.h5s.ALL, blob)
    
    hf[""numpy_S""] = blob
    hf[""numpy_O""] = blob.astype(""O"")
    
!h5dump test_str_list_ds.h5
!ncdump test_str_list_ds.h5    

with xr.load_dataset(""test_str_list_ds.h5"", engine=""h5netcdf"", phony_dims=""sort"") as ds:
    display(ds)

# with xr.load_dataset(""test_str_list_ds.h5"", engine=""netcdf4"") as ds:
#     display(ds[""numpy_O""])
    
# with nc.Dataset(""test_str_list_ds.h5"") as ds:
#     display(ds)
#     #display(""NULLTERM:"", ds[""NULLTERM""][:])
#     #display(""NULLPAD:"", ds[""NULLPAD""][:])
#     display(""numpy_O"", ds[""numpy_O""][:])
#     #display(""numpy_S"", ds[""numpy_S""][:])
```
<br/>
Output:
<details>

```
HDF5 ""test_str_list_ds.h5"" {
GROUP ""/"" {
   DATASET ""NULLPAD"" {
      DATATYPE  H5T_STRING {
         STRSIZE 8;
         STRPAD H5T_STR_NULLPAD;
         CSET H5T_CSET_ASCII;
         CTYPE H5T_C_S1;
      }
      DATASPACE  SIMPLE { ( 2, 2 ) / ( 2, 2 ) }
      DATA {
      (0,0): ""20200101"", ""20200201"",
      (1,0): ""20200101"", ""20200301""
      }
   }
   DATASET ""NULLTERM"" {
      DATATYPE  H5T_STRING {
         STRSIZE 9;
         STRPAD H5T_STR_NULLTERM;
         CSET H5T_CSET_ASCII;
         CTYPE H5T_C_S1;
      }
      DATASPACE  SIMPLE { ( 2, 2 ) / ( 2, 2 ) }
      DATA {
      (0,0): ""20200101"", ""20200201"",
      (1,0): ""20200101"", ""20200301""
      }
   }
   DATASET ""numpy_O"" {
      DATATYPE  H5T_STRING {
         STRSIZE H5T_VARIABLE;
         STRPAD H5T_STR_NULLTERM;
         CSET H5T_CSET_ASCII;
         CTYPE H5T_C_S1;
      }
      DATASPACE  SIMPLE { ( 2, 2 ) / ( 2, 2 ) }
      DATA {
      (0,0): ""20200101"", ""20200201"",
      (1,0): ""20200101"", ""20200301""
      }
   }
   DATASET ""numpy_S"" {
      DATATYPE  H5T_STRING {
         STRSIZE 8;
         STRPAD H5T_STR_NULLPAD;
         CSET H5T_CSET_ASCII;
         CTYPE H5T_C_S1;
      }
      DATASPACE  SIMPLE { ( 2, 2 ) / ( 2, 2 ) }
      DATA {
      (0,0): ""20200101"", ""20200201"",
      (1,0): ""20200101"", ""20200301""
      }
   }
}
}
netcdf test_str_list_ds {
dimensions:
	phony_dim_0 = 2 ;
	phony_dim_1 = 2 ;
variables:
	string NULLPAD(phony_dim_0, phony_dim_1) ;
	string NULLTERM(phony_dim_0, phony_dim_1) ;
	string numpy_O(phony_dim_0, phony_dim_1) ;
	string numpy_S(phony_dim_0, phony_dim_1) ;
data:

 NULLPAD =
  ""2020010120200201�4k�U"", NIL,
  ""2020010120200301 "", NIL ;

 NULLTERM =
  ""20200101"", NIL,
  ""20200101"", NIL ;

 numpy_O =
  ""20200101"", ""20200201"",
  ""20200101"", ""20200301"" ;

 numpy_S =
  ""2020010120200201"", NIL,
  ""2020010120200301 "", NIL ;
}
<xarray.Dataset>
Dimensions:   (phony_dim_0: 2, phony_dim_1: 2)
Dimensions without coordinates: phony_dim_0, phony_dim_1
Data variables:
    NULLPAD   (phony_dim_0, phony_dim_1) |S8 b'20200101' ... b'20200301'
    NULLTERM  (phony_dim_0, phony_dim_1) |S9 b'20200101' ... b'20200301'
    numpy_O   (phony_dim_0, phony_dim_1) object '20200101' ... '20200301'
    numpy_S   (phony_dim_0, phony_dim_1) |S8 b'20200101' ... b'20200301'
```
</details>

So here, netcdf-c/netcdf4-python will segfault for all variables besides `numpy_O`. 

It looks like the only option to achieve this for datasets/variables is to use numpy opaque dtype.
","{""total_count"": 0, ""+1"": 0, ""-1"": 0, ""laugh"": 0, ""hooray"": 0, ""confused"": 0, ""heart"": 0, ""rocket"": 0, ""eyes"": 0}",,970619131
https://github.com/pydata/xarray/issues/5706#issuecomment-1011242728,https://api.github.com/repos/pydata/xarray/issues/5706,1011242728,IC_kwDOAMm_X848Rlbo,5821660,2022-01-12T16:43:33Z,2022-01-12T16:43:33Z,MEMBER,"@scottstanie Could you please provide the output of `h5dump test_str_list.h5`? I've a hunch but want to be sure. Also, what is the output with `ncdump`?","{""total_count"": 0, ""+1"": 0, ""-1"": 0, ""laugh"": 0, ""hooray"": 0, ""confused"": 0, ""heart"": 0, ""rocket"": 0, ""eyes"": 0}",,970619131