html_url,issue_url,id,node_id,user,created_at,updated_at,author_association,body,reactions,performed_via_github_app,issue
https://github.com/pydata/xarray/issues/2780#issuecomment-1112552981,https://api.github.com/repos/pydata/xarray/issues/2780,1112552981,IC_kwDOAMm_X85CUDYV,22566757,2022-04-28T18:57:26Z,2022-04-28T19:01:34Z,CONTRIBUTOR,"I found a way to get the sample dataset to save to a smaller netCDF:
```python
import os
import numpy as np
import numpy.testing as np_tst
import pandas as pd
import xarray as xr
##################################################
# Original example
# Create pandas DataFrame
df = pd.DataFrame(
np.random.randint(low=0, high=10, size=(100000, 5)),
columns=[""a"", ""b"", ""c"", ""d"", ""e""],
)
# Make 'e' a column of strings
df[""e""] = df[""e""].astype(str)
# Make 'f' a column of floats
DIGITS = 1
df[""f""] = np.around(10 ** DIGITS * np.random.random(size=df.shape[0]), DIGITS)
# Save to csv
df.to_csv(""df.csv"")
# Convert to an xarray's Dataset
ds = xr.Dataset.from_dataframe(df)
# Save NetCDF file
ds.to_netcdf(""ds.nc"")
##################################################
# Additions
def dtype_for_int_array(arry: ""array of integers"") -> np.dtype:
""""""Find the smallest integer dtype that will encode arry.
Parameters
----------
arry : array of integers
The array to compress
Returns
-------
smallest: dtype
The smallest dtype that will represent arry
""""""
largest = max(abs(arry.min()), abs(arry.max()))
typecode = ""i{bytes:d}"".format(
bytes=2
** np.nonzero(
[
np.iinfo(""i{bytes:d}"".format(bytes=2**i)).max >= largest
for i in range(4)
]
)[0][0]
)
return np.dtype(typecode)
def dtype_for_str_array(
arry: ""xr.DataArray of strings"", for_disk: bool = True
) -> np.dtype:
""""""Find a good string dtype for encoding arry.
Parameters
----------
arry : xr.DataArray of strings
The array to compress
for_disk : bool
True if meant for encoding argument of to_netcdf()
False if meant for in-memory datasets
Returns
-------
smallest: dtype
The smallest dtype that will represent arry
""""""
lengths = arry.str.len()
largest = lengths.max()
if not for_disk:
# Variant for in-memory datasets
# Makes dask happier about strings
typecode = ""S{bytes:d}"".format(
largest
)
else:
# Variant for on-disk datasets
# 0.2 and 0.6 are both guesses
# If there's ""a lot"" of strings ""much shorter than"" the longest
# use vlen str where available
# otherwise use a string concatenation dimension
if lengths.quantile(0.2) < 0.6 * largest:
typecode = ""O""
else:
typecode = ""S1""
return np.dtype(typecode)
# Set up encoding for saving to netCDF
encoding = {}
for name, var in ds.items():
encoding[name] = {}
var_kind = var.dtype.kind
# Perhaps we should assume ""u"" means people know what they're
# doing
if var_kind in (""u"", ""i""):
dtype = dtype_for_int_array(var)
if var_kind == ""u"":
dtype = dtype.replace(""i"", ""u"")
elif var_kind == ""f"":
finfo = np.finfo(var.dtype)
abs_var = np.abs(var)
dynamic_range = abs_var.max() / abs_var[abs_var > 0].min()
if dynamic_range > 10**finfo.precision:
# Dynamic range too high for quantization
dtype = var.dtype
else:
# set scale_factor and add_offset for quantization
# Also figure out what dtype compresses best
var_min = var.min()
var_range = var.max() - var_min
mid_range = var_min + var_range / 2
# Rescale to -1 to 1
values_to_compress = (var - mid_range) / (0.5 * var_range)
# for digits in range(finfo.precision):
for digits in (2, 4, 9, 18):
if np.allclose(
values_to_compress,
np.around(values_to_compress, digits),
rtol=finfo.precision,
):
dtype = var.dtype
# Convert digits to integer dtype
# digits <= 2 to i1
# digits <= 4 to i2
# digits <= 9 to i4
# digits <= 18 to i8
if digits <= 2:
dtype = np.dtype(""i1"")
elif digits <= 4:
dtype = np.dtype(""i2"")
elif digits <= 9:
dtype = np.dtype(""i4"")
else:
dtype = np.dtype(""i8"")
if dtype.itemsize >= var.dtype.itemsize:
# Quantized dtype is no smaller than the original, so quantization does not save space
dtype = var.dtype
else:
# Quantized dtype is smaller than the original, so quantization saves space
storage_iinfo = np.iinfo(dtype)
encoding[name][""add_offset""] = mid_range.values
encoding[name][""scale_factor""] = (
2 * var_range / storage_iinfo.max
).values
encoding[name][""_FillValue""] = storage_iinfo.min
break
else:
# Quantization would lose information
dtype = var.dtype
elif var_kind == ""O"":
dtype = dtype_for_str_array(var)
else:
dtype = var.dtype
encoding[name][""dtype""] = dtype
ds.to_netcdf(""ds_encoded.nc"", encoding=encoding)
# Display results
stat_csv = os.stat(""df.csv"")
stat_nc = os.stat(""ds.nc"")
stat_enc = os.stat(""ds_encoded.nc"")
sizes = pd.Series(
index=[""CSV"", ""default netCDF"", ""encoded netCDF""],
data=[stats.st_size for stats in [stat_csv, stat_nc, stat_enc]],
name=""File sizes"",
)
print(""File sizes (kB):"", np.right_shift(sizes, 10), sep=""\n"", end=""\n\n"")
print(""Sizes relative to CSV:"", sizes / sizes.iloc[0], sep=""\n"", end=""\n\n"")
# Check that I didn't break the floats
from_disk = xr.open_dataset(""ds_encoded.nc"")
np_tst.assert_allclose(ds[""f""], from_disk[""f""], rtol=10**-DIGITS, atol=10**-DIGITS)
```
```bash
$ python xarray_auto_small_output.py && ls -sSh *.csv *.nc
File sizes (kB):
CSV 1942
default netCDF 10161
encoded netCDF 1375
Name: File sizes, dtype: int64
Sizes relative to CSV:
CSV 1.000000
default netCDF 5.230366
encoded netCDF 0.708063
Name: File sizes, dtype: float64
10M ds.nc 1.9M df.csv 1.4M ds_encoded.nc
```
I added a column of floats with one digit before and after the decimal point to the example dataset, because why not.
Does this satisfy your use-case?
Should I turn the giant loop into a function to go into xarray somewhere? If so, I should probably tie the float handling in with [the new `least_significant_digit` feature in netCDF4-python](https://unidata.github.io/netcdf4-python/#efficient-compression-of-netcdf-variables) so the data gets read in the same way it was before getting written out.","{""total_count"": 0, ""+1"": 0, ""-1"": 0, ""laugh"": 0, ""hooray"": 0, ""confused"": 0, ""heart"": 0, ""rocket"": 0, ""eyes"": 0}",,412180435
https://github.com/pydata/xarray/issues/2780#issuecomment-1112215165,https://api.github.com/repos/pydata/xarray/issues/2780,1112215165,IC_kwDOAMm_X85CSw59,26384082,2022-04-28T13:37:57Z,2022-04-28T13:37:57Z,NONE,"In order to maintain a list of currently relevant issues, we mark issues as stale after a period of inactivity.
If this issue remains relevant, please comment here or remove the `stale` label; otherwise it will be marked as closed automatically.
","{""total_count"": 0, ""+1"": 0, ""-1"": 0, ""laugh"": 0, ""hooray"": 0, ""confused"": 0, ""heart"": 0, ""rocket"": 0, ""eyes"": 0}",,412180435
https://github.com/pydata/xarray/issues/2780#issuecomment-633296515,https://api.github.com/repos/pydata/xarray/issues/2780,633296515,MDEyOklzc3VlQ29tbWVudDYzMzI5NjUxNQ==,22566757,2020-05-24T20:45:43Z,2020-05-24T20:45:43Z,CONTRIBUTOR,"For the example given, this would mean finding `largest = max(abs(ds.min()), abs(ds.max()))` and finding the first integer dtype wide enough to write that: `[np.iinfo(""i{bytes:d}"".format(bytes=2 ** i)).max >= largest for i in range(4)]` would help there. The function below should help with this; I would tend to use this at array creation time rather than at save time so you get these benefits in memory as well as on disk.
For the character/string variables, the smallest representation varies a bit more: a fixed-width encoding (`dtype=S6`) will probably be smaller if all the strings are about the same size, while variable-width strings are probably smaller if there are many short strings and only a few long strings. If you happen to know that a given field is a five-character identifier or a one-character status code, you can again set these types to be used in memory (which I think makes dask happier when it comes time to save), while free-form survey responses will likely be better as a variable-length string. It may be possible to use the distribution of string lengths (perhaps using [numpy.char.str_len](https://numpy.org/doc/stable/reference/generated/numpy.char.str_len.html)) to see whether most of the strings are at least 90% as long as the longest, but it's probably simpler to test.
Doing this correctly for floating-point types would be difficult, but I think that's outside the scope of this issue.
Hopefully this gives you something to work with.
```python
import numpy as np
def dtype_for_int_array(arry: ""array of integers"") -> np.dtype:
""""""Find the smallest integer dtype that will encode arry.
Parameters
----------
arry : array of integers
The array to compress
Returns
-------
smallest: dtype
The smallest dtype that will represent arry
""""""
largest = max(abs(arry.min()), abs(arry.max()))
typecode = ""i{bytes:d}"".format(
bytes=2 ** np.nonzero([
np.iinfo(""i{bytes:d}"".format(bytes=2 ** i)).max >= largest
for i in range(4)
])[0][0]
)
return np.dtype(typecode)
```
Looking at [`df.memory_usage()`](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.memory_usage.html) will explain why I do this early.
If I extend your example with this new function, I see the following:
```python
>>> df_small = df.copy()
>>> for col in df_small:
... df_small[col] = df_small[col].astype(
... dtype_for_int_array(df_small[col]) if df_small[col].dtype.kind == ""i"" else ""S1""
... )
...
>>> df_small.memory_usage()
Index 80
a 100000
b 100000
c 100000
d 100000
e 800000
dtype: int64
>>> df.memory_usage()
Index 80
a 800000
b 800000
c 800000
d 800000
e 800000
dtype: int64
```
It looks like pandas always uses object dtype for string arrays, so the numbers in that column likely reflect the size of an array of pointers. XArray lets you use a dtype of ""S1"" or ""U1"", but I haven't found the equivalent of the `memory_usage` method.","{""total_count"": 0, ""+1"": 0, ""-1"": 0, ""laugh"": 0, ""hooray"": 0, ""confused"": 0, ""heart"": 0, ""rocket"": 0, ""eyes"": 0}",,412180435
https://github.com/pydata/xarray/issues/2780#issuecomment-465362210,https://api.github.com/repos/pydata/xarray/issues/2780,465362210,MDEyOklzc3VlQ29tbWVudDQ2NTM2MjIxMA==,43126798,2019-02-20T00:00:41Z,2019-02-20T00:00:41Z,CONTRIBUTOR,related: https://github.com/pydata/xarray/issues/2780,"{""total_count"": 0, ""+1"": 0, ""-1"": 0, ""laugh"": 0, ""hooray"": 0, ""confused"": 0, ""heart"": 0, ""rocket"": 0, ""eyes"": 0}",,412180435