html_url,issue_url,id,node_id,user,created_at,updated_at,author_association,body,reactions,performed_via_github_app,issue
https://github.com/pydata/xarray/issues/2780#issuecomment-1112552981,https://api.github.com/repos/pydata/xarray/issues/2780,1112552981,IC_kwDOAMm_X85CUDYV,22566757,2022-04-28T18:57:26Z,2022-04-28T19:01:34Z,CONTRIBUTOR,"I found a way to get the sample dataset to save to a smaller netCDF:
```python
import os

import numpy as np
import numpy.testing as np_tst
import pandas as pd
import xarray as xr

##################################################
# Original example

# Create pandas DataFrame
df = pd.DataFrame(
    np.random.randint(low=0, high=10, size=(100000, 5)),
    columns=[""a"", ""b"", ""c"", ""d"", ""e""],
)

# Make 'e' a column of strings
df[""e""] = df[""e""].astype(str)

# Make 'f' a column of floats
DIGITS = 1
df[""f""] = np.around(10 ** DIGITS * np.random.random(size=df.shape[0]), DIGITS)

# Save to csv
df.to_csv(""df.csv"")

# Convert to an xarray's Dataset
ds = xr.Dataset.from_dataframe(df)

# Save NetCDF file
ds.to_netcdf(""ds.nc"")


##################################################
# Additions


def dtype_for_int_array(arry: ""array of integers"") -> np.dtype:
    """"""Find the smallest integer dtype that will encode arry.

    Parameters
    ----------
    arry : array of integers
        The array to compress

    Returns
    -------
    smallest: dtype
        The smallest dtype that will represent arry
    """"""
    largest = max(abs(arry.min()), abs(arry.max()))
    typecode = ""i{bytes:d}"".format(
        bytes=2
        ** np.nonzero(
            [
                np.iinfo(""i{bytes:d}"".format(bytes=2**i)).max >= largest
                for i in range(4)
            ]
        )[0][0]
    )
    return np.dtype(typecode)


def dtype_for_str_array(
    arry: ""xr.DataArray of strings"", for_disk: bool = True
) -> np.dtype:
    """"""Find a good string dtype for encoding arry.

    Parameters
    ----------
    arry : xr.DataArray of strings
        The array to compress
    for_disk : bool
        True if meant for encoding argument of to_netcdf()
        False if meant for in-memory datasets

    Returns
    -------
    smallest: dtype
        The smallest dtype that will represent arry
    """"""
    lengths = arry.str.len()
    largest = lengths.max()

    if not for_disk:
        # Variant for in-memory datasets
        # Makes dask happier about strings
        typecode = ""S{bytes:d}"".format(bytes=int(largest))
    else:
        # Variant for on-disk datasets
        # 0.2 and 0.6 are both guesses
        # If there's ""a lot"" of strings ""much shorter than"" the longest
        # use vlen str where available
        # otherwise use a string concatenation dimension
        if lengths.quantile(0.2) < 0.6 * largest:
            typecode = ""O""
        else:
            typecode = ""S1""
    return np.dtype(typecode)


# Set up encoding for saving to netCDF
encoding = {}
for name, var in ds.items():
    encoding[name] = {}

    var_kind = var.dtype.kind
    # Perhaps we should assume ""u"" means people know what they're
    # doing
    if var_kind in (""u"", ""i""):
        dtype = dtype_for_int_array(var)
        if var_kind == ""u"":
            # np.dtype has no .replace(); rebuild as unsigned with the same width
            dtype = np.dtype(""u{:d}"".format(dtype.itemsize))
    elif var_kind == ""f"":
        finfo = np.finfo(var.dtype)

        abs_var = np.abs(var)
        dynamic_range = abs_var.max() / abs_var[abs_var > 0].min()
        if dynamic_range > 10**finfo.precision:
            # Dynamic range too high for quantization
            dtype = var.dtype
        else:
            # set scale_factor and add_offset for quantization
            # Also figure out what dtype compresses best
            var_min = var.min()
            var_range = var.max() - var_min
            mid_range = var_min + var_range / 2

            # Rescale to -1 to 1
            values_to_compress = (var - mid_range) / (0.5 * var_range)
            # for digits in range(finfo.precision):
            for digits in (2, 4, 9, 18):
                if np.allclose(
                    values_to_compress,
                    np.around(values_to_compress, digits),
                    rtol=finfo.precision,
                ):
                    dtype = var.dtype
                    # Convert digits to integer dtype
                    # digits <= 2 to i1
                    # digits <= 4 to i2
                    # digits <= 9 to i4
                    # digits <= 18 to i8
                    if digits <= 2:
                        dtype = np.dtype(""i1"")
                    elif digits <= 4:
                        dtype = np.dtype(""i2"")
                    elif digits <= 9:
                        dtype = np.dtype(""i4"")
                    else:
                        dtype = np.dtype(""i8"")

                    if dtype.itemsize >= var.dtype.itemsize:
                        # Quantization does not save space
                        dtype = var.dtype
                    else:
                        # Quantization saves space
                        storage_iinfo = np.iinfo(dtype)
                        encoding[name][""add_offset""] = mid_range.values
                        encoding[name][""scale_factor""] = (
                            2 * var_range / storage_iinfo.max
                        ).values
                        encoding[name][""_FillValue""] = storage_iinfo.min
                    break
            else:
                # Quantization would lose information
                dtype = var.dtype
    elif var_kind == ""O"":
        dtype = dtype_for_str_array(var)
    else:
        dtype = var.dtype
    encoding[name][""dtype""] = dtype

ds.to_netcdf(""ds_encoded.nc"", encoding=encoding)

# Display results
stat_csv = os.stat(""df.csv"")
stat_nc = os.stat(""ds.nc"")
stat_enc = os.stat(""ds_encoded.nc"")

sizes = pd.Series(
    index=[""CSV"", ""default netCDF"", ""encoded netCDF""],
    data=[stats.st_size for stats in [stat_csv, stat_nc, stat_enc]],
    name=""File sizes"",
)

print(""File sizes (kB):"", np.right_shift(sizes, 10), sep=""\n"", end=""\n\n"")

print(""Sizes relative to CSV:"", sizes / sizes.iloc[0], sep=""\n"", end=""\n\n"")

# Check that I didn't break the floats
from_disk = xr.open_dataset(""ds_encoded.nc"")
np_tst.assert_allclose(ds[""f""], from_disk[""f""], rtol=10**-DIGITS, atol=10**-DIGITS)
```
```bash
$ python xarray_auto_small_output.py && ls -sSh *.csv *.nc
File sizes (kB):
CSV                1942
default netCDF    10161
encoded netCDF     1375
Name: File sizes, dtype: int64

Sizes relative to CSV:
CSV               1.000000
default netCDF    5.230366
encoded netCDF    0.708063
Name: File sizes, dtype: float64

 10M ds.nc  1.9M df.csv  1.4M ds_encoded.nc
```
I added a column of floats with one digit before and after the decimal point to the example dataset, because why not.

Does this satisfy your use-case?

Should I turn the giant loop into a function to go into xarray somewhere?  If so, I should probably tie the float handling in with [the new `least_significant_digit` feature in netCDF4-python](https://unidata.github.io/netcdf4-python/#efficient-compression-of-netcdf-variables) so the data gets read in the same way it was before getting written out.","{""total_count"": 0, ""+1"": 0, ""-1"": 0, ""laugh"": 0, ""hooray"": 0, ""confused"": 0, ""heart"": 0, ""rocket"": 0, ""eyes"": 0}",,412180435
https://github.com/pydata/xarray/issues/2780#issuecomment-633296515,https://api.github.com/repos/pydata/xarray/issues/2780,633296515,MDEyOklzc3VlQ29tbWVudDYzMzI5NjUxNQ==,22566757,2020-05-24T20:45:43Z,2020-05-24T20:45:43Z,CONTRIBUTOR,"For the example given, this would mean finding `largest = max(abs(ds.min()), abs(ds.max()))` and then finding the first integer dtype wide enough to hold that value; `[np.iinfo(""i{bytes:d}"".format(bytes=2 ** i)).max >= largest for i in range(4)]` would help there.  The function below wraps this up; I would tend to use it at array creation time rather than at save time so you get these benefits in memory as well as on disk.

For the character/string variables, the smallest representation varies a bit more: a fixed-width encoding (`dtype=S6`) will probably be smaller if all the strings are about the same size, while variable-width strings are probably smaller if there are many short strings and only a few long strings.  If you happen to know that a given field is a five-character identifier or a one-character status code, you can again set these types to be used in memory (which I think makes dask happier when it comes time to save), while free-form survey responses will likely be better as a variable-length string.  It may be possible to use the distribution of string lengths (perhaps using [numpy.char.str_len](https://numpy.org/doc/stable/reference/generated/numpy.char.str_len.html)) to see whether most of the strings are at least 90% as long as the longest, but it's probably simpler to test.
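
For instance, a rough version of that length-distribution check might look like the sketch below.  The 0.9 cutoff is just the 90% guess from above, and `guess_string_dtype` is a made-up helper name operating on a plain numpy array of strings, not anything from numpy or xarray:

```python
import numpy as np


def guess_string_dtype(strings: ""array of strings"") -> np.dtype:
    """"""Guess a compact string dtype from the distribution of lengths.""""""
    lengths = np.char.str_len(np.asarray(strings, dtype=str))
    longest = int(lengths.max())
    # If most strings are nearly as long as the longest, a fixed-width
    # encoding wastes little space
    if np.median(lengths) >= 0.9 * longest:
        return np.dtype(""S{:d}"".format(longest))
    # Otherwise variable-length strings (object dtype) are probably smaller
    return np.dtype(""O"")
```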

Doing this correctly for floating-point types would be difficult, but I think that's outside the scope of this issue.
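
For completeness, the usual starting point there is CF-style packing via `scale_factor`/`add_offset`.  A minimal sketch is below; it does not address the hard part (deciding how much precision loss is acceptable), `packing_encoding` is just an illustrative name, and the int16 target is an arbitrary choice:

```python
import numpy as np


def packing_encoding(values: ""array of floats"", dtype=np.dtype(""i2"")) -> dict:
    """"""Build a scale_factor/add_offset encoding dict for to_netcdf().""""""
    info = np.iinfo(dtype)
    vmin = float(np.min(values))
    vmax = float(np.max(values))
    # Map [vmin, vmax] onto [0, info.max]; keep info.min free for _FillValue
    # (constant arrays, where vmax == vmin, would need special-casing)
    scale_factor = (vmax - vmin) / info.max
    return {
        ""dtype"": dtype,
        ""scale_factor"": scale_factor,
        ""add_offset"": vmin,
        ""_FillValue"": info.min,
    }


# Usage: ds.to_netcdf(""out.nc"", encoding={""some_float_var"": packing_encoding(...)})
```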

Hopefully this gives you something to work with.

```python
import numpy as np

def dtype_for_int_array(arry: ""array of integers"") -> np.dtype:
    """"""Find the smallest integer dtype that will encode arry.

    Parameters
    ----------
    arry : array of integers
        The array to compress

    Returns
    -------
    smallest: dtype
        The smallest dtype that will represent arry
    """"""
    largest = max(abs(arry.min()), abs(arry.max()))
    typecode = ""i{bytes:d}"".format(
        bytes=2 ** np.nonzero([
            np.iinfo(""i{bytes:d}"".format(bytes=2 ** i)).max >= largest
            for i in range(4)
        ])[0][0]
    )
    return np.dtype(typecode)
```

Looking at [`df.memory_usage()`](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.memory_usage.html) will explain why I do this early.
If I extend your example with this new function, I see the following:
```python
>>> df_small = df.copy()
>>> for col in df_small:
...     df_small[col] = df_small[col].astype(
...         dtype_for_int_array(df_small[col]) if df_small[col].dtype.kind == ""i"" else ""S1""
...     )
...
>>> df_small.memory_usage()
Index        80
a        100000
b        100000
c        100000
d        100000
e        800000
dtype: int64
>>> df.memory_usage()
Index        80
a        800000
b        800000
c        800000
d        800000
e        800000
dtype: int64
```

It looks like pandas always uses object dtype for string arrays, so the numbers in that column likely reflect the size of an array of pointers.  XArray lets you use a dtype of ""S1"" or ""U1"", but I haven't found the equivalent of the `memory_usage` method.","{""total_count"": 0, ""+1"": 0, ""-1"": 0, ""laugh"": 0, ""hooray"": 0, ""confused"": 0, ""heart"": 0, ""rocket"": 0, ""eyes"": 0}",,412180435