
issue_comments: 1112552981


| field | value |
| --- | --- |
| html_url | https://github.com/pydata/xarray/issues/2780#issuecomment-1112552981 |
| issue_url | https://api.github.com/repos/pydata/xarray/issues/2780 |
| id | 1112552981 |
| node_id | IC_kwDOAMm_X85CUDYV |
| user | 22566757 |
| created_at | 2022-04-28T18:57:26Z |
| updated_at | 2022-04-28T19:01:34Z |
| author_association | CONTRIBUTOR |
| issue | 412180435 |

I found a way to get the sample dataset to save to a smaller netCDF:

```python
import os

import numpy as np
import numpy.testing as np_tst
import pandas as pd
import xarray as xr

# Original example

# Create pandas DataFrame

df = pd.DataFrame(
    np.random.randint(low=0, high=10, size=(100000, 5)),
    columns=["a", "b", "c", "d", "e"],
)

# Make 'e' a column of strings

df["e"] = df["e"].astype(str)

# Make 'f' a column of floats

DIGITS = 1
df["f"] = np.around(10**DIGITS * np.random.random(size=df.shape[0]), DIGITS)

# Save to CSV

df.to_csv("df.csv")

# Convert to an xarray Dataset

ds = xr.Dataset.from_dataframe(df)

# Save netCDF file

ds.to_netcdf("ds.nc")
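# (Context: pandas creates int64 columns, so the default encoding writes
# 8 bytes per integer plus vlen strings for "e"; that is much of why
# ds.nc comes out several times larger than the CSV.)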

# Additions

def dtype_for_int_array(arry: "array of integers") -> np.dtype:
    """Find the smallest integer dtype that will encode arry.

    Parameters
    ----------
    arry : array of integers
        The array to compress

    Returns
    -------
    smallest : dtype
        The smallest dtype that will represent arry
    """
    largest = max(abs(arry.min()), abs(arry.max()))
    typecode = "i{bytes:d}".format(
        bytes=2
        ** np.nonzero(
            [
                np.iinfo("i{bytes:d}".format(bytes=2**i)).max >= largest
                for i in range(4)
            ]
        )[0][0]
    )
    return np.dtype(typecode)
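# Hypothetical example (not from the original comment):
# dtype_for_int_array(np.array([0, 300])) returns dtype('int16'), since
# 300 overflows int8 (max 127) but fits int16 (max 32767).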

def dtype_for_str_array(
    arry: "xr.DataArray of strings", for_disk: bool = True
) -> np.dtype:
    """Find a good string dtype for encoding arry.

    Parameters
    ----------
    arry : xr.DataArray of strings
        The array to compress
    for_disk : bool
        True if meant for encoding argument of to_netcdf()
        False if meant for in-memory datasets

    Returns
    -------
    smallest : dtype
        The smallest dtype that will represent arry
    """
    lengths = arry.str.len()
    largest = lengths.max()

    if not for_disk:
        # Variant for in-memory datasets
        # Makes dask happier about strings
        typecode = "S{bytes:d}".format(bytes=int(largest))
    else:
        # Variant for on-disk datasets
        # 0.2 and 0.6 are both guesses
        # If there's "a lot" of strings "much shorter than" the longest,
        # use vlen str where available;
        # otherwise use a string concatenation dimension
        if lengths.quantile(0.2) < 0.6 * largest:
            typecode = "O"
        else:
            typecode = "S1"
    return np.dtype(typecode)
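# Hypothetical example (not from the original comment): for the
# single-character "e" column above, every length equals the maximum, so
# the quantile test fails and the helper returns "S1" (fixed-width chars
# along a string-length dimension).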

# Set up encoding for saving to netCDF

encoding = {}
for name, var in ds.items():
    encoding[name] = {}

    var_kind = var.dtype.kind
    # Perhaps we should assume "u" means people know what they're
    # doing
    if var_kind in ("u", "i"):
        dtype = dtype_for_int_array(var)
        if var_kind == "u":
            # np.dtype has no .replace(); swap the typecode string instead
            dtype = np.dtype(dtype.str.replace("i", "u"))
    elif var_kind == "f":
        finfo = np.finfo(var.dtype)

        abs_var = np.abs(var)
        dynamic_range = abs_var.max() / abs_var[abs_var > 0].min()
        if dynamic_range > 10**finfo.precision:
            # Dynamic range too high for quantization
            dtype = var.dtype
        else:
            # Set scale_factor and add_offset for quantization
            # Also figure out what dtype compresses best
            var_min = var.min()
            var_range = var.max() - var_min
            mid_range = var_min + var_range / 2

            # Rescale to -1 to 1
            values_to_compress = (var - mid_range) / (0.5 * var_range)
            # for digits in range(finfo.precision):
            for digits in (2, 4, 9, 18):
                if np.allclose(
                    values_to_compress,
                    np.around(values_to_compress, digits),
                    rtol=10**-finfo.precision,
                ):
                    # Convert digits to integer dtype
                    # digits <= 2 to i1
                    # digits <= 4 to i2
                    # digits <= 9 to i4
                    # digits <= 18 to i8
                    if digits <= 2:
                        dtype = np.dtype("i1")
                    elif digits <= 4:
                        dtype = np.dtype("i2")
                    elif digits <= 9:
                        dtype = np.dtype("i4")
                    else:
                        dtype = np.dtype("i8")

                    if dtype.itemsize >= var.dtype.itemsize:
                        # Quantization does not save space
                        dtype = var.dtype
                    else:
                        # Quantization saves space
                        storage_iinfo = np.iinfo(dtype)
                        encoding[name]["add_offset"] = mid_range.values
                        encoding[name]["scale_factor"] = (
                            2 * var_range / storage_iinfo.max
                        ).values
                        encoding[name]["_FillValue"] = storage_iinfo.min
                    break
            else:
                # Quantization would lose information
                dtype = var.dtype
    elif var_kind == "O":
        dtype = dtype_for_str_array(var)
    else:
        dtype = var.dtype
    encoding[name]["dtype"] = dtype
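# On read, xarray applies the usual CF decoding, reconstructing
# decoded = stored * scale_factor + add_offset, which is what lets the
# assert_allclose check at the end of the script pass.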

ds.to_netcdf("ds_encoded.nc", encoding=encoding)

# Display results

stat_csv = os.stat("df.csv")
stat_nc = os.stat("ds.nc")
stat_enc = os.stat("ds_encoded.nc")

sizes = pd.Series(
    index=["CSV", "default netCDF", "encoded netCDF"],
    data=[stats.st_size for stats in [stat_csv, stat_nc, stat_enc]],
    name="File sizes",
)

print("File sizes (kB):", np.right_shift(sizes, 10), sep="\n", end="\n\n")

print("Sizes relative to CSV:", sizes / sizes.iloc[0], sep="\n", end="\n\n")

# Check that I didn't break the floats

from_disk = xr.open_dataset("ds_encoded.nc")
np_tst.assert_allclose(ds["f"], from_disk["f"], rtol=10**-DIGITS, atol=10**-DIGITS)
```

```bash
$ python xarray_auto_small_output.py && ls -sSh *.csv *.nc
File sizes (kB):
CSV                1942
default netCDF    10161
encoded netCDF     1375
Name: File sizes, dtype: int64

Sizes relative to CSV:
CSV               1.000000
default netCDF    5.230366
encoded netCDF    0.708063
Name: File sizes, dtype: float64

 10M ds.nc  1.9M df.csv  1.4M ds_encoded.nc
```

I added a column of floats with one digit before and after the decimal point to the example dataset, because why not.

Does this satisfy your use case?

Should I turn the giant loop into a function to go into xarray somewhere? If so, I should probably tie the float handling in with the new `least_significant_digit` feature in netCDF4-python so the data gets read in the same way it was before getting written out.
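For reference, a minimal sketch of what that tie-in might look like, assuming the netCDF4 engine (which is where the `least_significant_digit` encoding key is honored) and a hypothetical output filename; it pairs the setting with zlib compression, since truncating precision only shrinks the file once the zeroed bits are compressed:

```python
# Sketch: hand the rounding off to netCDF4-python via the encoding dict
# instead of computing scale_factor/add_offset by hand.
ds.to_netcdf(
    "ds_lsd.nc",  # hypothetical filename
    engine="netcdf4",
    encoding={
        # Keep DIGITS decimal digits of "f"; zlib makes the discarded
        # precision actually compress away on disk.
        "f": {"least_significant_digit": DIGITS, "zlib": True, "complevel": 9},
    },
)
```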
