
issue_comments: 1112552981


| field | value |
| --- | --- |
| html_url | https://github.com/pydata/xarray/issues/2780#issuecomment-1112552981 |
| issue_url | https://api.github.com/repos/pydata/xarray/issues/2780 |
| id | 1112552981 |
| node_id | IC_kwDOAMm_X85CUDYV |
| user | 22566757 |
| created_at | 2022-04-28T18:57:26Z |
| updated_at | 2022-04-28T19:01:34Z |
| author_association | CONTRIBUTOR |
| issue | 412180435 |

I found a way to get the sample dataset to save to a smaller netCDF:

```python
import os

import numpy as np
import numpy.testing as np_tst
import pandas as pd
import xarray as xr

# Original example

# Create pandas DataFrame

df = pd.DataFrame(
    np.random.randint(low=0, high=10, size=(100000, 5)),
    columns=["a", "b", "c", "d", "e"],
)

# Make 'e' a column of strings

df["e"] = df["e"].astype(str)

# Make 'f' a column of floats

DIGITS = 1
df["f"] = np.around(10**DIGITS * np.random.random(size=df.shape[0]), DIGITS)

# Save to CSV

df.to_csv("df.csv")

# Convert to an xarray Dataset

ds = xr.Dataset.from_dataframe(df)

# Save netCDF file

ds.to_netcdf("ds.nc")
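# (Context: pandas creates int64 columns, so the default encoding writes
# 8 bytes per integer plus vlen strings for "e"; that is much of why
# ds.nc comes out several times larger than the CSV.)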

# Additions

def dtype_for_int_array(arry: "array of integers") -> np.dtype:
    """Find the smallest integer dtype that will encode arry.

    Parameters
    ----------
    arry : array of integers
        The array to compress

    Returns
    -------
    smallest : dtype
        The smallest dtype that will represent arry
    """
    largest = max(abs(arry.min()), abs(arry.max()))
    typecode = "i{bytes:d}".format(
        bytes=2
        ** np.nonzero(
            [
                np.iinfo("i{bytes:d}".format(bytes=2**i)).max >= largest
                for i in range(4)
            ]
        )[0][0]
    )
    return np.dtype(typecode)
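# Hypothetical example (not from the original comment):
# dtype_for_int_array(np.array([0, 300])) returns dtype('int16'), since
# 300 overflows int8 (max 127) but fits int16 (max 32767).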

def dtype_for_str_array(
    arry: "xr.DataArray of strings", for_disk: bool = True
) -> np.dtype:
    """Find a good string dtype for encoding arry.

    Parameters
    ----------
    arry : xr.DataArray of strings
        The array to compress
    for_disk : bool
        True if meant for encoding argument of to_netcdf()
        False if meant for in-memory datasets

    Returns
    -------
    smallest : dtype
        The smallest dtype that will represent arry
    """
    lengths = arry.str.len()
    largest = lengths.max()

    if not for_disk:
        # Variant for in-memory datasets
        # Makes dask happier about strings
        typecode = "S{bytes:d}".format(bytes=int(largest))
    else:
        # Variant for on-disk datasets
        # 0.2 and 0.6 are both guesses
        # If there's "a lot" of strings "much shorter than" the longest,
        # use vlen str where available;
        # otherwise use a string concatenation dimension
        if lengths.quantile(0.2) < 0.6 * largest:
            typecode = "O"
        else:
            typecode = "S1"
    return np.dtype(typecode)
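# Hypothetical example (not from the original comment): for the
# single-character "e" column above, every length equals the maximum, so
# the quantile test fails and the helper returns "S1" (fixed-width chars
# along a string-length dimension).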

# Set up encoding for saving to netCDF

encoding = {}
for name, var in ds.items():
    encoding[name] = {}

    var_kind = var.dtype.kind
    # Perhaps we should assume "u" means people know what they're
    # doing
    if var_kind in ("u", "i"):
        dtype = dtype_for_int_array(var)
        if var_kind == "u":
            # np.dtype has no .replace(); swap the typecode string instead
            dtype = np.dtype(dtype.str.replace("i", "u"))
    elif var_kind == "f":
        finfo = np.finfo(var.dtype)

        abs_var = np.abs(var)
        dynamic_range = abs_var.max() / abs_var[abs_var > 0].min()
        if dynamic_range > 10**finfo.precision:
            # Dynamic range too high for quantization
            dtype = var.dtype
        else:
            # Set scale_factor and add_offset for quantization
            # Also figure out what dtype compresses best
            var_min = var.min()
            var_range = var.max() - var_min
            mid_range = var_min + var_range / 2

            # Rescale to -1 to 1
            values_to_compress = (var - mid_range) / (0.5 * var_range)
            # for digits in range(finfo.precision):
            for digits in (2, 4, 9, 18):
                if np.allclose(
                    values_to_compress,
                    np.around(values_to_compress, digits),
                    rtol=10**-finfo.precision,
                ):
                    # Convert digits to integer dtype
                    # digits <= 2 to i1
                    # digits <= 4 to i2
                    # digits <= 9 to i4
                    # digits <= 18 to i8
                    if digits <= 2:
                        dtype = np.dtype("i1")
                    elif digits <= 4:
                        dtype = np.dtype("i2")
                    elif digits <= 9:
                        dtype = np.dtype("i4")
                    else:
                        dtype = np.dtype("i8")

                    if dtype.itemsize >= var.dtype.itemsize:
                        # Quantization does not save space
                        dtype = var.dtype
                    else:
                        # Quantization saves space
                        storage_iinfo = np.iinfo(dtype)
                        encoding[name]["add_offset"] = mid_range.values
                        encoding[name]["scale_factor"] = (
                            2 * var_range / storage_iinfo.max
                        ).values
                        encoding[name]["_FillValue"] = storage_iinfo.min
                    break
            else:
                # Quantization would lose information
                dtype = var.dtype
    elif var_kind == "O":
        dtype = dtype_for_str_array(var)
    else:
        dtype = var.dtype
    encoding[name]["dtype"] = dtype
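# On read, xarray applies the usual CF decoding, reconstructing
# decoded = stored * scale_factor + add_offset, which is what lets the
# assert_allclose check at the end of the script pass.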

ds.to_netcdf("ds_encoded.nc", encoding=encoding)

# Display results

stat_csv = os.stat("df.csv")
stat_nc = os.stat("ds.nc")
stat_enc = os.stat("ds_encoded.nc")

sizes = pd.Series(
    index=["CSV", "default netCDF", "encoded netCDF"],
    data=[stats.st_size for stats in [stat_csv, stat_nc, stat_enc]],
    name="File sizes",
)

print("File sizes (kB):", np.right_shift(sizes, 10), sep="\n", end="\n\n")

print("Sizes relative to CSV:", sizes / sizes.iloc[0], sep="\n", end="\n\n")

# Check that I didn't break the floats

from_disk = xr.open_dataset("ds_encoded.nc")
np_tst.assert_allclose(ds["f"], from_disk["f"], rtol=10**-DIGITS, atol=10**-DIGITS)
```

```bash
$ python xarray_auto_small_output.py && ls -sSh *.csv *.nc
File sizes (kB):
CSV                1942
default netCDF    10161
encoded netCDF     1375
Name: File sizes, dtype: int64

Sizes relative to CSV:
CSV               1.000000
default netCDF    5.230366
encoded netCDF    0.708063
Name: File sizes, dtype: float64

 10M ds.nc  1.9M df.csv  1.4M ds_encoded.nc
```

I added a column of floats with one digit before and after the decimal point to the example dataset, because why not.

Does this satisfy your use case?

Should I turn the giant loop into a function to go into xarray somewhere? If so, I should probably tie the float handling in with the new `least_significant_digit` feature in netCDF4-python so the data gets read in the same way it was before getting written out.
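For reference, a minimal sketch of what that tie-in might look like, assuming the netCDF4 engine (which is where the `least_significant_digit` encoding key is honored) and a hypothetical output filename; it pairs the setting with zlib compression, since truncating precision only shrinks the file once the zeroed bits are compressed:

```python
# Sketch: hand the rounding off to netCDF4-python via the encoding dict
# instead of computing scale_factor/add_offset by hand.
ds.to_netcdf(
    "ds_lsd.nc",  # hypothetical filename
    engine="netcdf4",
    encoding={
        # Keep DIGITS decimal digits of "f"; zlib makes the discarded
        # precision actually compress away on disk.
        "f": {"least_significant_digit": DIGITS, "zlib": True, "complevel": 9},
    },
)
```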
