issue_comments

3 rows where author_association = "CONTRIBUTOR" and issue = 412180435 sorted by updated_at descending

id html_url issue_url node_id user created_at updated_at ▲ author_association body reactions performed_via_github_app issue
1112552981 https://github.com/pydata/xarray/issues/2780#issuecomment-1112552981 https://api.github.com/repos/pydata/xarray/issues/2780 IC_kwDOAMm_X85CUDYV DWesl 22566757 2022-04-28T18:57:26Z 2022-04-28T19:01:34Z CONTRIBUTOR

I found a way to get the sample dataset to save to a smaller netCDF:

```python
import os

import numpy as np
import numpy.testing as np_tst
import pandas as pd
import xarray as xr

# Original example

# Create pandas DataFrame
df = pd.DataFrame(
    np.random.randint(low=0, high=10, size=(100000, 5)),
    columns=["a", "b", "c", "d", "e"],
)

# Make 'e' a column of strings
df["e"] = df["e"].astype(str)

# Make 'f' a column of floats
DIGITS = 1
df["f"] = np.around(10 ** DIGITS * np.random.random(size=df.shape[0]), DIGITS)

# Save to csv
df.to_csv("df.csv")

# Convert to an xarray's Dataset
ds = xr.Dataset.from_dataframe(df)

# Save NetCDF file
ds.to_netcdf("ds.nc")

# Additions

def dtype_for_int_array(arry: "array of integers") -> np.dtype:
    """Find the smallest integer dtype that will encode arry.

    Parameters
    ----------
    arry : array of integers
        The array to compress

    Returns
    -------
    smallest: dtype
        The smallest dtype that will represent arry
    """
    largest = max(abs(arry.min()), abs(arry.max()))
    typecode = "i{bytes:d}".format(
        bytes=2
        ** np.nonzero(
            [
                np.iinfo("i{bytes:d}".format(bytes=2**i)).max >= largest
                for i in range(4)
            ]
        )[0][0]
    )
    return np.dtype(typecode)

def dtype_for_str_array(
    arry: "xr.DataArray of strings", for_disk: bool = True
) -> np.dtype:
    """Find a good string dtype for encoding arry.

    Parameters
    ----------
    arry : xr.DataArray of strings
        The array to compress
    for_disk : bool
        True if meant for encoding argument of to_netcdf()
        False if meant for in-memory datasets

    Returns
    -------
    smallest: dtype
        The smallest dtype that will represent arry
    """
    lengths = arry.str.len()
    largest = lengths.max()

    if not for_disk:
        # Variant for in-memory datasets
        # Makes dask happier about strings
        typecode = "S{bytes:d}".format(bytes=int(largest))
    else:
        # Variant for on-disk datasets
        # 0.2 and 0.6 are both guesses
        # If there's "a lot" of strings "much shorter than" the longest,
        # use vlen str where available,
        # otherwise use a string concatenation dimension
        if lengths.quantile(0.2) < 0.6 * largest:
            typecode = "O"
        else:
            typecode = "S1"
    return np.dtype(typecode)

# Set up encoding for saving to netCDF

encoding = {}
for name, var in ds.items():
    encoding[name] = {}

    var_kind = var.dtype.kind
    # Perhaps we should assume "u" means people know what they're
    # doing
    if var_kind in ("u", "i"):
        dtype = dtype_for_int_array(var)
        if var_kind == "u":
            dtype = np.dtype(dtype.str.replace("i", "u"))
    elif var_kind == "f":
        finfo = np.finfo(var.dtype)

        abs_var = np.abs(var)
        dynamic_range = abs_var.max() / abs_var[abs_var > 0].min()
        if dynamic_range > 10**finfo.precision:
            # Dynamic range too high for quantization
            dtype = var.dtype
        else:
            # Set scale_factor and add_offset for quantization
            # Also figure out what dtype compresses best
            var_min = var.min()
            var_range = var.max() - var_min
            mid_range = var_min + var_range / 2

            # Rescale to -1 to 1
            values_to_compress = (var - mid_range) / (0.5 * var_range)
            # for digits in range(finfo.precision):
            for digits in (2, 4, 9, 18):
                if np.allclose(
                    values_to_compress,
                    np.around(values_to_compress, digits),
                    rtol=finfo.precision,
                ):
                    dtype = var.dtype
                    # Convert digits to integer dtype
                    # digits <= 2 to i1
                    # digits <= 4 to i2
                    # digits <= 9 to i4
                    # digits <= 18 to i8
                    if digits <= 2:
                        dtype = np.dtype("i1")
                    elif digits <= 4:
                        dtype = np.dtype("i2")
                    elif digits <= 9:
                        dtype = np.dtype("i4")
                    else:
                        dtype = np.dtype("i8")

                    if dtype.itemsize >= var.dtype.itemsize:
                        # Quantization does not save space
                        dtype = var.dtype
                    else:
                        # Quantization saves space
                        storage_iinfo = np.iinfo(dtype)
                        encoding[name]["add_offset"] = mid_range.values
                        encoding[name]["scale_factor"] = (
                            2 * var_range / storage_iinfo.max
                        ).values
                        encoding[name]["_FillValue"] = storage_iinfo.min
                    break
            else:
                # Quantization would lose information
                dtype = var.dtype
    elif var_kind == "O":
        dtype = dtype_for_str_array(var)
    else:
        dtype = var.dtype
    encoding[name]["dtype"] = dtype

ds.to_netcdf("ds_encoded.nc", encoding=encoding)

# Display results

stat_csv = os.stat("df.csv")
stat_nc = os.stat("ds.nc")
stat_enc = os.stat("ds_encoded.nc")

sizes = pd.Series(
    index=["CSV", "default netCDF", "encoded netCDF"],
    data=[stats.st_size for stats in [stat_csv, stat_nc, stat_enc]],
    name="File sizes",
)

print("File sizes (kB):", np.right_shift(sizes, 10), sep="\n", end="\n\n")

print("Sizes relative to CSV:", sizes / sizes.iloc[0], sep="\n", end="\n\n")

# Check that I didn't break the floats

from_disk = xr.open_dataset("ds_encoded.nc")
np_tst.assert_allclose(ds["f"], from_disk["f"], rtol=10 ** -DIGITS, atol=10 ** -DIGITS)
```

```bash
$ python xarray_auto_small_output.py && ls -sSh *.csv *.nc
File sizes (kB):
CSV                1942
default netCDF    10161
encoded netCDF     1375
Name: File sizes, dtype: int64

Sizes relative to CSV:
CSV               1.000000
default netCDF    5.230366
encoded netCDF    0.708063
Name: File sizes, dtype: float64

 10M ds.nc   1.9M df.csv  1.4M ds_encoded.nc
```

I added a column of floats with one digit before and after the decimal point to the example dataset, because why not.

Does this satisfy your use-case?

Should I turn the giant loop into a function to go into xarray somewhere? If so, I should probably tie the float handling in with the new least_significant_digit feature in netCDF4-python so the data gets read in the same way it was before getting written out.
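For reference, a minimal sketch of what hooking into that feature could look like, assuming the netCDF4 backend passes `least_significant_digit` through the `encoding` argument of `to_netcdf` (the output file name and compression settings here are illustrative, not part of the example above):

```python
# Hypothetical: quantize "f" to one decimal digit at write time and let
# zlib compression exploit the resulting runs of zero bits.
ds.to_netcdf(
    "ds_lsd.nc",
    encoding={"f": {"least_significant_digit": DIGITS, "zlib": True, "complevel": 4}},
)
```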

{
    "total_count": 0,
    "+1": 0,
    "-1": 0,
    "laugh": 0,
    "hooray": 0,
    "confused": 0,
    "heart": 0,
    "rocket": 0,
    "eyes": 0
}
  Automatic dtype encoding in to_netcdf 412180435
633296515 https://github.com/pydata/xarray/issues/2780#issuecomment-633296515 https://api.github.com/repos/pydata/xarray/issues/2780 MDEyOklzc3VlQ29tbWVudDYzMzI5NjUxNQ== DWesl 22566757 2020-05-24T20:45:43Z 2020-05-24T20:45:43Z CONTRIBUTOR

For the example given, this would mean finding `largest = max(abs(ds.min()), abs(ds.max()))` and finding the first integer dtype wide enough to write that: `[np.iinfo("i{bytes:d}".format(bytes=2 ** i)).max >= largest for i in range(4)]` would help there. The function below should help with this; I would tend to use this at array creation time rather than at save time so you get these benefits in memory as well as on disk.

For the character/string variables, the smallest representation varies a bit more: a fixed-width encoding (`dtype="S6"`) will probably be smaller if all the strings are about the same size, while variable-width strings are probably smaller if there are many short strings and only a few long strings. If you happen to know that a given field is a five-character identifier or a one-character status code, you can again set these types to be used in memory (which I think makes dask happier when it comes time to save), while free-form survey responses will likely be better as a variable-length string. It may be possible to use the distribution of string lengths (perhaps using `numpy.char.str_len`) to see whether most of the strings are at least 90% as long as the longest, but it's probably simpler to test.
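A minimal sketch of that length-distribution heuristic, assuming plain NumPy string arrays (the `guess_str_dtype` name and the 90% cutoff are illustrative, not something settled in this thread):

```python
import numpy as np


def guess_str_dtype(values, threshold=0.9):
    """Pick fixed-width bytes if most strings are nearly as long as the longest."""
    lengths = np.char.str_len(np.asarray(values, dtype=str))
    longest = int(lengths.max())
    if np.median(lengths) >= threshold * longest:
        # Roughly uniform lengths: fixed width wastes little padding
        return np.dtype("S{:d}".format(longest))
    # Many short strings and a few long ones: variable-length is smaller
    return np.dtype("O")
```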

Doing this correctly for floating-point types would be difficult, but I think that's outside the scope of this issue.

Hopefully this gives you something to work with.

```python
import numpy as np


def dtype_for_int_array(arry: "array of integers") -> np.dtype:
    """Find the smallest integer dtype that will encode arry.

    Parameters
    ----------
    arry : array of integers
        The array to compress

    Returns
    -------
    smallest: dtype
        The smallest dtype that will represent arry
    """
    largest = max(abs(arry.min()), abs(arry.max()))
    typecode = "i{bytes:d}".format(
        bytes=2 ** np.nonzero([
            np.iinfo("i{bytes:d}".format(bytes=2 ** i)).max >= largest
            for i in range(4)
        ])[0][0]
    )
    return np.dtype(typecode)
```

Looking at `df.memory_usage()` will explain why I do this early. If I extend your example with this new function, I see the following:

```python
>>> df_small = df.copy()
>>> for col in df_small:
...     df_small[col] = df_small[col].astype(
...         dtype_for_int_array(df_small[col]) if df_small[col].dtype.kind == "i" else "S1"
...     )
...
>>> df_small.memory_usage()
Index        80
a        100000
b        100000
c        100000
d        100000
e        800000
dtype: int64
>>> df.memory_usage()
Index        80
a        800000
b        800000
c        800000
d        800000
e        800000
dtype: int64
```

It looks like pandas always uses object dtype for string arrays, so the numbers in that column likely reflect the size of an array of pointers. XArray lets you use a dtype of "S1" or "U1", but I haven't found the equivalent of the memory_usage method.
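As a rough stand-in for `memory_usage` on the xarray side, per-variable `nbytes` can be collected into a Series; `dataset_memory_usage` is a hypothetical helper, and for object-dtype string variables `nbytes` only counts the pointers, mirroring the pandas caveat above:

```python
import pandas as pd


def dataset_memory_usage(ds):
    """Approximate bytes used by each variable of an xarray Dataset."""
    return pd.Series({name: var.nbytes for name, var in ds.variables.items()})
```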

{
    "total_count": 0,
    "+1": 0,
    "-1": 0,
    "laugh": 0,
    "hooray": 0,
    "confused": 0,
    "heart": 0,
    "rocket": 0,
    "eyes": 0
}
  Automatic dtype encoding in to_netcdf 412180435
465362210 https://github.com/pydata/xarray/issues/2780#issuecomment-465362210 https://api.github.com/repos/pydata/xarray/issues/2780 MDEyOklzc3VlQ29tbWVudDQ2NTM2MjIxMA== nedclimaterisk 43126798 2019-02-20T00:00:41Z 2019-02-20T00:00:41Z CONTRIBUTOR

related: https://github.com/pydata/xarray/issues/2780

{
    "total_count": 0,
    "+1": 0,
    "-1": 0,
    "laugh": 0,
    "hooray": 0,
    "confused": 0,
    "heart": 0,
    "rocket": 0,
    "eyes": 0
}
  Automatic dtype encoding in to_netcdf 412180435

CREATE TABLE [issue_comments] (
   [html_url] TEXT,
   [issue_url] TEXT,
   [id] INTEGER PRIMARY KEY,
   [node_id] TEXT,
   [user] INTEGER REFERENCES [users]([id]),
   [created_at] TEXT,
   [updated_at] TEXT,
   [author_association] TEXT,
   [body] TEXT,
   [reactions] TEXT,
   [performed_via_github_app] TEXT,
   [issue] INTEGER REFERENCES [issues]([id])
);
CREATE INDEX [idx_issue_comments_issue]
    ON [issue_comments] ([issue]);
CREATE INDEX [idx_issue_comments_user]
    ON [issue_comments] ([user]);