html_url,issue_url,id,node_id,user,created_at,updated_at,author_association,body,reactions,performed_via_github_app,issue https://github.com/pydata/xarray/issues/2780#issuecomment-1112552981,https://api.github.com/repos/pydata/xarray/issues/2780,1112552981,IC_kwDOAMm_X85CUDYV,22566757,2022-04-28T18:57:26Z,2022-04-28T19:01:34Z,CONTRIBUTOR,"I found a way to get the sample dataset to save to a smaller netCDF: ```python import os import numpy as np import numpy.testing as np_tst import pandas as pd import xarray as xr ################################################## # Original example # Create pandas DataFrame df = pd.DataFrame( np.random.randint(low=0, high=10, size=(100000, 5)), columns=[""a"", ""b"", ""c"", ""d"", ""e""], ) # Make 'e' a column of strings df[""e""] = df[""e""].astype(str) # Make 'f' a column of floats DIGITS = 1 df[""f""] = np.around(10 ** DIGITS * np.random.random(size=df.shape[0]), DIGITS) # Save to csv df.to_csv(""df.csv"") # Convert to an xarray's Dataset ds = xr.Dataset.from_dataframe(df) # Save NetCDF file ds.to_netcdf(""ds.nc"") ################################################## # Additions def dtype_for_int_array(arry: ""array of integers"") -> np.dtype: """"""Find the smallest integer dtype that will encode arry. Parameters ---------- arry : array of integers The array to compress Returns ------- smallest: dtype The smallest dtype that will represent arry """""" largest = max(abs(arry.min()), abs(arry.max())) typecode = ""i{bytes:d}"".format( bytes=2 ** np.nonzero( [ np.iinfo(""i{bytes:d}"".format(bytes=2**i)).max >= largest for i in range(4) ] )[0][0] ) return np.dtype(typecode) def dtype_for_str_array( arry: ""xr.DataArray of strings"", for_disk: bool = True ) -> np.dtype: """"""Find a good string dtype for encoding arry. 
Parameters ---------- arry : xr.DataArray of strings The array to compress for_disk : bool True if meant for encoding argument of to_netcdf() False if meant for in-memory datasets Returns ------- smallest: dtype The smallest dtype that will represent arry """""" lengths = arry.str.len() largest = lengths.max() if not for_disk: # Variant for in-memory datasets # Makes dask happier about strings typecode = ""S{bytes:d}"".format( largest ) else: # Variant for on-disk datasets # 0.2 and 0.6 are both guesses # If there's ""a lot"" of strings ""much shorter than"" the longest # use vlen str where available # otherwise use a string concatenation dimension if lengths.quantile(0.2) < 0.6 * largest: typecode = ""O"" else: typecode = ""S1"" return np.dtype(typecode) # Set up encoding for saving to netCDF encoding = {} for name, var in ds.items(): encoding[name] = {} var_kind = var.dtype.kind # Perhaps we should assume ""u"" means people know what they're # doing if var_kind in (""u"", ""i""): dtype = dtype_for_int_array(var) if var_kind == ""u"": dtype = dtype.replace(""i"", ""u"") elif var_kind == ""f"": finfo = np.finfo(var.dtype) abs_var = np.abs(var) dynamic_range = abs_var.max() / abs_var[abs_var > 0].min() if dynamic_range > 10**finfo.precision: # Dynamic range too high for quantization dtype = var.dtype else: # set scale_factor and add_offset for quantization # Also figure out what dtype compresses best var_min = var.min() var_range = var.max() - var_min mid_range = var_min + var_range / 2 # Rescale to -1 to 1 values_to_compress = (var - mid_range) / (0.5 * var_range) # for digits in range(finfo.precision): for digits in (2, 4, 9, 18): if np.allclose( values_to_compress, np.around(values_to_compress, digits), rtol=finfo.precision, ): dtype = var.dtype # Convert digits to integer dtype # digits <= 2 to i1 # digits <= 4 to i2 # digits <= 9 to i4 # digits <= 18 to i8 if digits <= 2: dtype = np.dtype(""i1"") elif digits <= 4: dtype = np.dtype(""i2"") elif digits <= 9: 
dtype = np.dtype(""i4"") else: dtype = np.dtype(""i8"") if dtype.itemsize >= var.dtype.itemsize: # Quantization saves space dtype = var.dtype else: # Quantization does not save space storage_iinfo = np.iinfo(dtype) encoding[name][""add_offset""] = mid_range.values encoding[name][""scale_factor""] = ( 2 * var_range / storage_iinfo.max ).values encoding[name][""_FillValue""] = storage_iinfo.min break else: # Quantization would lose information dtype = var.dtype elif var_kind == ""O"": dtype = dtype_for_str_array(var) else: dtype = var.dtype encoding[name][""dtype""] = dtype ds.to_netcdf(""ds_encoded.nc"", encoding=encoding) # Display results stat_csv = os.stat(""df.csv"") stat_nc = os.stat(""ds.nc"") stat_enc = os.stat(""ds_encoded.nc"") sizes = pd.Series( index=[""CSV"", ""default netCDF"", ""encoded netCDF""], data=[stats.st_size for stats in [stat_csv, stat_nc, stat_enc]], name=""File sizes"", ) print(""File sizes (kB):"", np.right_shift(sizes, 10), sep=""\n"", end=""\n\n"") print(""Sizes relative to CSV:"", sizes / sizes.iloc[0], sep=""\n"", end=""\n\n"") # Check that I didn't break the floats from_disk = xr.open_dataset(""ds_encoded.nc"") np_tst.assert_allclose(ds[""f""], from_disk[""f""], rtol=10**-DIGITS, atol=10**-DIGITS) ``` ```bash $ python xarray_auto_small_output.py && ls -sSh *.csv *.nc File sizes (kB): CSV 1942 default netCDF 10161 encoded netCDF 1375 Name: File sizes, dtype: int64 Sizes relative to CSV: CSV 1.000000 default netCDF 5.230366 encoded netCDF 0.708063 Name: File sizes, dtype: float64 10M ds.nc 1.9M df.csv 1.4M ds_encoded.nc ``` I added a column of floats with one digit before and after the decimal point to the example dataset, because why not. Does this satisfy your use-case? Should I turn the giant loop into a function to go into xarray somewhere? 
If so, I should probably tie the float handling in with [the new `least_significant_digit` feature in netCDF4-python](https://unidata.github.io/netcdf4-python/#efficient-compression-of-netcdf-variables) so the data gets read in the same way it was before getting written out.","{""total_count"": 0, ""+1"": 0, ""-1"": 0, ""laugh"": 0, ""hooray"": 0, ""confused"": 0, ""heart"": 0, ""rocket"": 0, ""eyes"": 0}",,412180435 https://github.com/pydata/xarray/issues/2780#issuecomment-1112215165,https://api.github.com/repos/pydata/xarray/issues/2780,1112215165,IC_kwDOAMm_X85CSw59,26384082,2022-04-28T13:37:57Z,2022-04-28T13:37:57Z,NONE,"In order to maintain a list of currently relevant issues, we mark issues as stale after a period of inactivity. If this issue remains relevant, please comment here or remove the `stale` label; otherwise it will be marked as closed automatically ","{""total_count"": 0, ""+1"": 0, ""-1"": 0, ""laugh"": 0, ""hooray"": 0, ""confused"": 0, ""heart"": 0, ""rocket"": 0, ""eyes"": 0}",,412180435 https://github.com/pydata/xarray/issues/2780#issuecomment-633296515,https://api.github.com/repos/pydata/xarray/issues/2780,633296515,MDEyOklzc3VlQ29tbWVudDYzMzI5NjUxNQ==,22566757,2020-05-24T20:45:43Z,2020-05-24T20:45:43Z,CONTRIBUTOR,"For the example given, this would mean finding `largest = max(abs(ds.min()), abs(ds.max()))` and finding the first integer dtype wide enough to write that: `[np.iinfo(""i{bytes:d}"".format(bytes=2 ** i)).max >= largest for i in range(4)]` would help there. The function below should help with this; I would tend to use this at array creation time rather than at save time so you get these benefits in memory as well as on disk. For the character/string variables, the smallest representation varies a bit more: a fixed-width encoding (`dtype=S6`) will probably be smaller if all the strings are about the same size, while variable-width strings are probably smaller if there are many short strings and only a few long strings. 
If you happen to know that a given field is a five-character identifier or a one-character status code, you can again set these types to be used in memory (which I think makes dask happier when it comes time to save), while free-form survey responses will likely be better as a variable-length string. It may be possible to use the distribution of string lengths (perhaps using [numpy.char.str_len](https://numpy.org/doc/stable/reference/generated/numpy.char.str_len.html)) to see whether most of the strings are at least 90% as long as the longest, but it's probably simpler to test. Doing this correctly for floating-point types would be difficult, but I think that's outside the scope of this issue. Hopefully this gives you something to work with. ```python import numpy as np def dtype_for_int_array(arry: ""array of integers"") -> np.dtype: """"""Find the smallest integer dtype that will encode arry. Parameters ---------- arry : array of integers The array to compress Returns ------- smallest: dtype The smallest dtype that will represent arry """""" largest = max(abs(arry.min()), abs(arry.max())) typecode = ""i{bytes:d}"".format( bytes=2 ** np.nonzero([ np.iinfo(""i{bytes:d}"".format(bytes=2 ** i)).max >= largest for i in range(4) ])[0][0] ) return np.dtype(typecode) ``` Looking at [`df.memory_usage()`](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.memory_usage.html) will explain why I do this early. If I extend your example with this new function, I see the following: ```python >>> df_small = df.copy() >>> for col in df_small: ... df_small[col] = df_small[col].astype( ... dtype_for_int_array(df_small[col]) if df_small[col].dtype.kind == ""i"" else ""S1"" ... ) ... 
>>> df_small.memory_usage() Index 80 a 100000 b 100000 c 100000 d 100000 e 800000 dtype: int64 >>> df.memory_usage() Index 80 a 800000 b 800000 c 800000 d 800000 e 800000 dtype: int64 ``` It looks like pandas always uses object dtype for string arrays, so the numbers in that column likely reflect the size of an array of pointers. XArray lets you use a dtype of ""S1"" or ""U1"", but I haven't found the equivalent of the `memory_usage` method.","{""total_count"": 0, ""+1"": 0, ""-1"": 0, ""laugh"": 0, ""hooray"": 0, ""confused"": 0, ""heart"": 0, ""rocket"": 0, ""eyes"": 0}",,412180435 https://github.com/pydata/xarray/issues/2780#issuecomment-465362210,https://api.github.com/repos/pydata/xarray/issues/2780,465362210,MDEyOklzc3VlQ29tbWVudDQ2NTM2MjIxMA==,43126798,2019-02-20T00:00:41Z,2019-02-20T00:00:41Z,CONTRIBUTOR,related: https://github.com/pydata/xarray/issues/2780,"{""total_count"": 0, ""+1"": 0, ""-1"": 0, ""laugh"": 0, ""hooray"": 0, ""confused"": 0, ""heart"": 0, ""rocket"": 0, ""eyes"": 0}",,412180435