numpypython-xarraynetcdf4opendap

xarray: mean of data stored via OPeNDAP


I'm using xarray's very cool pydap back-end (http://xarray.pydata.org/en/stable/io.html#opendap) to read data stored via OPenDAP at IRI:

import xarray as xr
remote_data = xr.open_dataarray('http://iridl.ldeo.columbia.edu/SOURCES/.Models/.SubX/.RSMAS/.CCSM4/.hindcast/.zg/dods')
print(remote_data)
#<xarray.DataArray 'zg' (P: 2, S: 6569, M: 3, L: 45, Y: 181, X: 360)>
#[115569730800 values with dtype=float32]
#Coordinates:
# * L        (L) timedelta64[ns] 0 days 12:00:00 1 days 12:00:00 ...
# * Y        (Y) float32 -90.0 -89.0 -88.0 -87.0 -86.0 -85.0 -84.0 -83.0 ...
# * S        (S) datetime64[ns] 1999-01-07 1999-01-08 1999-01-09 1999-01-10 ...
# * M        (M) float32 1.0 2.0 3.0
# * X        (X) float32 0.0 1.0 2.0 3.0 4.0 5.0 6.0 7.0 8.0 9.0 10.0 11.0 ...
# * P        (P) int32 500 200
#Attributes:
#    level_type:     pressure level
#    standard_name:  geopotential_height
#    long_name:      Geopotential Height
#    units:          m

For reference it's sub-seasonal forecast data where L is lead-time (45 days forecasts), S is initialization date and M is ensemble.

I would like to do an ensemble mean and i'm only interested in the 500 hPa level. However, it crashes out and gives a RuntimeError: NetCDF: Access failure:

da = remote_data.sel(P=500)
da_ensmean = da.mean(dim='M')

RuntimeError                              Traceback (most recent call last)
<ipython-input-46-eca488e9def5> in <module>()
      1 remote_data = xr.open_dataarray('http://iridl.ldeo.columbia.edu/SOURCES/.Models'                                '/.SubX/.RSMAS/.CCSM4/.hindcast/.zg/dods')
      2 da = remote_data.sel(P=500)
----> 3 da_ensmean = da.mean(dim='M')

~/anaconda/envs/SubXNAO/lib/python3.6/site-packages/xarray/core/common.py in wrapped_func(self, dim, axis, skipna, keep_attrs, **kwargs)
     20                              keep_attrs=False, **kwargs):
     21                 return self.reduce(func, dim, axis, keep_attrs=keep_attrs,
---> 22                                    skipna=skipna, allow_lazy=True, **kwargs)
     23         else:
     24             def wrapped_func(self, dim=None, axis=None, keep_attrs=False,

~/anaconda/envs/SubXNAO/lib/python3.6/site-packages/xarray/core/dataarray.py in reduce(self, func, dim, axis, keep_attrs, **kwargs)
   1359             summarized data and the indicated dimension(s) removed.
   1360         """
-> 1361         var = self.variable.reduce(func, dim, axis, keep_attrs, **kwargs)
   1362         return self._replace_maybe_drop_dims(var)
   1363 

~/anaconda/envs/SubXNAO/lib/python3.6/site-packages/xarray/core/variable.py in reduce(self, func, dim, axis, keep_attrs, allow_lazy, **kwargs)
   1264         if dim is not None:
   1265             axis = self.get_axis_num(dim)
-> 1266         data = func(self.data if allow_lazy else self.values,
   1267                     axis=axis, **kwargs)
   1268 

~/anaconda/envs/SubXNAO/lib/python3.6/site-packages/xarray/core/variable.py in data(self)
    293             return self._data
    294         else:
--> 295             return self.values
    296 
    297     @data.setter

~/anaconda/envs/SubXNAO/lib/python3.6/site-packages/xarray/core/variable.py in values(self)
    385     def values(self):
    386         """The variable's data as a numpy.ndarray"""
--> 387         return _as_array_or_item(self._data)
    388 
    389     @values.setter

~/anaconda/envs/SubXNAO/lib/python3.6/site-packages/xarray/core/variable.py in _as_array_or_item(data)
    209     TODO: remove this (replace with np.asarray) once these issues are fixed
    210     """
--> 211     data = np.asarray(data)
    212     if data.ndim == 0:
    213         if data.dtype.kind == 'M':

~/anaconda/envs/SubXNAO/lib/python3.6/site-packages/numpy/core/numeric.py in asarray(a, dtype, order)
    490 
    491     """
--> 492     return array(a, dtype, copy=False, order=order)
    493 
    494 

~/anaconda/envs/SubXNAO/lib/python3.6/site-packages/xarray/core/indexing.py in __array__(self, dtype)
    622 
    623     def __array__(self, dtype=None):
--> 624         self._ensure_cached()
    625         return np.asarray(self.array, dtype=dtype)
    626 

~/anaconda/envs/SubXNAO/lib/python3.6/site-packages/xarray/core/indexing.py in _ensure_cached(self)
    619     def _ensure_cached(self):
    620         if not isinstance(self.array, NumpyIndexingAdapter):
--> 621             self.array = NumpyIndexingAdapter(np.asarray(self.array))
    622 
    623     def __array__(self, dtype=None):

~/anaconda/envs/SubXNAO/lib/python3.6/site-packages/numpy/core/numeric.py in asarray(a, dtype, order)
    490 
    491     """
--> 492     return array(a, dtype, copy=False, order=order)
    493 
    494 

~/anaconda/envs/SubXNAO/lib/python3.6/site-packages/xarray/core/indexing.py in __array__(self, dtype)
    600 
    601     def __array__(self, dtype=None):
--> 602         return np.asarray(self.array, dtype=dtype)
    603 
    604     def __getitem__(self, key):

~/anaconda/envs/SubXNAO/lib/python3.6/site-packages/numpy/core/numeric.py in asarray(a, dtype, order)
    490 
    491     """
--> 492     return array(a, dtype, copy=False, order=order)
    493 
    494 

~/anaconda/envs/SubXNAO/lib/python3.6/site-packages/xarray/core/indexing.py in __array__(self, dtype)
    506     def __array__(self, dtype=None):
    507         array = as_indexable(self.array)
--> 508         return np.asarray(array[self.key], dtype=None)
    509 
    510     def transpose(self, order):

~/anaconda/envs/SubXNAO/lib/python3.6/site-packages/xarray/coding/variables.py in __getitem__(self, key)
     64 
     65     def __getitem__(self, key):
---> 66         return self.func(self.array[key])
     67 
     68     def __repr__(self):

~/anaconda/envs/SubXNAO/lib/python3.6/site-packages/xarray/coding/variables.py in _apply_mask(data, encoded_fill_values, decoded_fill_value, dtype)
    133     for fv in encoded_fill_values:
    134         condition |= data == fv
--> 135     data = np.asarray(data, dtype=dtype)
    136     return np.where(condition, decoded_fill_value, data)
    137 

~/anaconda/envs/SubXNAO/lib/python3.6/site-packages/numpy/core/numeric.py in asarray(a, dtype, order)
    490 
    491     """
--> 492     return array(a, dtype, copy=False, order=order)
    493 
    494 

~/anaconda/envs/SubXNAO/lib/python3.6/site-packages/xarray/core/indexing.py in __array__(self, dtype)
    506     def __array__(self, dtype=None):
    507         array = as_indexable(self.array)
--> 508         return np.asarray(array[self.key], dtype=None)
    509 
    510     def transpose(self, order):

~/anaconda/envs/SubXNAO/lib/python3.6/site-packages/xarray/backends/netCDF4_.py in __getitem__(self, key)
     63         with self.datastore.ensure_open(autoclose=True):
     64             try:
---> 65                 array = getitem(self.get_array(), key.tuple)
     66             except IndexError:
     67                 # Catch IndexError in netCDF4 and return a more informative

~/anaconda/envs/SubXNAO/lib/python3.6/site-packages/xarray/backends/common.py in robust_getitem(array, key, catch, max_retries, initial_delay)
    114     for n in range(max_retries + 1):
    115         try:
--> 116             return array[key]
    117         except catch:
    118             if n == max_retries:

netCDF4/_netCDF4.pyx in netCDF4._netCDF4.Variable.__getitem__()

netCDF4/_netCDF4.pyx in netCDF4._netCDF4.Variable._get()

netCDF4/_netCDF4.pyx in netCDF4._netCDF4._ensure_nc_success()

RuntimeError: NetCDF: Access failure

Breaking down the calculation removes the RuntimeError. Guess it was just too hefty of a calculation with all the start times. Shouldn't be too difficult to put in a loop over S:

da = remote_data.isel(P=0,S=0)
da_ensmean = da.mean(dim='M')
print(da_ensmean)
<xarray.DataArray 'zg' (L: 45, Y: 181, X: 360)>
array([[[5231.1445, 5231.1445, ..., 5231.1445, 5231.1445],
        [5231.1445, 5231.1445, ..., 5231.1445, 5231.1445],
        ...,
        [5056.2383, 5056.2383, ..., 5056.2383, 5056.2383],
        [5056.2383, 5056.2383, ..., 5056.2383, 5056.2383]],

       [[5211.346 , 5211.346 , ..., 5211.346 , 5211.346 ],
        [5211.346 , 5211.346 , ..., 5211.346 , 5211.346 ],
        ...,
        [5082.062 , 5082.062 , ..., 5082.062 , 5082.062 ],
        [5082.062 , 5082.062 , ..., 5082.062 , 5082.062 ]],

       ...,

       [[5108.8247, 5108.8247, ..., 5108.8247, 5108.8247],
        [5108.8247, 5108.8247, ..., 5108.8247, 5108.8247],
        ...,
        [5154.2173, 5154.2173, ..., 5154.2173, 5154.2173],
        [5154.2173, 5154.2173, ..., 5154.2173, 5154.2173]],

       [[5106.4893, 5106.4893, ..., 5106.4893, 5106.4893],
        [5106.4893, 5106.4893, ..., 5106.4893, 5106.4893],
        ...,
        [5226.0063, 5226.0063, ..., 5226.0063, 5226.0063],
        [5226.0063, 5226.0063, ..., 5226.0063, 5226.0063]]], dtype=float32)
Coordinates:
  * L        (L) timedelta64[ns] 0 days 12:00:00 1 days 12:00:00 ...
  * Y        (Y) float32 -90.0 -89.0 -88.0 -87.0 -86.0 -85.0 -84.0 -83.0 ...
    S        datetime64[ns] 1999-01-07
  * X        (X) float32 0.0 1.0 2.0 3.0 4.0 5.0 6.0 7.0 8.0 9.0 10.0 11.0 ...
    P        int32 500

Solution

  • This is a good use-case for chunking with dask, e.g.,

    import xarray as xr
    url = 'http://iridl.ldeo.columbia.edu/SOURCES/.Models/.SubX/.RSMAS/.CCSM4/.hindcast/.zg/dods'
    remote_data = xr.open_dataarray(url, chunks={'S': 1, 'L': 1})
    da = remote_data.sel(P=500)
    da_ensmean = da.mean(dim='M')
    

    This version will access the data server in parallel, using many smaller chunks. It will still be slow to download 231 GB of data, but your request will have much better odds of success.