Source code for SMBcorr.racmo_extrap_downscaled

#!/usr/bin/env python
u"""
racmo_extrap_downscaled.py
Written by Tyler Sutterley (09/2024)
Interpolates and extrapolates downscaled RACMO products to times and coordinates

Uses fast nearest-neighbor search algorithms
https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.BallTree.html
https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KDTree.html
and inverse distance weighted interpolation to extrapolate spatially

INPUTS:
    base_dir: working data directory
    EPSG: projection of input spatial coordinates
    VERSION: Downscaled RACMO Version
        1.0: RACMO2.3/XGRN11
        2.0: RACMO2.3p2/XGRN11
        3.0: RACMO2.3p2/FGRN055
        4.0: RACMO2.3p2/FGRN055
    tdec: dates to interpolate in year-decimal
    X: x-coordinates to interpolate in projection EPSG
    Y: y-coordinates to interpolate in projection EPSG

OPTIONS:
    VARIABLE: RACMO product to interpolate
        SMB: Surface Mass Balance
        PRECIP: Precipitation
        RUNOFF: Melt Water Runoff
        SNOWMELT: Snowmelt
        REFREEZE: Melt Water Refreeze
    SEARCH: nearest-neighbor search algorithm (BallTree or KDTree)
    NN: number of nearest-neighbor points to use
    POWER: inverse distance weighting power
    FILL_VALUE: output fill_value for invalid points

PYTHON DEPENDENCIES:
    numpy: Scientific Computing Tools For Python
        https://numpy.org
        https://numpy.org/doc/stable/user/numpy-for-matlab-users.html
    scipy: Scientific Tools for Python
        https://docs.scipy.org/doc/
    netCDF4: Python interface to the netCDF C library
         https://unidata.github.io/netcdf4-python/netCDF4/index.html
    pyproj: Python interface to PROJ library
        https://pypi.org/project/pyproj/
    scikit-learn: Machine Learning in Python
        https://scikit-learn.org/stable/index.html
        https://github.com/scikit-learn/scikit-learn

PROGRAM DEPENDENCIES:
    regress_model.py: models a time series using least-squares regression

UPDATE HISTORY:
    Updated 09/2024: use wrapper to importlib for optional dependencies
    Updated 02/2023: don't recompute min and max time cutoffs for cases
    Updated 10/2022: added version 4.0 (RACMO2.3p2 for 1958-2022 from FGRN055)
    Updated 08/2022: updated docstrings to numpy documentation format
    Updated 01/2021: using conversion protocols following pyproj-2 updates
        https://pyproj4.github.io/pyproj/stable/gotchas.html
    Updated 04/2020: reduced to interpolation function.  output masked array
    Updated 09/2019: read subsets of DS1km netCDF4 file to save memory
    Written 09/2019
"""
from __future__ import print_function

import sys
import os
import re
import warnings
import numpy as np
import scipy.interpolate
from SMBcorr.regress_model import regress_model
import SMBcorr.spatial
import SMBcorr.utilities

# attempt imports
netCDF4 = SMBcorr.utilities.import_dependency('netCDF4')
pyproj = SMBcorr.utilities.import_dependency('pyproj')

# PURPOSE: read and interpolate downscaled RACMO products
[docs]def extrapolate_racmo_downscaled(base_dir, EPSG, VERSION, tdec, X, Y, VARIABLE='SMB', SEARCH='BallTree', NN=10, POWER=2.0, FILL_VALUE=None): """ Spatially extrapolates downscaled RACMO products Parameters ---------- base_dir: str Working data directory EPSG: str or int input coordinate reference system VERSION: str Downscaled RACMO Version - ``1.0``: RACMO2.3/XGRN11 - ``2.0``: RACMO2.3p2/XGRN11 - ``3.0``: RACMO2.3p2/FGRN055 tdec: float time coordinates to interpolate in year-decimal X: float x-coordinates to interpolate Y: float y-coordinates to interpolate VARIABLE: str, default 'SMB' RACMO product to interpolate - ``SMB``: Surface Mass Balance - ``PRECIP``: Precipitation - ``RUNOFF``: Melt Water Runoff - ``SNOWMELT``: Snowmelt - ``REFREEZE``: Melt Water Refreeze SEARCH: str, default 'BallTree' nearest-neighbor search algorithm NN: int, default 10 number of nearest-neighbor points to use POWER: int or float, default 2.0 Inverse distance weighting power FILL_VALUE: float or NoneType, default None Output fill_value for invalid points Default will use fill values from data file """ # Full Directory Setup DIRECTORY = 'SMB1km_v{0}'.format(VERSION) # netcdf variable names input_products = {} input_products['SMB'] = 'SMB_rec' input_products['PRECIP'] = 'precip' input_products['RUNOFF'] = 'runoff' input_products['SNOWMELT'] = 'snowmelt' input_products['REFREEZE'] = 'refreeze' # versions 1 and 4 are in separate files for each year if (VERSION == '1.0'): RACMO_MODEL = ['XGRN11','2.3'] VARNAME = input_products[VARIABLE] SUBDIRECTORY = '{0}_v{1}'.format(VARNAME,VERSION) input_dir = os.path.join(base_dir, 'RACMO', DIRECTORY, SUBDIRECTORY) elif (VERSION == '2.0'): RACMO_MODEL = ['XGRN11','2.3p2'] var = input_products[VARIABLE] VARNAME = var if VARIABLE in ('SMB','PRECIP') else '{0}corr'.format(var) input_dir = os.path.join(base_dir, 'RACMO', DIRECTORY) elif (VERSION == '3.0'): RACMO_MODEL = ['FGRN055','2.3p2'] var = input_products[VARIABLE] VARNAME = var if (VARIABLE == 'SMB') else '{0}corr'.format(var) input_dir = os.path.join(base_dir, 'RACMO', DIRECTORY) elif (VERSION == '4.0'): RACMO_MODEL = ['FGRN055','2.3p2'] var = input_products[VARIABLE] VARNAME = var if (VARIABLE == 'SMB') else '{0}corr'.format(var) input_dir = os.path.join(base_dir, 'RACMO', DIRECTORY) # input cumulative netCDF4 file args = (RACMO_MODEL[0],RACMO_MODEL[1],VERSION,VARIABLE) input_file = '{0}_RACMO{1}_DS1km_v{2}_{3}_cumul.nc'.format(*args) # Open the RACMO NetCDF file for reading fileID = netCDF4.Dataset(os.path.join(input_dir,input_file), 'r') # input shape of RACMO data nt,ny,nx = fileID[VARNAME].shape # Get data from each netCDF variable d = {} # cell origins on the bottom right dx = np.abs(fileID.variables['x'][1]-fileID.variables['x'][0]) dy = np.abs(fileID.variables['y'][1]-fileID.variables['y'][0]) # latitude and longitude arrays at center of each cell d['LON'] = fileID.variables['LON'][:,:].copy() d['LAT'] = fileID.variables['LAT'][:,:].copy() # extract time (decimal years) d['TIME'] = fileID.variables['TIME'][:].copy() # mask object for interpolating data d['MASK'] = np.array(fileID.variables['MASK'][:],dtype=bool) i,j = np.nonzero(d['MASK']) # convert RACMO latitude and longitude to input coordinates (EPSG) crs1 = pyproj.CRS.from_string(EPSG) crs2 = pyproj.CRS.from_epsg(4326) transformer = pyproj.Transformer.from_crs(crs1, crs2, always_xy=True) direction = pyproj.enums.TransformDirection.INVERSE # convert projection from model coordinates xg,yg = transformer.transform(d['LON'], d['LAT'], direction=direction) # construct search tree from original points # can use either BallTree or KDTree algorithms xy1 = np.concatenate((xg[i,j,None],yg[i,j,None]),axis=1) tree = SMBcorr.spatial.build_tree(xy1, SEARCH=SEARCH) # output extrapolated arrays of variable npts = len(tdec) extrap_data = np.ma.zeros((npts),dtype=np.float64) extrap_data.data[:] = extrap_data.fill_value extrap_data.mask = np.zeros((npts),dtype=bool) # type designating algorithm used (1:interpolate, 2:backward, 3:forward) extrap_data.interpolation = np.zeros((npts),dtype=np.uint8) # time cutoff without close time interpolation time_cutoff = (d['TIME'].min(), d['TIME'].max()) # find days that can be extrapolated if np.any((tdec >= time_cutoff[0]) & (tdec <= time_cutoff[1])): # indices of dates for interpolated days ind,=np.nonzero((tdec >= time_cutoff[0]) & (tdec < time_cutoff[1])) # reduce x, y and t coordinates xind,yind,tind = (X[ind],Y[ind],tdec[ind]) # determine which subset of time to read from the netCDF4 file f = scipy.interpolate.interp1d(d['TIME'], np.arange(nt), kind='linear', fill_value=(0,nt-1), bounds_error=False) date_indice = f(tind).astype(np.int64) # for each unique RACMO date # linearly interpolate in time between two RACMO maps # then then inverse distance weighting to extrapolate in space for k in np.unique(date_indice): kk, = np.nonzero(date_indice==k) count = np.count_nonzero(date_indice==k) # query the search tree to find the NN closest points xy2 = np.concatenate((xind[kk,None],yind[kk,None]),axis=1) dist,indices = tree.query(xy2, k=NN, return_distance=True) # normalized weights if POWER > 0 (typically between 1 and 3) # in the inverse distance weighting power_inverse_distance = dist**(-POWER) s = np.sum(power_inverse_distance, axis=1) w = power_inverse_distance/np.broadcast_to(s[:,None],(count,NN)) # RACMO variables for times before and after tdec var1 = fileID.variables[VARNAME][k,i,j].copy() var2 = fileID.variables[VARNAME][k+1,i,j].copy() # linearly interpolate to date dt = (tind[kk] - d['TIME'][k])/(d['TIME'][k+1] - d['TIME'][k]) # spatially extrapolate using inverse distance weighting extrap_data[kk] = (1.0-dt)*np.sum(w*var1[indices],axis=1) + \ dt*np.sum(w*var2[indices], axis=1) extrap_data # set interpolation type (1: interpolated in time) extrap_data.interpolation[ind] = 1 # check if needing to extrapolate backwards in time count = np.count_nonzero((tdec < time_cutoff[0])) if (count > 0): # indices of dates before RACMO ind, = np.nonzero(tdec < time_cutoff[0]) # query the search tree to find the NN closest points xy2 = np.concatenate((X[ind,None],Y[ind,None]),axis=1) dist,indices = tree.query(xy2, k=NN, return_distance=True) # normalized weights if POWER > 0 (typically between 1 and 3) # in the inverse distance weighting power_inverse_distance = dist**(-POWER) s = np.sum(power_inverse_distance, axis=1) w = power_inverse_distance/np.broadcast_to(s[:,None],(count,NN)) # calculate a regression model for calculating values # read first 10 years of data to create regression model N = 120 # spatially interpolate variables to coordinates VAR = np.zeros((count,N)) T = np.zeros((N)) # create interpolated time series for calculating regression model for k in range(N): # time at k T[k] = d['TIME'][k] # spatially extrapolate variables var1 = fileID.variables[VARNAME][k,i,j].copy() VAR[:,k] = np.sum(w*var1[indices],axis=1) # calculate regression model for n,v in enumerate(ind): extrap_data[v] = regress_model(T, VAR[n,:], tdec[v], ORDER=2, CYCLES=[0.25,0.5,1.0,2.0,4.0,5.0], RELATIVE=T[0]) # set interpolation type (2: extrapolated backwards in time) extrap_data.interpolation[ind] = 2 # check if needing to extrapolate forward in time count = np.count_nonzero((tdec > time_cutoff[1])) if (count > 0): # indices of dates after RACMO ind, = np.nonzero(tdec >= time_cutoff[1]) # query the search tree to find the NN closest points xy2 = np.concatenate((X[ind,None],Y[ind,None]),axis=1) dist,indices = tree.query(xy2, k=NN, return_distance=True) # normalized weights if POWER > 0 (typically between 1 and 3) # in the inverse distance weighting power_inverse_distance = dist**(-POWER) s = np.sum(power_inverse_distance, axis=1) w = power_inverse_distance/np.broadcast_to(s[:,None],(count,NN)) # calculate a regression model for calculating values # read last 10 years of data to create regression model N = 120 # spatially interpolate variables to coordinates FIRN = np.zeros((count,N)) T = np.zeros((N)) # create interpolated time series for calculating regression model for k in range(N): kk = nt - N + k # time at k T[k] = d['TIME'][kk] # spatially extrapolate variables var1 = fileID.variables[VARNAME][kk,i,j].copy() VAR[:,k] = np.sum(w*var1[indices],axis=1) # calculate regression model for n,v in enumerate(ind): extrap_data[v] = regress_model(T, VAR[n,:], tdec[v], ORDER=2, CYCLES=[0.25,0.5,1.0,2.0,4.0,5.0], RELATIVE=T[-1]) # set interpolation type (3: extrapolated forward in time) extrap_data.interpolation[ind] = 3 # complete mask if any invalid in data invalid, = np.nonzero(extrap_data.data == extrap_data.fill_value) extrap_data.mask[invalid] = True # replace fill value if specified if FILL_VALUE: extrap_data.data[extrap_data.mask] = FILL_VALUE extrap_data.fill_value = FILL_VALUE # close the NetCDF files fileID.close() # return the extrapolated values return extrap_data