
mcuntz / pyjams / 6908420384

17 Nov 2023 07:48PM UTC coverage: 26.964% (-68.5%) from 95.437%

Build 6908420384 (push, github) by mcuntz: Split ncread and ncinfo in separate files

1 of 4 new or added lines in 1 file covered (25.0%).
936 existing lines in 13 files now uncovered.
357 of 1324 relevant lines covered (26.96%).
1.02 hits per line.

Source File

/src/pyjams/kernel_regression.py (14.1% covered)

#!/usr/bin/env python
"""
Multi-dimensional non-parametric kernel regression

This module was written by Matthias Cuntz while at Department of
Computational Hydrosystems, Helmholtz Centre for Environmental
Research - UFZ, Leipzig, Germany, and continued while at Institut
National de Recherche pour l'Agriculture, l'Alimentation et
l'Environnement (INRAE), Nancy, France.

:copyright: Copyright 2012-2022 Matthias Cuntz, see AUTHORS.rst for details.
:license: MIT License, see LICENSE for details.

.. moduleauthor:: Matthias Cuntz

The following functions are provided

.. autosummary::
   kernel_regression_h
   kernel_regression

History
    * Written, Jun 2012 by Matthias Cuntz - mc (at) macu (dot) de,
      inspired by Matlab routines of Yingying Dong, Boston College and
      Yi Cao, Cranfield University
    * Assert correct input, Apr 2014, Matthias Cuntz
    * Corrected bug in _boot_h: x.size->x.shape[0], Jan 2018, Matthias Cuntz
    * Code refactoring, Sep 2021, Matthias Cuntz
    * Use format strings, Apr 2022, Matthias Cuntz
    * Use minimize with method TNC instead of fmin_tnc,
      Apr 2022, Matthias Cuntz
    * Use helper function array2input to assure correct output type,
      Apr 2022, Matthias Cuntz
    * Return scalar h if 1-dimensional, Apr 2022, Matthias Cuntz
    * Output type is same as y instead of x or xout, Apr 2022, Matthias Cuntz

"""
import numpy as np
import scipy.optimize as opt
from .division import division
from .helper import array2input
# from division import division
# from helper import array2input


__all__ = ['kernel_regression_h', 'kernel_regression']


def _nadaraya_watson(z, y):
    """
    Helper function that calculates the Nadaraya-Watson estimator
    for a given kernel.
    Currently, only the Gaussian kernel is implemented.

    """
    kerf = 1. / np.sqrt(2. * np.pi) * np.exp(-0.5 * z * z)
    w = np.prod(kerf, 1)
    out = division(np.dot(w, y), np.sum(w), np.nan)

    return out


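# For reference: _nadaraya_watson above computes, from the scaled differences
# z_ij = (x_ij - x0_j) / h_j passed in by its callers,
#     m(x0) = sum_i w_i * y_i / sum_i w_i,   w_i = prod_j K(z_ij),
# with the Gaussian kernel K(u) = exp(-u**2 / 2) / sqrt(2 * pi).
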
def _cross_valid_h(h, x, y):
    """
    Helper function that calculates the cross-validation function for the
    Nadaraya-Watson estimator, which is basically the mean square error
    where the model estimate is replaced by the jackknife (leave-one-out)
    estimate (Hardle and Muller 2000).

    """
    n = x.shape[0]
    # allocate output
    out = np.empty(n)
    # Loop through each regression point
    for i in range(n):
        # all points except the current one (leave-one-out)
        xx = np.delete(x, i, axis=0)
        yy = np.delete(y, i, axis=0)
        z = (xx - x[i, :]) / h
        out[i] = _nadaraya_watson(z, yy)
    cv = np.sum((y - out)**2) / float(n)

    return cv


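# Written out, _cross_valid_h evaluates the leave-one-out cross-validation
# score
#     CV(h) = (1/n) * sum_i (y_i - m_{-i}(x_i))**2,
# where m_{-i}(x_i) is the Nadaraya-Watson estimate at x_i with the i-th
# observation removed; kernel_regression_h minimises CV(h) over h.
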
def _boot_h(h, x, y):
    """
    Helper function that calculates the bootstrap function for the
    Nadaraya-Watson estimator, which is basically the mean square error
    where the model estimate is replaced by the jackknife estimate
    (Hardle and Muller 2000).

    This basically does _cross_valid_h for 100 random points.

    """
    n = 100
    ind = np.random.randint(x.shape[0], size=n)
    # allocate output
    out = np.empty(n)
    # Loop through each bootstrap point
    for i in range(n):
        # all points except the sampled one (leave-one-out)
        xx = np.delete(x, ind[i], axis=0)
        yy = np.delete(y, ind[i], axis=0)
        z = (xx - x[ind[i], :]) / h
        out[i] = _nadaraya_watson(z, yy)
    cv = np.sum((y[ind] - out)**2) / float(n)

    return cv


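# _boot_h approximates the same criterion for large samples: it draws 100
# indices with replacement and averages the squared leave-one-out errors at
# those sampled points only; kernel_regression_h switches to it when n > 100.
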
def kernel_regression_h(x, y, silverman=False):
    """
    Optimal bandwidth for multi-dimensional non-parametric kernel regression

    Optimal bandwidth is determined using cross-validation or
    Silverman's rule-of-thumb.

    Parameters
    ----------
    x : array_like (n, k)
        Independent values
    y : array_like (n)
        Dependent values
    silverman : bool, optional
        Use Silverman's rule-of-thumb to calculate bandwidth *h* if True,
        otherwise determine *h* via cross-validation

    Returns
    -------
    float or array
        Optimal bandwidth *h*. For multi-dimensional regression, *h* is a
        1d-array, assuming a diagonal bandwidth matrix.

    References
    ----------
    Hardle W and Muller M (2000) Multivariate and semiparametric kernel
       regression. In MG Schimek (Ed.), Smoothing and regression: Approaches,
       computation, and application (pp. 357-392). Hoboken, NJ, USA: John Wiley
       & Sons, Inc. doi: 10.1002/9781118150658.ch12

    Examples
    --------
    >>> n = 10
    >>> x = np.zeros((n, 2))
    >>> x[:, 0] = np.arange(n, dtype=float) / float(n-1)
    >>> x[:, 1] = 1. / (np.arange(n, dtype=float) / float(n-1) + 0.1)
    >>> y = 1. + x[:, 0]**2 - np.sin(x[:, 1])**2

    >>> h = kernel_regression_h(x, y)
    >>> print(np.allclose(h, [0.172680, 9.516907], atol=0.0001))
    True

    >>> h = kernel_regression_h(x, y, silverman=True)
    >>> print(np.allclose(h, [0.229190, 1.903381], atol=0.0001))
    True

    >>> n = 10
    >>> x = np.arange(n, dtype=float) / float(n-1)
    >>> y = 1. + x**2 - np.sin(x)**2

    >>> h = kernel_regression_h(x, y)
    >>> print(np.around(h, 4))
    0.045

    >>> h = kernel_regression_h(x, y, silverman=True)
    >>> print(np.around(h, 4))
    0.2248

    """
    xx = np.array(x)
    yy = np.array(y)
    # Check input
    assert xx.shape[0] == yy.size, (
        f'size(x, 0) != size(y): {xx.shape[0]} != {yy.size}')
    if xx.ndim == 1:  # deal with 1d-arrays
        xx = xx[:, np.newaxis]
    n = xx.shape[0]
    d = xx.shape[1]

    # Silverman (1986), Scott (1992), Bowman and Azzalini (1997)
    # Very similar to stats.gaussian_kde
    # h has dimension d
    h = ( (4. / float(d + 2) / float(n))**(1. / float(d + 4)) *
          np.std(xx, axis=0, ddof=1) )

    if not silverman:
        # Find the optimal h
        bounds = [(0.2*i, 5.0*i) for i in h]
        if n <= 100:
            res = opt.minimize(
                _cross_valid_h, h, args=(xx, yy), method='TNC', bounds=bounds,
                options={'ftol': 1e-10, 'xtol': 1e-10, 'maxfun': 1000})
            h = res.x
        else:
            res = opt.minimize(
                _boot_h, h, args=(xx, yy), method='TNC', bounds=bounds,
                options={'ftol': 1e-10, 'xtol': 1e-10, 'maxfun': 1000})
            h = res.x

    if len(h) == 1:
        h = h[0]

    return h


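# The rule-of-thumb start value computed in kernel_regression_h above is the
# Silverman/Scott bandwidth for a Gaussian kernel in d dimensions,
#     h_j = (4 / ((d + 2) * n))**(1 / (d + 4)) * std(x_j, ddof=1),
# one entry per dimension (diagonal bandwidth matrix); the cross-validation
# step then searches each h_j within [0.2 * h_j, 5 * h_j].
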
def kernel_regression(x, y, h=None, silverman=False, xout=None):
    """
    Multi-dimensional non-parametric kernel regression

    Optimal bandwidth can be estimated by cross-validation
    or by using Silverman's rule-of-thumb.

    Parameters
    ----------
    x : array_like (n, k)
        Independent values
    y : array_like (n)
        Dependent values
    h : float or array_like(k), optional
        Use *h* as bandwidth for calculating regression values if given,
        otherwise determine optimal *h* using cross-validation or by using
        Silverman's rule-of-thumb if *silverman==True*.
    silverman : bool, optional
        Use Silverman's rule-of-thumb to calculate bandwidth *h* if True,
        otherwise determine *h* via cross-validation.
        Only used if *h* is not given.
    xout : ndarray(n, k), optional
        Return fitted values at *xout* if given,
        otherwise return fitted values at *x*.

    Returns
    -------
    array_like with same type as *y*
        Fitted values at *x*, or at *xout* if given

    References
    ----------
    Hardle W and Muller M (2000) Multivariate and semiparametric kernel
       regression. In MG Schimek (Ed.), Smoothing and regression: Approaches,
       computation, and application (pp. 357-392). Hoboken, NJ, USA: John Wiley
       & Sons, Inc. doi: 10.1002/9781118150658.ch12

    Examples
    --------
    >>> n = 10
    >>> x = np.zeros((n, 2))
    >>> x[:, 0] = np.arange(n, dtype=float) / float(n-1)
    >>> x[:, 1] = 1. / (np.arange(n, dtype=float) / float(n-1) + 0.1)
    >>> y = 1. + x[:, 0]**2 - np.sin(x[:, 1])**2

    Separate determination of h and kernel regression

    >>> h = kernel_regression_h(x, y)
    >>> yk = kernel_regression(x, y, h)
    >>> print(np.allclose(yk[0:6],
    ...       [0.52241, 0.52570, 0.54180, 0.51781, 0.47644, 0.49230],
    ...       atol=0.0001))
    True

    Single call to kernel regression

    >>> yk = kernel_regression(x, y)
    >>> print(np.allclose(yk[0:6],
    ...       [0.52241, 0.52570, 0.54180, 0.51781, 0.47644, 0.49230],
    ...       atol=0.0001))
    True

    Single call to kernel regression using Silverman's rule-of-thumb for h

    >>> yk = kernel_regression(x, y, silverman=True)
    >>> print(np.allclose(yk[0:6],
    ...       [0.691153, 0.422809, 0.545844, 0.534315, 0.521494, 0.555426],
    ...       atol=0.0001))
    True

    >>> n = 5
    >>> xx = np.empty((n, 2))
    >>> xx[:, 0] = (np.amin(x[:, 0]) + (np.amax(x[:, 0]) - np.amin(x[:, 0])) *
    ...                                 np.arange(n, dtype=float) / float(n))
    >>> xx[:, 1] = (np.amin(x[:, 1]) + (np.amax(x[:, 1]) - np.amin(x[:, 1])) *
    ...                                 np.arange(n, dtype=float) / float(n))
    >>> yk = kernel_regression(x, y, silverman=True, xout=xx)
    >>> print(np.allclose(yk,
    ...       [0.605485, 0.555235, 0.509529, 0.491191, 0.553325],
    ...       atol=0.0001))
    True

    """
    xx = np.array(x)
    yy = np.array(y)
    # Check input
    assert xx.shape[0] == yy.size, (
        f'size(x, 0) != size(y): {xx.shape[0]} != {yy.size}')
    # deal with 1d-arrays
    if xx.ndim == 1:
        xx = xx[:, np.newaxis]
    d = xx.shape[1]

    # determine h
    if h is None:
        hh = kernel_regression_h(xx, yy, silverman=silverman)
    else:
        if np.size(np.shape(h)) == 0:
            hh = np.repeat(h, d)  # use h for all dimensions if one h given
        else:
            hh = np.array(h)
        assert np.size(hh) == d, 'size(h) must be 1 or x.shape[1]'

    # Calc regression
    if xout is None:
        xxout = xx
    else:
        xxout = np.array(xout)
    if xxout.ndim == 1:
        xxout = xxout[:, np.newaxis]
    nout  = xxout.shape[0]
    dout  = xxout.shape[1]
    assert d == dout, f'size(x, 1) != size(xout, 1): {d} != {dout}'
    # allocate output
    out = np.empty(nout)
    # Loop through each regression point
    for i in range(nout):
        # scaled difference from the regression point
        z = (xx - xxout[i, :]) / hh
        # Nadaraya-Watson estimator with the multivariate Gaussian kernel
        out[i] = _nadaraya_watson(z, y)

    out = array2input(out, y, undef=np.nan)

    return out


if __name__ == '__main__':
    import doctest
    doctest.testmod(optionflags=doctest.NORMALIZE_WHITESPACE)

    # nn = 1000
    # x = np.zeros((nn, 2))
    # x[:, 0] = np.arange(nn, dtype=float) / float(nn-1)
    # x[:, 1] = 1. / (x[:, 0] + 0.1)
    # y = 1. + x[:, 0]**2 - np.sin(x[:, 1])**2
    # h = kernel_regression_h(x, y)
    # print(h)
    # yy = kernel_regression(x, y, h, xout=x)
    # print(yy[0], yy[-1])
    # print(kernel_regression(x, y))
    # h = kernel_regression_h(x, y, silverman=True)
    # print(h)
    # print(kernel_regression(x, y, h))
    # ss = np.shape(x)
    # nn = 5
    # xx = np.empty((nn, ss[1]))
    # xx[:,0] = (np.amin(x[:,0]) + (np.amax(x[:,0])-np.amin(x[:,0])) *
    #                               np.arange(nn,dtype=float)/float(nn))
    # xx[:,1] = (np.amin(x[:,1]) + (np.amax(x[:,1])-np.amin(x[:,1])) *
    #                               np.arange(nn,dtype=float)/float(nn))
    # print(kernel_regression(x, y, h, xout=xx))
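    # Minimal usage sketch on synthetic 1-d data (assumed values), kept
    # commented out like the examples above:
    # x = np.linspace(0., 1., 50)
    # y = np.sin(2. * np.pi * x) + 0.1 * np.random.randn(50)
    # ysmooth = kernel_regression(x, y)  # cross-validated bandwidth
    # ycoarse = kernel_regression(x, y, h=0.1,
    #                             xout=np.linspace(0., 1., 200))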