florisvb / PyNumDiff / build 18514678847

15 Oct 2025 01:15AM UTC. Coverage: 74.715% (-0.3%) from 74.977%
Pull Request #161: `robustdiff` to address #97 (github / web-flow, merge c99e6ca0b into 942692adf)

58 of 82 new or added lines in 5 files covered (70.73%).
1 existing line in 1 file is now uncovered.
851 of 1139 relevant lines covered (74.71%), 0.75 hits per line.

Source file: /pynumdiff/optimize/_optimize.py (59.78% covered)
"""Optimization"""
import scipy.optimize
import numpy as np
from itertools import product
from functools import partial
from warnings import filterwarnings, warn
from multiprocessing import Pool, Manager
from hashlib import sha1
from tqdm import tqdm

from ..utils import evaluate
from ..finite_difference import finitediff, first_order, second_order, fourth_order
from ..smooth_finite_difference import mediandiff, meandiff, gaussiandiff, friedrichsdiff, butterdiff
from ..polynomial_fit import polydiff, savgoldiff, splinediff
from ..basis_fit import spectraldiff, rbfdiff
from ..total_variation_regularization import tvrdiff, velocity, acceleration, jerk, iterative_velocity, smooth_acceleration, jerk_sliding
from ..kalman_smooth import rtsdiff, constant_velocity, constant_acceleration, constant_jerk, robustdiff
from ..linear_model import lineardiff

# Map from method -> (search_space, bounds_low_hi)
method_params_and_bounds = {
    meandiff: ({'window_size': [5, 15, 30, 50],
             'num_iterations': [1, 5, 10]},
               {'window_size': (1, 1e6),
             'num_iterations': (1, 100)}),
    polydiff: ({'step_size': [1, 2, 5],
                   'kernel': {'friedrichs', 'gaussian'}, # categorical
                   'degree': [2, 3, 5, 7],
              'window_size': [11, 31, 51, 91, 131]},
               {'step_size': (1, 100),
                   'degree': (1, 8),
              'window_size': (10, 1000)}),
    savgoldiff: ({'degree': [2, 3, 5, 7, 10],
             'window_size': [3, 10, 30, 50, 90, 130, 200, 300],
           'smoothing_win': [3, 10, 30, 50, 90, 130, 200, 300]},
                 {'degree': (1, 12),
             'window_size': (3, 1000),
           'smoothing_win': (3, 1000)}),
    splinediff: ({'degree': {3, 4, 5}, # categorical, because degree is a whole number and there aren't many choices
                       's': [0.5, 0.9, 0.95, 1, 10, 100],
          'num_iterations': [1, 5, 10]},
                      {'s': (1e-2, 1e6),
          'num_iterations': (1, 10)}),
    spectraldiff: ({'even_extension': {True, False}, # give categorical params in a set
                  'pad_to_zero_dxdt': {True, False},
                  'high_freq_cutoff': [1e-3, 5e-3, 1e-2, 5e-2, 1e-1]}, # give numerical params in a list for scipy.optimize to search over
                 {'high_freq_cutoff': (1e-5, 1-1e-5)}),
    rbfdiff: ({'sigma': [1e-3, 1e-2, 1e-1, 1],
                'lmbd': [1e-3, 1e-2, 1e-1]},
              {'sigma': (1e-3, 1e3),
                'lmbd': (1e-4, 0.5)}),
    finitediff: ({'num_iterations': [5, 10, 30, 50],
                           'order': {2, 4}}, # order is categorical here, because it can't be 3
                 {'num_iterations': (1, 1000)}),
    first_order: ({'num_iterations': [5, 10, 30, 50]},
                  {'num_iterations': (1, 1000)}),
    butterdiff: ({'filter_order': set(range(1, 11)), # categorical, to save us from doing double work by guessing between orders
                   'cutoff_freq': [0.0001, 0.001, 0.005, 0.01, 0.1, 0.5],
                'num_iterations': [1, 5, 10]},
                  {'cutoff_freq': (1e-4, 1-1e-2),
                'num_iterations': (1, 1000)}),
    tvrdiff: ({'gamma': [1e-2, 1e-1, 1, 10, 100, 1000],
               'order': {1, 2, 3}}, # warning: order 1 hacks the loss function when tvgamma is used; it tends to win but is usually a suboptimal choice in terms of true RMSE
              {'gamma': (1e-4, 1e7)}),
    velocity: ({'gamma': [1e-2, 1e-1, 1, 10, 100, 1000]},
               {'gamma': (1e-4, 1e7)}),
    iterative_velocity: ({'num_iterations': [1, 5, 10],
                                   'gamma': [1e-2, 1e-1, 1, 10, 100, 1000],
                                   'scale': 'small'},
                         {'num_iterations': (1, 100), # gets expensive with more iterations
                                   'gamma': (1e-4, 1e7)}),
    smooth_acceleration: ({'gamma': [1e-2, 1e-1, 1, 10, 100, 1000],
                     'window_size': [3, 10, 30, 50, 90, 130]},
                          {'gamma': (1e-4, 1e7),
                     'window_size': (3, 1000)}),
    rtsdiff: ({'forwardbackward': {True, False},
                         'order': {1, 2, 3}, # with this few options, the optimization works better if this is categorical
                      'qr_ratio': [10**k for k in range(-9, 10, 2)] + [1e12, 1e16]},
                     {'qr_ratio': (1e-10, 1e20)}), # qr_ratio is usually >>1
    constant_velocity: ({'q': [1e-8, 1e-4, 1e-1, 1e1, 1e4, 1e8],
                         'r': [1e-8, 1e-4, 1e-1, 1e1, 1e4, 1e8],
           'forwardbackward': {True, False}},
                        {'q': (1e-10, 1e10),
                         'r': (1e-10, 1e10)}),
    robustdiff: ({'order': {1, 2, 3}, # warning: order 1 hacks the loss function when tvgamma is used; it tends to win but is usually a suboptimal choice in terms of true RMSE
               'qr_ratio': [10**k for k in range(-1, 18, 3)],
                 'huberM': [0., 2, 10]}, # 0. so the type is float. Good choices here really depend on the data scale
              {'qr_ratio': (1e-1, 1e18),
                 'huberM': (0, 1e2)}), # really only want to use the l2 norm when nearby
    lineardiff: ({'kernel': 'gaussian',
                   'order': 3,
                   'gamma': [1e-1, 1, 10, 100],
             'window_size': [10, 30, 50, 90, 130]},
                  {'order': (1, 5),
                   'gamma': (1e-3, 1000),
             'window_size': (15, 1000)})
} # Methods that share a parameter set are aliased to an existing entry below
for method in [second_order, fourth_order]:
    method_params_and_bounds[method] = method_params_and_bounds[first_order]
for method in [mediandiff, gaussiandiff, friedrichsdiff]:
    method_params_and_bounds[method] = method_params_and_bounds[meandiff]
for method in [acceleration, jerk]:
    method_params_and_bounds[method] = method_params_and_bounds[velocity]
method_params_and_bounds[jerk_sliding] = method_params_and_bounds[smooth_acceleration]
for method in [constant_acceleration, constant_jerk]:
    method_params_and_bounds[method] = method_params_and_bounds[constant_velocity]
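# Illustrative sketch of the table's structure: method_params_and_bounds[savgoldiff] unpacks as
#     search_space = {'degree': [2, 3, 5, 7, 10], 'window_size': [...], 'smoothing_win': [...]}
#     bounds       = {'degree': (1, 12),  'window_size': (3, 1000), 'smoothing_win': (3, 1000)}
# optimize() seeds Nelder-Mead with the Cartesian product of the listed values and hands the
# bounds to scipy.optimize.minimize; set-valued (categorical) entries are enumerated exhaustively
# instead, and singleton entries are passed through unchanged.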


# This function has to be at the top level for multiprocessing, but it is only used by optimize.
def _objective_function(point, func, x, dt, singleton_params, categorical_params, search_space_types,
    dxdt_truth, metric, tvgamma, padding, cache):
    """Function minimized by scipy.optimize.minimize, which needs to have the form (point, *args) -> float.
    This is mildly complicated, because `point` controls the settings of a differentiation function, but
    the method may have numerical and non-numerical parameters, and all such parameters are passed by
    keyword argument. So the encoded `point` has to be decoded into a dict.

    :param np.array point: a numerical vector scipy chooses to try in the objective function
    :param dict singleton_params: maps parameter names to singleton values
    :param dict categorical_params: maps parameter names to categorical values
    :param dict search_space_types: maps parameter names to types, for turning the float vector point into the dict point_params
    :param multiprocessing.manager.dict cache: shared across processes to save results and work
    Other params are documented in `optimize`.

    :return: float, cost of the objective at this point
    """
    key = sha1((''.join(f"{v:.3e}" for v in point) + # this hash is stable across processes; sha1 takes bytes
               ''.join(str(v) for k,v in sorted(categorical_params.items()))).encode()).digest()
    if key in cache: return cache[key] # short-circuit if this hyperparam combo has already been queried, ~10% savings per #160

    point_params = {k:(v if search_space_types[k] == float else int(np.round(v)))
                        for k,v in zip(search_space_types, point)} # point -> dict
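    # For illustration (hypothetical values): with search_space_types = {'gamma': float, 'window_size': int}
    # and point = np.array([0.37, 30.2]), point_params decodes to {'gamma': 0.37, 'window_size': 30}.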
    # add back in the singletons and categoricals we're not searching over
    try: x_hat, dxdt_hat = func(x, dt, **point_params, **singleton_params, **categorical_params) # estimate x and dxdt
    except np.linalg.LinAlgError: cache[key] = 1e10; return 1e10 # some methods can fail numerically

    # evaluate the estimate
    if dxdt_truth is not None: # then minimize ||dxdt_hat - dxdt_truth||^2
        if metric == 'rmse':
            rms_rec_x, rms_x, rms_dxdt = evaluate.rmse(x, dt, x_hat, dxdt_hat, dxdt_truth=dxdt_truth, padding=padding)
            cache[key] = rms_dxdt; return rms_dxdt
        elif metric == 'error_correlation':
            ec = evaluate.error_correlation(dxdt_hat, dxdt_truth, padding=padding)
            cache[key] = ec; return ec
    else: # then minimize [ ||integral(dxdt_hat) - x||^2 + gamma*TV(dxdt_hat) ]
        rms_rec_x, rms_x, rms_dxdt = evaluate.rmse(x, dt, x_hat, dxdt_hat, dxdt_truth=None, padding=padding)
        cost = rms_rec_x + tvgamma*evaluate.total_variation(dxdt_hat, padding=padding)
        cache[key] = cost; return cost


def optimize(func, x, dt, dxdt_truth=None, tvgamma=1e-2, search_space_updates={}, metric='rmse',
    padding=0, opt_method='Nelder-Mead', maxiter=10):
    """Find the optimal parameters for a given differentiation method.

    :param function func: differentiation method to optimize parameters for, e.g. linear_model.savgoldiff
    :param np.array[float] x: data to differentiate
    :param float dt: step size
    :param np.array[float] dxdt_truth: actual time series of the derivative of x, if known
    :param float tvgamma: only used if :code:`dxdt_truth` is not given. Regularization value used to select for
                    parameters that yield a smooth derivative. A larger value results in a smoother derivative.
    :param dict search_space_updates: At the top of :code:`_optimize.py`, each method has a search space of parameter
                    settings structured as :code:`{param1:[values], param2:[values], param3:value, ...}`. The Cartesian
                    product of the values is used as the set of initial starting points in optimization. If left empty,
                    the default search space is used.
    :param str metric: either :code:`'rmse'` or :code:`'error_correlation'`; only applies if :code:`dxdt_truth`
                    is not None, see _objective_function
    :param int padding: number of time steps to ignore at the beginning and end of the time series in the
                    optimization, or :code:`'auto'` to ignore 2.5% at each end. A larger value causes the
                    optimization to emphasize the accuracy of dxdt in the middle of the time series
    :param str opt_method: optimization technique used by :code:`scipy.optimize.minimize`, the workhorse
    :param int maxiter: maximum number of iterations, passed down to :code:`scipy.optimize.minimize`

    :return: tuple[dict, float] of\n
            - **opt_params** -- best parameter settings for the differentiation method
            - **opt_value** -- lowest value found for the objective function
    """
    if metric not in ['rmse','error_correlation']:
        raise ValueError('`metric` should either be `rmse` or `error_correlation`.')
    if metric == 'error_correlation' and dxdt_truth is None:
        raise ValueError('`metric` can only be `error_correlation` if `dxdt_truth` is given.')

    search_space, bounds = method_params_and_bounds[func]
    search_space = {**search_space, **search_space_updates} # for things not given, use defaults; merge into a copy so the module-level table isn't mutated

    # No need to optimize over singletons, just pass them through
    singleton_params = {k:v for k,v in search_space.items() if not isinstance(v, (list, set))}

    # To handle categoricals, find their combinations, and then pass each combination individually
    categorical_params = {k for k,v in search_space.items() if isinstance(v, set)}
    categorical_combos = [dict(zip(categorical_params, combo)) for combo in
        product(*[search_space[k] for k in categorical_params])] # ends up [{}] if there are no categorical params
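    # For illustration (hypothetical values): with search_space = {'forwardbackward': {True, False}, 'order': {1, 2}},
    # categorical_combos enumerates all four {'forwardbackward': ..., 'order': ...} dicts. Set iteration
    # order may vary between runs, but it is consistent between the two comprehensions above.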

    # The Nelder-Mead search space consists of the dimensions where multiple numerical options are given in a list
    search_space_types = {k:type(v[0]) for k,v in search_space.items() if isinstance(v, list)} # map param name -> type, for converting to and from point
    if any(v not in [float, int] for v in search_space_types.values()):
        raise ValueError("To optimize over categorical strings or bools, put them in a set, not a list.")
    # Cast ints to floats, and we're good to go
    starting_points = list(product(*[np.array(search_space[k]).astype(float) for k in search_space_types]))
    # The numerical space should have bounds
    bounds = [bounds[k] if k in bounds else # pass these to minimize(). It should respect them.
            None for k in search_space_types] # None means no bound on a dimension

    results = []
    filterwarnings("ignore", '', UserWarning) # an extra filtering call, because some of the workers' work can actually be done in the main process
    with Manager() as manager:
        cache = manager.dict() # cache answers to avoid expensive repeat queries
        with Pool(initializer=filterwarnings, initargs=["ignore", '', UserWarning]) as pool: # the heavy lifting
            for categorical_combo in categorical_combos:
                # wrap the objective and scipy.optimize.minimize to pass kwargs and a host of other things that remain the same
                _obj_fun = partial(_objective_function, func=func, x=x, dt=dt, singleton_params=singleton_params,
                    categorical_params=categorical_combo, search_space_types=search_space_types, dxdt_truth=dxdt_truth,
                    metric=metric, tvgamma=tvgamma, padding=padding, cache=cache)
                _minimize = partial(scipy.optimize.minimize, _obj_fun, method=opt_method, bounds=bounds, options={'maxiter':maxiter})
                results += pool.map(_minimize, starting_points) # returns a bunch of OptimizeResult objects

    opt_idx = np.nanargmin([r.fun for r in results])
    opt_point = results[opt_idx].x
    # results are going to be floats, but that may not be allowed, so convert back to a dict
    opt_params = {k:(v if search_space_types[k] == float else
                    int(np.round(v)) if search_space_types[k] == int else
                    v > 0.5) for k,v in zip(search_space_types, opt_point)}
    opt_params.update(singleton_params)
    opt_params.update(categorical_combos[opt_idx//len(starting_points)]) # there are |starting_points| results for each combo

    return opt_params, results[opt_idx].fun


def suggest_method(x, dt, dxdt_truth=None, cutoff_frequency=None):
    """This is meant as an easy-to-use, automatic way for users with some time on their hands to determine
    a good method and settings for their data. It calls the optimizer over (almost) all methods in the repo
    using the default search spaces defined at the top of the :code:`pynumdiff/optimize/_optimize.py` file.
    This routine will take a few minutes to run, especially due to `robustdiff`.

    Excluded:
        - ``first_order``, because iterating causes drift
        - ``lineardiff``, ``iterative_velocity``, and ``jerk_sliding``, because they either take too long,
          can be fragile, or tend not to do best
        - all ``cvxpy``-based methods if it is not installed
        - ``velocity``, because it tends not to be best but dominates the optimization process by directly
          optimizing the second term of the metric
          :math:`L = \\text{RMSE}\\Big(\\text{trapz}\\big(\\mathbf{\\hat{\\dot{x}}}(\\Phi)\\big) + \\mu,\\ \\mathbf{y}\\Big) + \\gamma\\,\\text{TV}\\big(\\mathbf{\\hat{\\dot{x}}}(\\Phi)\\big)`

    :param np.array[float] x: data to differentiate
    :param float dt: step size, because most methods are not designed to work with variable step sizes
    :param np.array[float] dxdt_truth: if known, you can pass true derivative values; otherwise you must use
            :code:`cutoff_frequency`
    :param float cutoff_frequency: in Hz, the highest dominant frequency of interest in the signal,
            used to find the parameter :math:`\\gamma` for regularization of the optimization process
            in the absence of ground truth. See https://ieeexplore.ieee.org/document/9241009.
            Estimate it by (a) counting the number of peaks per second in the data, (b) looking at the
            power spectrum and choosing a cutoff, or (c) making an educated guess.

    :return: tuple[callable, dict] of\n
            - **method** -- a reference to the function handle of the differentiation method that worked best
            - **opt_params** -- optimal parameter settings for the differentiation method
    """
    tvgamma = None
    if dxdt_truth is None: # parameter checking
        if cutoff_frequency is None:
            raise ValueError('Either dxdt_truth or cutoff_frequency must be provided.')
        tvgamma = np.exp(-1.6*np.log(cutoff_frequency) - 0.71*np.log(dt) - 5.1) # see https://ieeexplore.ieee.org/document/9241009
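        # Worked example (illustrative): with cutoff_frequency=1 Hz and dt=0.01 s,
        # tvgamma = exp(-1.6*ln(1) - 0.71*ln(0.01) - 5.1) = exp(3.27 - 5.1) ≈ 0.16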

    methods = [meandiff, mediandiff, gaussiandiff, friedrichsdiff, butterdiff,
        polydiff, savgoldiff, splinediff, spectraldiff, rbfdiff, finitediff, rtsdiff]
    try: # optionally skip some methods
        import cvxpy
        methods += [tvrdiff, smooth_acceleration, robustdiff]
    except ImportError:
        warn("CVXPY not installed, skipping tvrdiff, smooth_acceleration, and robustdiff")

    best_value = float('inf') # core loop
    for func in tqdm(methods):
        p, v = optimize(func, x, dt, dxdt_truth=dxdt_truth, tvgamma=tvgamma, search_space_updates=(
            {'order':{2,3}} if func in [tvrdiff, robustdiff] else {})) # the convex methods with order 1 hack the cost function
        if v < best_value:
            method = func
            best_value = v
            opt_params = p

    return method, opt_params