• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

wwu-mmll / photonai / 14280849012

04 Nov 2024 01:43PM UTC coverage: 91.073%. Remained the same
14280849012

push

github

web-flow
Merge pull request #89 from wwu-mmll/develop

Develop

93 of 98 new or added lines in 10 files covered. (94.9%)

110 existing lines in 7 files now uncovered.

5815 of 6385 relevant lines covered (91.07%)

0.91 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

96.77
/photonai/processing/outer_folds.py
1
import datetime
1✔
2
import warnings
1✔
3
import numpy as np
1✔
4
import json
1✔
5

6
from photonai.helper.helper import PhotonDataHelper, print_double_metrics, print_metrics
1✔
7
from photonai.optimization import DummyPerformanceConstraint
1✔
8
from photonai.photonlogger.logger import logger
1✔
9
from photonai.processing.inner_folds import InnerFoldManager
1✔
10
from photonai.processing.photon_folds import FoldInfo
1✔
11
from photonai.processing.results_structure import MDBInnerFold, MDBScoreInformation
1✔
12
from photonai.processing.metrics import Scorer
1✔
13
from photonai.optimization.base_optimizer import PhotonSlaveOptimizer, PhotonMasterOptimizer
1✔
14

15
warnings.filterwarnings('ignore', category=DeprecationWarning)
1✔
16
warnings.filterwarnings('ignore', category=FutureWarning)
1✔
17

18

19
class OuterFoldManager:

    """Outer Fold manager.

    Controls the tasks over a specified Outer Fold.
    It is responsible for generating the split in the outer folds
    and triggering the hyperparameter optimization process.
    An Objective Function is provided for this purpose.
    This defines a black box function over the outer_fold data.

    Parameters
    ----------
    pipe: PhotonPipeline
        Defined pipeline structure for optimization.

    optimization_info: Optimization
        Contains the information how the black box function is solved.
        Depending on the algorithms and metrics, the objective function is adapted to it.

    outer_fold_id: UUID
        Unique ID for this object.

    cross_validation_info: CrossValidation
        Holds the outer/inner fold definitions (train/test indices per fold)
        that this manager reads and writes.

    cache_folder: str or None, default=None
        Folder for storing information in multiprocessing case.

    cache_updater, default=None
        The object that takes active access to the cache structure.
        Only in the multiprocess case.

    dummy_estimator: DummyClassifier, DummyRegressor or None, default=None
        To be able to classify the results,
        they are compared against a dummy performance.
        Since there are exceptions to the calculation,
        this does not necessarily have to be passed.

    result_obj: MDBOuterFold, default=None
        Contains the memory structure for this object.
        Results are written here during the running process.

    raise_error: bool, default=False
        Passed through to the inner fold computation; controls whether
        errors during a configuration's evaluation are raised.

    score_train: bool, default=True
        Whether training data is scored in addition to validation/test data.

    """

    def __init__(self, pipe,
                 optimization_info,
                 outer_fold_id,
                 cross_validation_info,
                 cache_folder=None,
                 cache_updater=None,
                 dummy_estimator=None,
                 result_obj=None,
                 raise_error=False,
                 score_train: bool = True):
        self.outer_fold_id = outer_fold_id
        self.cross_validation_info = cross_validation_info
        # scorer is built once from the requested metrics and reused for all configs
        self.scorer = Scorer(optimization_info.metrics)
        self.optimization_info = optimization_info
        self._pipe = pipe
        # factory used to create fresh pipeline copies per configuration
        self.copy_pipe_fnc = self._pipe.copy_me
        self.dummy_estimator = dummy_estimator
        self.score_train = score_train
        self.raise_error = raise_error

        self.cache_folder = cache_folder
        self.cache_updater = cache_updater

        # Information about the optimization progress
        self.current_best_config = None
        self.optimizer = None
        self.constraint_objects = None

        # data (filled by _prepare_data when fit() is called)
        self.result_object = result_obj
        self.inner_folds = None
        self._validation_X = None
        self._validation_y = None
        self._validation_kwargs = None
        self._test_X = None
        self._test_y = None
        self._test_kwargs = None
97
    def _prepare_optimization(self):
        """Instantiate the optimizer and hand it the pipeline's elements.

        Master optimizers additionally receive the objective function since
        they drive the evaluation loop themselves; slave optimizers are
        iterated externally. Also resets the tested-config list and copies
        the performance constraints for this fold.
        """
        logger.info("Preparing Hyperparameter Optimization...")
        elements = [element for _, element in self._pipe.elements]

        self.optimizer = self.optimization_info.get_optimizer()
        if isinstance(self.optimizer, PhotonMasterOptimizer):
            self.optimizer.prepare(elements,
                                   self.optimization_info.maximize_metric,
                                   self.objective_function)
        else:
            self.optimizer.prepare(elements, self.optimization_info.maximize_metric)

        # pymodm quirk: information from the previous outer fold can linger
        # inside a freshly instantiated OuterFoldMDB object, so the tested
        # config list is cleared explicitly here.
        self.result_object.tested_config_list = []

        # Copy constraint objects so each outer fold works on its own instances.
        constraints = self.optimization_info.performance_constraints
        if constraints is None:
            self.constraint_objects = None
        elif isinstance(constraints, list):
            self.constraint_objects = [c.copy_me() for c in constraints]
        else:
            self.constraint_objects = [constraints.copy_me()]
122

123
    def _prepare_data(self, X, y=None, **kwargs):
        """Split X, y and kwargs into this outer fold's train and test partitions.

        Sample counts (and, for classifiers, class distributions) are recorded
        in the result tree.
        """
        fold_info = self.cross_validation_info.outer_folds[self.outer_fold_id]
        logger.info("Preparing data for outer fold " + str(fold_info.fold_nr) + "...")

        # One split for the training/validation partition, one for the test partition.
        split = PhotonDataHelper.split_data
        self._validation_X, self._validation_y, self._validation_kwargs = split(
            X, y, kwargs, indices=fold_info.train_indices)
        self._test_X, self._test_y, self._test_kwargs = split(
            X, y, kwargs, indices=fold_info.test_indices)

        # write numbers to database info object
        self.result_object.number_samples_validation = self._validation_y.shape[0]
        self.result_object.number_samples_test = self._test_y.shape[0]
        if self._pipe._estimator_type == "classifier":
            self.result_object.class_distribution_validation = FoldInfo.data_overview(self._validation_y)
            self.result_object.class_distribution_test = FoldInfo.data_overview(self._test_y)
138

139
    def _generate_inner_folds(self):
        """Generate the inner CV folds for this outer fold's training data.

        The resulting folds are stored both on the instance and, keyed by
        fold id, in the shared cross-validation info object.
        """
        folds = FoldInfo.generate_folds(self.cross_validation_info.inner_cv,
                                        self._validation_X,
                                        self._validation_y,
                                        self._validation_kwargs)
        self.inner_folds = folds

        fold_map = {}
        for fold in folds:
            fold_map[fold.fold_id] = fold
        self.cross_validation_info.inner_folds[self.outer_fold_id] = fold_map
147

148
    def fit(self, X, y=None, **kwargs):
        """Run the hyperparameter optimization for this outer fold.

        Splits the data into this fold's train/test partitions, fits the dummy
        estimator, generates the inner folds, then evaluates configurations
        until the optimizer is done. The best configuration is refit on the
        full training partition and, if ``use_test_set`` is set, scored on the
        outer test set. All results are written to ``self.result_object``.

        Parameters
        ----------
        X: array-like
            Complete dataset; the outer fold's indices select train/test rows.
        y: array-like, default=None
            Targets belonging to X.
        **kwargs
            Additional data split alongside X and y.

        Raises
        ------
        Exception
            If configurations were tested but no best configuration was found.
        """
        logger.photon_system_log('')
        logger.stars()
        logger.photon_system_log('Outer Cross validation Fold {}'.format(self.cross_validation_info.outer_folds[self.outer_fold_id].fold_nr))
        logger.stars()

        self._prepare_data(X, y, **kwargs)
        self._prepare_optimization()
        self._fit_dummy()
        self._generate_inner_folds()

        outer_fold_fit_start_time = datetime.datetime.now()
        self.best_metric_yet = None
        self.tested_config_counter = 0

        # distribute number of folds to encapsulated child hyperpipes
        # self.__distribute_cv_info_to_hyperpipe_children(num_of_folds=num_folds,
        #                                                 outer_fold_counter=outer_fold_counter)

        # "mean" aggregates metrics per inner fold first; "raw" pools values across folds
        if self.cross_validation_info.calculate_metrics_per_fold:
            self.fold_operation = "mean"
        else:
            self.fold_operation = "raw"

        self.max_nr_of_configs = ''
        if hasattr(self.optimizer, 'n_configurations'):
            self.max_nr_of_configs = str(self.optimizer.n_configurations)

        if isinstance(self.optimizer, PhotonMasterOptimizer):
            # master optimizers drive the loop themselves via self.objective_function
            self.optimizer.optimize()
        else:
            # do the optimizing: slave optimizers are iterated here
            for current_config in self.optimizer.ask:
                self.objective_function(current_config)

        logger.line()
        logger.info('Hyperparameter Optimization finished. Now finding best configuration .... ')
        logger.info(self.tested_config_counter)
        # now go on with the best config found
        if self.tested_config_counter > 0:
            best_config_outer_fold = self.result_object.get_optimum_config(self.optimization_info.best_config_metric,
                                                                           self.optimization_info.maximize_metric,
                                                                           fold_operation=self.fold_operation)
            # BUGFIX: validate the optimum config BEFORE dereferencing it. Previously this
            # check came after json.dumps(best_config_outer_fold.human_readable_config), so a
            # None result raised AttributeError instead of this explicit error message.
            if not best_config_outer_fold:
                raise Exception("No best config was found!")

            # inform user
            logger.debug('Optimizer metric: ' + self.optimization_info.best_config_metric + '\n' +
                         '   --> Maximize metric: ' + str(self.optimization_info.maximize_metric))

            logger.system_line()
            logger.photon_system_log('BEST_CONFIG ')
            logger.system_line()
            logger.photon_system_log(json.dumps(best_config_outer_fold.human_readable_config, indent=4,
                                                sort_keys=True))
            logger.system_line()
            logger.photon_system_log('VALIDATION PERFORMANCE')
            logger.system_line()
            print_double_metrics(best_config_outer_fold.get_train_metric(operation="mean"),
                                 best_config_outer_fold.get_test_metric(operation="mean"))

            # ... and create optimal pipeline
            optimum_pipe = self.copy_pipe_fnc()
            if self.cache_updater is not None:
                self.cache_updater(optimum_pipe, self.cache_folder, "fixed_fold_id")
            optimum_pipe.caching = False
            # set self to best config
            optimum_pipe.set_params(**best_config_outer_fold.config_dict)

            # Todo: set all children to best config and inform to NOT optimize again, ONLY fit
            # for child_name, child_config in best_config_outer_fold_mdb.children_config_dict.items():
            #     if child_config:
            #         # in case we have a pipeline stacking we need to identify the particular subhyperpipe
            #         splitted_name = child_name.split('__')
            #         if len(splitted_name) > 1:
            #             stacking_element = self.optimum_pipe.named_steps[splitted_name[0]]
            #             pipe_element = stacking_element.elements[splitted_name[1]]
            #         else:
            #             pipe_element = self.optimum_pipe.named_steps[child_name]
            #         pipe_element.set_params(**child_config)
            #         pipe_element.is_final_fit = True

            # self.__distribute_cv_info_to_hyperpipe_children(reset=True)

            logger.debug('Fitting model with best configuration of outer fold...')
            optimum_pipe.fit(self._validation_X, self._validation_y, **self._validation_kwargs)

            self.result_object.best_config = best_config_outer_fold

            # save test performance; fold_nr -99 marks the "best config" pseudo-fold
            best_config_performance_mdb = MDBInnerFold()
            best_config_performance_mdb.fold_nr = -99
            best_config_performance_mdb.number_samples_training = self._validation_y.shape[0]
            best_config_performance_mdb.number_samples_validation = self._test_y.shape[0]
            best_config_performance_mdb.feature_importances = optimum_pipe.feature_importances_

            if self.cross_validation_info.use_test_set:
                # Todo: generate mean and std over outer folds as well. move this items to the top
                logger.info('Calculating best model performance on test set...')

                logger.debug('...scoring test data')
                test_score_mdb = InnerFoldManager.score(optimum_pipe, self._test_X, self._test_y,
                                                        indices=self.cross_validation_info.outer_folds[self.outer_fold_id].test_indices,
                                                        metrics=self.optimization_info.metrics,
                                                        scorer=self.scorer,
                                                        **self._test_kwargs)

                logger.debug('... scoring training data')

                train_score_mdb = InnerFoldManager.score(optimum_pipe, self._validation_X, self._validation_y,
                                                         indices=self.cross_validation_info.outer_folds[self.outer_fold_id].train_indices,
                                                         metrics=self.optimization_info.metrics,
                                                         training=True,
                                                         scorer=self.scorer,
                                                         score_train=self.score_train,
                                                         **self._validation_kwargs)

                best_config_performance_mdb.training = train_score_mdb
                best_config_performance_mdb.validation = test_score_mdb

                logger.system_line()
                logger.photon_system_log('TEST PERFORMANCE')
                logger.system_line()
                print_double_metrics(train_score_mdb.metrics, test_score_mdb.metrics)
            else:

                def _copy_inner_fold_means(metric_dict):
                    # We copy all mean values from validation to the best config
                    # (no dedicated test set: inner-fold aggregates stand in for it)
                    train_item_metrics = {}
                    for m in metric_dict:
                        if m.operation == str(self.fold_operation):
                            train_item_metrics[m.metric_name] = m.value
                    train_item = MDBScoreInformation()
                    train_item.metrics_copied_from_inner = True
                    train_item.metrics = train_item_metrics
                    return train_item

                # training
                best_config_performance_mdb.training = _copy_inner_fold_means(best_config_outer_fold.metrics_train)
                # validation
                best_config_performance_mdb.validation = _copy_inner_fold_means(best_config_outer_fold.metrics_test)

            # write best config performance to best config item
            self.result_object.best_config.best_config_score = best_config_performance_mdb

        logger.info('Computations in outer fold {} took {} minutes.'.format(
            self.cross_validation_info.outer_folds[self.outer_fold_id].fold_nr,
            (datetime.datetime.now() - outer_fold_fit_start_time).total_seconds() / 60))
297

298
    def objective_function(self, current_config):
        """Evaluate one hyperparameter configuration on the inner folds.

        Runs the inner cross validation for ``current_config``, tracks the
        best configuration seen so far, appends the result to the result tree
        and informs slave optimizers about the achieved performance.

        Returns ``1 - validation_metric`` when the metric is maximized,
        otherwise the raw validation metric, so the value can always be
        minimized by the caller. Returns None for a None config.
        """
        if current_config is None:
            return
        logger.line()
        self.tested_config_counter += 1

        # Some optimizers supply their own pipeline constructor; fall back to copying ours.
        if hasattr(self.optimizer, 'ask_for_pipe'):
            pipe_constructor = self.optimizer.ask_for_pipe()
        else:
            pipe_constructor = self.copy_pipe_fnc

        # self.__distribute_cv_info_to_hyperpipe_children(reset=True, config_counter=tested_config_counter)

        inner_fold_manager = InnerFoldManager(pipe_constructor, current_config,
                                              self.optimization_info,
                                              self.cross_validation_info, self.outer_fold_id,
                                              self.constraint_objects,
                                              cache_folder=self.cache_folder,
                                              cache_updater=self.cache_updater,
                                              scorer=self.scorer,
                                              raise_error=self.raise_error)

        # Cross-validate the configuration on the inner_cv splits.
        current_config_mdb = inner_fold_manager.fit(self._validation_X, self._validation_y,
                                                    **self._validation_kwargs)
        current_config_mdb.config_nr = self.tested_config_counter

        if not current_config_mdb.config_failed:
            train_value = current_config_mdb.get_train_metric(self.optimization_info.best_config_metric,
                                                              self.fold_operation)
            test_value = current_config_mdb.get_test_metric(self.optimization_info.best_config_metric,
                                                            self.fold_operation)

            if train_value is None or test_value is None:
                raise Exception("Config did not fail, but did not get any metrics either....!!?")
            config_performance = (train_value, test_value)

            if self.best_metric_yet is None:
                # first successful config automatically becomes the current best
                self.best_metric_yet = config_performance
                self.current_best_config = current_config_mdb
            else:
                # check if this config beats the best validation performance so far
                if self.optimization_info.maximize_metric:
                    improved = test_value > self.best_metric_yet[1]
                else:
                    improved = test_value < self.best_metric_yet[1]

                if improved:
                    self.best_metric_yet = config_performance
                    # drop the heavy payload of the dethroned best config
                    self.current_best_config.decrease_memory()
                    self.current_best_config = current_config_mdb
                else:
                    current_config_mdb.decrease_memory()

            # report this configuration's result
            duration = current_config_mdb.computation_end_time - current_config_mdb.computation_start_time
            logger.info('Computed configuration ' + str(self.tested_config_counter) + "/" + self.max_nr_of_configs +
                        " in " + str(duration))
            logger.info("Performance:             " + self.optimization_info.best_config_metric
                        + " - Train: " + "%.4f" % config_performance[0] + ", Validation: " + "%.4f" %
                        config_performance[1])
            logger.info("Best Performance So Far: " + self.optimization_info.best_config_metric
                        + " - Train: " + "%.4f" % self.best_metric_yet[0] + ", Validation: "
                        + "%.4f" % self.best_metric_yet[1])
        else:
            # failed configs get a sentinel performance and are still recorded
            config_performance = (-1, -1)
            logger.debug('...failed:')
            logger.error(current_config_mdb.config_error)

        # add config to result tree
        self.result_object.tested_config_list.append(current_config_mdb)

        # inform the optimizer about the achieved performance
        logger.debug("Telling hyperparameter optimizer about recent performance.")
        if isinstance(self.optimizer, PhotonSlaveOptimizer):
            self.optimizer.tell(current_config, config_performance[1])
        logger.debug("Asking hyperparameter optimizer for new config.")

        if self.optimization_info.maximize_metric:
            return 1 - config_performance[1]
        else:
            return config_performance[1]
381

382
    def _fit_dummy(self):
        """Fit and score the dummy estimator as a baseline for this outer fold.

        Best-effort: any exception merely skips the dummy (logged, not raised).
        Returns the MDBInnerFold with the dummy scores, or None when skipped.
        """
        if self.dummy_estimator is not None:
            logger.info("Running Dummy Estimator...")
            try:
                # dummy baseline only supports <= 2-dimensional numpy input
                if isinstance(self._validation_X, np.ndarray):
                    if len(self._validation_X.shape) > 2:
                        logger.info("Skipping dummy estimator because of too many dimensions")
                        self.result_object.dummy_results = None
                        return
                # the dummy ignores features, so the reshaped targets serve as X
                dummy_y = np.reshape(self._validation_y, (-1, 1))
                self.dummy_estimator.fit(dummy_y, self._validation_y)
                train_scores = InnerFoldManager.score(self.dummy_estimator, self._validation_X, self._validation_y,
                                                      training=True,
                                                      dummy=True,
                                                      metrics=self.optimization_info.metrics,
                                                      score_train=self.score_train,
                                                      scorer=self.scorer)

                # fill result tree with fold information
                inner_fold = MDBInnerFold()
                inner_fold.training = train_scores

                # score the dummy on the outer test set as well, if one is used
                if self.cross_validation_info.use_test_set:
                    test_scores = InnerFoldManager.score(self.dummy_estimator,
                                                         self._test_X, self._test_y,
                                                         metrics=self.optimization_info.metrics,
                                                         score_train=self.score_train,
                                                         scorer=self.scorer)
                    print_metrics("DUMMY", test_scores.metrics)
                    inner_fold.validation = test_scores

                self.result_object.dummy_results = inner_fold

                # performance constraints: hand the dummy results to any
                # DummyPerformanceConstraint so it can set its threshold
                if self.constraint_objects is not None:
                    dummy_constraint_objs = [opt for opt in self.constraint_objects
                                             if isinstance(opt, DummyPerformanceConstraint)]

                    if dummy_constraint_objs:
                        for dummy_constraint_obj in dummy_constraint_objs:
                            dummy_constraint_obj.set_dummy_performance(self.result_object.dummy_results)

                return inner_fold
            except Exception as e:
                # deliberate best-effort: a failing dummy must not abort the outer fold
                logger.error(e)
                logger.info("Skipping dummy because of error..")
                return None
        else:
            logger.info("Skipping dummy ..")
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2026 Coveralls, Inc