/CompStats/performance.py
# Copyright 2024 Sergio Nava Muñoz and Mario Graff Guerrero

# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at

#     http://www.apache.org/licenses/LICENSE-2.0

# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from sklearn.metrics import accuracy_score
from sklearn.base import clone
from typing import List, Callable
import pandas as pd
import numpy as np
import seaborn as sns
import math
from CompStats.bootstrap import StatisticSamples
from CompStats.utils import progress_bar
from CompStats import measurements
import matplotlib.pyplot as plt
from statsmodels.stats.multitest import multipletests


def performance(data: pd.DataFrame,
                gold: str='y',
                score: Callable[[np.ndarray, np.ndarray], float]=accuracy_score,
                num_samples: int=500,
                n_jobs: int=-1,
                BiB: bool=True,
                statistic_samples: StatisticSamples=None) -> StatisticSamples:
    """Calculate bootstrap samples of a performance score for a given dataset.

    Parameters:
    data (pd.DataFrame): Input dataset.
    gold (str, optional): Column name of the ground truth or target variable. Defaults to 'y'.
    score (Callable, optional): Performance score function. Defaults to accuracy_score.
    num_samples (int, optional): Number of bootstrap samples. Defaults to 500.
    n_jobs (int, optional): Number of jobs to run in parallel. Defaults to -1.
    BiB (bool, optional): Whether the metric is Bigger is Better. Defaults to True.
    statistic_samples (StatisticSamples, optional): Pre-initialized StatisticSamples object. Defaults to None.

    Returns:
    StatisticSamples: Object containing the bootstrap samples of the performance score.

    Example usage:

    >>> from sklearn.metrics import accuracy_score
    >>> import pandas as pd
    >>> from CompStats import performance
    >>> df = pd.read_csv('path/to/data.csv')
    >>> perf = performance(df, gold='y', score=accuracy_score, num_samples=1000)
    """
    if statistic_samples is None:
        statistic_samples = StatisticSamples(statistic=score, num_samples=num_samples,
                                             n_jobs=n_jobs, BiB=BiB)
    columns = data.columns
    y = data[gold]
    for column in progress_bar(columns):
        if column == gold:
            continue
        statistic_samples(y, data[column], name=column)
    return statistic_samples


def difference(statistic_samples: StatisticSamples):
    """
    Computes the difference in performance between the best performing algorithm and the others using bootstrap samples.

    Parameters:
    statistic_samples (StatisticSamples): An instance of StatisticSamples containing the performance data.

    Returns:
    StatisticSamples: A new instance of StatisticSamples with the computed differences and information about the best algorithm.

    The function works as follows:
    1. Determines the index of the best performing algorithm based on the BiB attribute.
    2. Extracts and calculates the mean performance for each algorithm.
    3. Sorts the algorithms by their mean performance.
    4. Identifies the best performing algorithm.
    5. Computes the difference in performance between the best algorithm and each other algorithm.
    6. Returns a new StatisticSamples instance with the computed differences and the name of the best performing algorithm.

    Example usage:

    >>> from CompStats import performance, difference
    >>> from CompStats.tests.test_performance import DATA
    >>> from sklearn.metrics import f1_score
    >>> import pandas as pd
    >>> df = pd.read_csv(DATA)
    >>> score = lambda y, hy: f1_score(y, hy, average='weighted')
    >>> perf = performance(df, score=score)
    >>> diff = difference(perf)
    """
    best_index = -1 if statistic_samples.BiB else 0
    items = list(statistic_samples.calls.items())
    perf = [(k, v, np.mean(v)) for k, v in items]
    perf.sort(key=lambda x: x[-1])
    best_name, best_perf, _ = perf[best_index]
    diff = {}
    for alg, alg_perf, _ in perf:
        if alg == best_name:
            continue
        diff[alg] = best_perf - alg_perf
    output = clone(statistic_samples)
    output.calls = diff
    output.info['best'] = best_name
    return output


def all_differences(statistic_samples: StatisticSamples):
    """
    Calculates all possible differences in performance among algorithms and sorts them by average performance.

    Parameters:
    statistic_samples (StatisticSamples): An instance of StatisticSamples containing the performance data.

    Returns:
    StatisticSamples: A new instance of StatisticSamples with the computed performance differences among all algorithms.

    The function works as follows:
    1. Extracts the performance data for each algorithm.
    2. Calculates the mean performance for each algorithm and sorts the algorithms based on their mean performance.
    3. Iterates over all possible pairs of algorithms.
    4. Computes the difference in performance for each pair and stores it in a dictionary.
    5. Returns a new StatisticSamples instance with the computed differences.

    Example usage:

    >>> from CompStats import performance, all_differences
    >>> from CompStats.tests.test_performance import DATA
    >>> from sklearn.metrics import f1_score
    >>> import pandas as pd
    >>> df = pd.read_csv(DATA)
    >>> score = lambda y, hy: f1_score(y, hy, average='weighted')
    >>> perf = performance(df, score=score)
    >>> all_diff = all_differences(perf)
    """
    items = list(statistic_samples.calls.items())
    # Compute the mean performance and sort the algorithms by it
    perf = [(k, v, np.mean(v)) for k, v in items]
    perf.sort(key=lambda x: x[2], reverse=statistic_samples.BiB)  # Sort by mean performance

    diffs = {}  # Dictionary to store the differences

    # Iterate over all possible pairs of the sorted algorithms
    for i in range(len(perf)):
        for j in range(i + 1, len(perf)):
            name_i, perf_i, _ = perf[i]
            name_j, perf_j, _ = perf[j]

            # Difference from i to j
            diff_key_i_to_j = f"{name_i} - {name_j}"
            diffs[diff_key_i_to_j] = np.array(perf_i) - np.array(perf_j)
    output = clone(statistic_samples)
    output.calls = diffs
    return output


def plot_performance(statistic_samples: StatisticSamples, CI: float=0.05,
                     var_name='Algorithm', value_name='Score',
                     capsize=0.2, linestyle='none', kind='point',
                     sharex=False, **kwargs):
    """Plots the performance of algorithms with confidence intervals.

    :param statistic_samples: An instance of StatisticSamples containing the performance data, or a DataFrame in long format.
    :type statistic_samples: StatisticSamples or pd.DataFrame
    :param CI: Confidence interval level (default is 0.05).
    :type CI: float
    :param var_name: Variable name for algorithms (default is 'Algorithm').
    :type var_name: str
    :param value_name: Variable name for scores (default is 'Score').
    :type value_name: str
    :param capsize: Size of the caps on error bars (default is 0.2).
    :type capsize: float
    :param linestyle: Line style for the plot (default is 'none').
    :type linestyle: str
    :param kind: Type of plot (default is 'point').
    :type kind: str
    :param sharex: Whether to share the x-axis among subplots (default is False).
    :type sharex: bool
    :param kwargs: Additional keyword arguments passed to seaborn's catplot function.

    :returns: A seaborn FacetGrid object containing the plot.
    :rtype: sns.axisgrid.FacetGrid

    The function works as follows:
    1. If statistic_samples is an instance of StatisticSamples, it extracts and sorts the performance data.
    2. Converts the data into a long format DataFrame.
    3. Computes the confidence intervals if CI is provided as a float.
    4. Plots the performance data with confidence intervals using seaborn's catplot.

    >>> from CompStats import performance, plot_performance
    >>> from CompStats.tests.test_performance import DATA
    >>> from sklearn.metrics import f1_score
    >>> import pandas as pd
    >>> df = pd.read_csv(DATA)
    >>> score = lambda y, hy: f1_score(y, hy, average='weighted')
    >>> perf = performance(df, score=score)
    >>> ins = plot_performance(perf)
    """

    if isinstance(statistic_samples, StatisticSamples):
        lista_ordenada = sorted(statistic_samples.calls.items(), key=lambda x: np.mean(x[1]), reverse=statistic_samples.BiB)
        diccionario_ordenado = {nombre: muestras for nombre, muestras in lista_ordenada}
        df2 = pd.DataFrame(diccionario_ordenado).melt(var_name=var_name,
                                                      value_name=value_name)
    else:
        df2 = statistic_samples
    # When CI is not a float, pass it through unchanged as seaborn's errorbar spec
    ci = CI
    if isinstance(CI, float):
        ci = lambda x: measurements.CI(x, alpha=CI)
    f_grid = sns.catplot(df2, x=value_name, y=var_name,
                         capsize=capsize, linestyle=linestyle,
                         kind=kind, errorbar=ci, sharex=sharex, **kwargs)
    return f_grid


def plot_difference(statistic_samples: StatisticSamples, CI: float=0.05,
                    var_name='Comparison', value_name='Difference',
                    set_refline=True, set_title=True,
                    hue='Significant', palette=None,
                    **kwargs):
    """
    Plot the difference in performance with its confidence intervals.

    Parameters:
    statistic_samples (StatisticSamples): An instance of StatisticSamples containing the performance data.
    CI (float, optional): Confidence interval level. Defaults to 0.05.
    var_name (str, optional): Variable name for the comparisons. Defaults to 'Comparison'.
    value_name (str, optional): Variable name for the differences. Defaults to 'Difference'.
    set_refline (bool, optional): Whether to set a reference line at x=0. Defaults to True.
    set_title (bool, optional): Whether to set the title of the plot with the best performing algorithm. Defaults to True.
    hue (str or None, optional): Column name for hue encoding. Defaults to 'Significant'.
    palette (list or None, optional): Colors to use for different hue levels. Defaults to None.
    **kwargs: Additional keyword arguments passed to the plot_performance function.

    Returns:
    sns.axisgrid.FacetGrid: A seaborn FacetGrid object containing the plot.

    The function works as follows:
    1. Converts the differences stored in statistic_samples into a long format DataFrame.
    2. Adds a 'Significant' column to indicate whether the confidence interval excludes zero.
    3. Plots the differences with confidence intervals using the plot_performance function.
    4. Optionally sets a reference line at x=0 and a title indicating the best performing algorithm.

    >>> from CompStats import performance, difference, plot_difference
    >>> from CompStats.tests.test_performance import DATA
    >>> from sklearn.metrics import f1_score
    >>> import pandas as pd
    >>> df = pd.read_csv(DATA)
    >>> score = lambda y, hy: f1_score(y, hy, average='weighted')
    >>> perf = performance(df, score=score)
    >>> diff = difference(perf)
    >>> ins = plot_difference(diff)
    """
    if isinstance(statistic_samples, StatisticSamples):
        lista_ordenada = sorted(statistic_samples.calls.items(), key=lambda x: np.mean(x[1]), reverse=statistic_samples.BiB)
        diccionario_ordenado = {nombre: muestras for nombre, muestras in lista_ordenada}
        df2 = pd.DataFrame(diccionario_ordenado).melt(var_name=var_name,
                                                      value_name=value_name)
    if hue is not None:
        df2[hue] = True
    at_least_one = False
    for key, (left, right) in measurements.CI(statistic_samples, alpha=CI).items():
        if left < 0 < right:
            rows = df2[var_name] == key
            df2.loc[rows, hue] = False
            at_least_one = True
    # Only pick default colors when the caller did not provide a palette
    if palette is None:
        palette = ['r', 'b'] if at_least_one else ['b']
    f_grid = plot_performance(df2, CI=CI, var_name=var_name,
                              value_name=value_name, hue=hue,
                              palette=palette,
                              **kwargs)
    if set_refline:
        f_grid.refline(x=0)
    if set_title:
        best = statistic_samples.info['best']
        f_grid.facet_axis(0, 0).set_title(f'Best: {best}')
    return f_grid


def performance_multiple_metrics(data: pd.DataFrame, gold: str,
                                 scores: List[dict],
                                 num_samples: int = 500, n_jobs: int = -1):
    """
    Calculate bootstrap samples of multiple performance metrics for a given dataset.

    Parameters:
    data (pd.DataFrame): Input dataset.
    gold (str): Column name of the ground truth or target variable.
    scores (List[dict]): A list of dictionaries, each containing:
        - "func": The performance score function.
        - "args" (optional): Arguments to pass to the score function.
        - "BiB": Whether the metric is Bigger is Better.
    num_samples (int, optional): Number of bootstrap samples. Defaults to 500.
    n_jobs (int, optional): Number of jobs to run in parallel. Defaults to -1.

    Returns:
    dict: A dictionary containing the results for each metric, including:
        - 'samples': Bootstrap samples of the performance scores.
        - 'performance': Calculated performance scores for each algorithm.
        - 'compg': General performance comparison metrics, including:
            - 'n': Number of samples.
            - 'm': Number of algorithms.
            - 'cv': Coefficient of variation for each metric.
            - 'dist': Distance metric for each metric.
            - 'PPI': Performance potential index for each metric.
        - 'BiB': Whether each metric is Bigger is Better.

    The function works as follows:
    1. Defines auxiliary functions for calculating additional performance metrics.
    2. Iterates over the list of score functions and their respective arguments.
    3. Initializes a StatisticSamples object for each score function.
    4. Calculates the performance scores for each column in the dataset (excluding the ground truth column).
    5. Computes additional performance metrics (CV, distance, PPI) for each score function.
    6. Compiles the results into a dictionary and returns it.

    Example usage:

    >>> from sklearn.metrics import accuracy_score, f1_score
    >>> import pandas as pd
    >>> from CompStats import performance_multiple_metrics
    >>> df = pd.read_csv('path/to/data.csv')
    >>> scores = [
    >>>     {"func": accuracy_score, "BiB": True},
    >>>     {"func": f1_score, "args": {"average": "weighted"}, "BiB": True}
    >>> ]
    >>> results = performance_multiple_metrics(df, gold='target', scores=scores, num_samples=1000)
    """
    results, performance_dict, perfo, dist, ccv, cppi, compg, cBiB = {}, {}, {}, {}, {}, {}, {}, {}
    n, m = data.shape
    # Define the helper functions for the summary metrics
    cv = lambda x: np.std(x, ddof=1) / np.mean(x) * 100
    dista = lambda x: np.abs(np.max(x) - np.median(x))
    ppi = lambda x: (1 - np.max(x)) * 100
    for score_info in scores:
        score_func = score_info["func"]
        score_args = score_info.get("args", {})
        score_BiB = score_info.get("BiB", True)  # Default to True if not specified
        # Prepare the StatisticSamples with the arguments specific to this metric
        statistic_samples = StatisticSamples(num_samples=num_samples, n_jobs=n_jobs, BiB=score_BiB)
        # Compute the metric for each bootstrap sample
        statistic_samples.statistic = statistic = lambda y_true, y_pred: score_func(y_true, y_pred, **score_args)
        metric_name = score_func.__name__ + ("" if not score_args else "_" + "_".join([f"{key}={value}" for key, value in score_args.items()]))
        results[metric_name] = {}
        perfo[metric_name] = {}
        for column in data.columns:
            if column == gold:
                continue
            results[metric_name][column] = statistic_samples(data[gold], data[column])
            perfo[metric_name][column] = statistic(data[gold], data[column])
        ccv[metric_name] = cv(np.array(list(perfo[metric_name].values())))
        dist[metric_name] = dista(np.array(list(perfo[metric_name].values())))
        cppi[metric_name] = ppi(np.array(list(perfo[metric_name].values())))
        cBiB[metric_name] = score_BiB
    compg = {'n': n,
             'm': m - 1,
             'cv': ccv,
             'dist': dist,
             'PPI': cppi}
    performance_dict = {'samples': results,
                        'performance': perfo,
                        'compg': compg,
                        'BiB': cBiB}
    return performance_dict


def plot_performance2(results: dict, CI: float=0.05,
                      var_name='Algorithm', value_name='Score',
                      capsize=0.2, linestyle='none', kind='point',
                      sharex=False, **kwargs):
    """
    Plot the performance with confidence intervals. This function is used by plot_difference_multiple.

    Parameters:
    results (dict): A dictionary where keys are algorithm names and values are lists of performance scores.
    CI (float, optional): Confidence interval level for error bars. Defaults to 0.05.
    var_name (str, optional): Variable name for the algorithms. Defaults to 'Algorithm'.
    value_name (str, optional): Variable name for the scores. Defaults to 'Score'.
    capsize (float, optional): Cap size for error bars. Defaults to 0.2.
    linestyle (str, optional): Line style for the plot. Defaults to 'none'.
    kind (str, optional): Type of the plot, e.g., 'point', 'bar'. Defaults to 'point'.
    sharex (bool, optional): Whether to share the x-axis among subplots. Defaults to False.
    **kwargs: Additional keyword arguments for seaborn.catplot.

    Returns:
    sns.axisgrid.FacetGrid: A seaborn FacetGrid object containing the plot.

    The function works as follows:
    1. If results is a dictionary, it sorts the algorithms by their mean performance scores.
    2. Converts the sorted data into a long format DataFrame.
    3. Computes the confidence intervals if CI is provided as a float.
    4. Uses seaborn's catplot to create and display the performance plot with confidence intervals.
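
    Example usage (an illustrative sketch, not taken from the library's tests: the
    scores below are made-up bootstrap samples, since this helper only needs a
    mapping from algorithm names to lists of samples):

    >>> from CompStats.performance import plot_performance2
    >>> results = {'alg1': [0.90, 0.92, 0.91], 'alg2': [0.85, 0.88, 0.86]}
    >>> f_grid = plot_performance2(results, CI=0.05)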
    """
    if isinstance(results, dict):
        lista_ordenada = sorted(results.items(), key=lambda x: np.mean(x[1]), reverse=True)
        diccionario_ordenado = {nombre: muestras for nombre, muestras in lista_ordenada}
        df2 = pd.DataFrame(diccionario_ordenado).melt(var_name=var_name,
                                                      value_name=value_name)

    # When CI is not a float, pass it through unchanged as seaborn's errorbar spec
    ci = CI
    if isinstance(CI, float):
        ci = lambda x: measurements.CI(x, alpha=CI)
    f_grid = sns.catplot(df2, x=value_name, y=var_name,
                         capsize=capsize, linestyle=linestyle,
                         kind=kind, errorbar=ci, sharex=sharex, **kwargs)
    return f_grid


def difference_multiple(results_dict, CI: float=0.05):
    """
    Calculate performance differences for multiple metrics, excluding the comparison of the best
    with itself. Additionally, identify the best performing algorithm for each metric.

    Parameters:
    results_dict (dict): The dictionary returned by performance_multiple_metrics; its 'samples'
                         entry maps metric names to dictionaries with algorithm names as keys
                         and lists of performance scores as values.
    CI (float, optional): Confidence interval level. Defaults to 0.05.

    Returns:
    dict: A copy of results_dict with an additional 'winner' entry. For each metric it stores
          the best performing algorithm, the differences of every other algorithm to the best,
          the confidence intervals and p-values of those differences, and the number of
          non-significant differences under several multiple-testing corrections
          ('none', 'bonferroni', 'holm', 'HB').

    The function works as follows:
    1. Iterates over each metric in the results dictionary.
    2. Converts performance scores to numpy arrays for efficient computations.
    3. Identifies the best performing algorithm for each metric based on the mean performance scores.
    4. Calculates the differences in performance scores relative to the best performing algorithm.
    5. Computes confidence intervals and p-values for these differences.
    6. Stores the differences, confidence intervals, p-values, and the best algorithm for each metric.
    7. Returns a dictionary with these calculated differences and additional information.

    Example usage (assuming the ground-truth column in DATA is named 'y'):

    >>> from CompStats import performance_multiple_metrics, difference_multiple
    >>> from CompStats.tests.test_performance import DATA
    >>> from sklearn.metrics import f1_score
    >>> import pandas as pd
    >>> df = pd.read_csv(DATA)
    >>> scores = [{"func": f1_score, "args": {"average": "weighted"}, "BiB": True}]
    >>> res = performance_multiple_metrics(df, gold='y', scores=scores)
    >>> diff_mult = difference_multiple(res, CI=0.05)
    """
    differences_dict = results_dict.copy()
    winner = {}
    alpha = CI
    for metric, results in results_dict['samples'].items():
        # Convert scores to arrays for vectorized operations
        scores_arrays = {alg: np.array(scores) for alg, scores in results.items()}
        # Identify the best performing algorithm (highest mean score when BiB, lowest otherwise)
        if results_dict['BiB'][metric]:
            best_alg = max(scores_arrays, key=lambda alg: np.mean(scores_arrays[alg]))
        else:
            best_alg = min(scores_arrays, key=lambda alg: np.mean(scores_arrays[alg]))
        best_scores = scores_arrays[best_alg]

        # Calculate differences to the best performing algorithm, excluding the best from comparing with itself
        differences = {alg: best_scores - scores for alg, scores in scores_arrays.items() if alg != best_alg}

        # Calculate the confidence interval and p-value for the differences to the best performing algorithm
        CI_differences = {alg: measurements.CI(np.array(scores), alpha=CI) for alg, scores in differences.items()}
        p_value_differences = {alg: measurements.difference_p_value(np.array(scores), BiB=results_dict['BiB'][metric]) for alg, scores in differences.items()}

        # Store the differences and the best algorithm under the current metric
        winner[metric] = {'best': best_alg, 'diff': differences, 'CI': CI_differences,
                          'p_value': p_value_differences,
                          'none': sum(valor > alpha for valor in p_value_differences.values()),
                          'bonferroni': sum(multipletests(list(p_value_differences.values()), method='bonferroni')[1] > alpha),
                          'holm': sum(multipletests(list(p_value_differences.values()), method='holm')[1] > alpha),
                          'HB': sum(multipletests(list(p_value_differences.values()), method='fdr_bh')[1] > alpha)}
    differences_dict['winner'] = winner
    return differences_dict


def plot_difference2(diff_dictionary: dict, CI: float = 0.05,
                     var_name='Comparison', value_name='Difference',
                     set_refline=True, set_title=True,
                     hue='Significant', palette=None, BiB: bool=True,
                     **kwargs):
    """Plot the difference in performance with its confidence intervals.

    diff_dictionary is one of the per-metric entries of the 'winner' dictionary produced by
    difference_multiple; it must provide the keys 'diff', 'CI', and 'best'. This function is
    used by plot_difference_multiple.

    Example usage (assuming the ground-truth column in DATA is named 'y'):

    >>> from CompStats import performance_multiple_metrics, difference_multiple
    >>> from CompStats.performance import plot_difference2
    >>> from CompStats.tests.test_performance import DATA
    >>> from sklearn.metrics import f1_score
    >>> import pandas as pd
    >>> df = pd.read_csv(DATA)
    >>> scores = [{"func": f1_score, "args": {"average": "weighted"}, "BiB": True}]
    >>> res = difference_multiple(performance_multiple_metrics(df, gold='y', scores=scores))
    >>> metric = next(iter(res['winner']))
    >>> ins = plot_difference2(res['winner'][metric])
    """
    if isinstance(diff_dictionary, dict):
        lista_ordenada = sorted(diff_dictionary['diff'].items(), key=lambda x: np.mean(x[1]), reverse=BiB)
        diccionario_ordenado = {nombre: muestras for nombre, muestras in lista_ordenada}
        df2 = pd.DataFrame(diccionario_ordenado).melt(var_name=var_name,
                                                      value_name=value_name)
    if hue is not None:
        df2[hue] = True
    at_least_one = False
    for key, (left, right) in diff_dictionary['CI'].items():
        if left < 0 < right:
            rows = df2[var_name] == key
            df2.loc[rows, hue] = False
            at_least_one = True
    # Only pick default colors when the caller did not provide a palette
    if palette is None:
        palette = ['r', 'b'] if at_least_one else ['b']
    f_grid = plot_performance(df2, CI=CI, var_name=var_name,
                              value_name=value_name, hue=hue,
                              palette=palette,
                              **kwargs)
    if set_refline:
        f_grid.refline(x=0)
    if set_title:
        best = diff_dictionary['best']
        f_grid.facet_axis(0, 0).set_title(f'Best: {best}')
    return f_grid


def plot_performance_multiple(results_dict: dict, CI: float = 0.05, capsize: float = 0.2,
                              linestyle: str = 'none', kind: str = 'point', **kwargs):
    """
    Create multiple performance plots, one for each performance metric in the results dictionary.

    Parameters:
    results_dict (dict): The dictionary returned by performance_multiple_metrics; its 'samples'
                         entry maps metric names to dictionaries with algorithm names as keys
                         and lists of performance scores as values.
    CI (float, optional): Confidence interval level for error bars. Defaults to 0.05.
    capsize (float, optional): Cap size for error bars. Defaults to 0.2.
    linestyle (str, optional): Line style for the plot. Defaults to 'none'.
    kind (str, optional): Type of the plot, e.g., 'point', 'bar'. Defaults to 'point'.
    **kwargs: Additional keyword arguments for seaborn.catplot.

    Returns:
    None: The function creates and displays the plots.

    The function works as follows:
    1. Iterates over each metric in the results dictionary.
    2. Converts each metric's results into a long format DataFrame sorted by mean performance.
    3. Uses seaborn's catplot to create and display the plot for each metric.
    4. Sets the title of each plot to the metric name.

    Example usage (assuming the ground-truth column in DATA is named 'y'):

    >>> from sklearn.metrics import accuracy_score, f1_score
    >>> import pandas as pd
    >>> from CompStats import performance_multiple_metrics, plot_performance_multiple
    >>> from CompStats.tests.test_performance import DATA
    >>> df = pd.read_csv(DATA)
    >>> scores = [
    >>>     {"func": accuracy_score, "BiB": True},
    >>>     {"func": f1_score, "args": {"average": "weighted"}, "BiB": True}
    >>> ]
    >>> results = performance_multiple_metrics(df, gold='y', scores=scores)
    >>> plot_performance_multiple(results, CI=0.05)
    """
    for metric_name, metric_results in results_dict['samples'].items():
        BiB = results_dict['BiB'].get(metric_name, True)
        # Convert results to long format DataFrame
        if isinstance(metric_results, dict):
            lista_ordenada = sorted(metric_results.items(), key=lambda x: np.mean(x[1]), reverse=BiB)
            diccionario_ordenado = {nombre: muestras for nombre, muestras in lista_ordenada}
            df2 = pd.DataFrame(diccionario_ordenado).melt(var_name='Algorithm',
                                                          value_name='Score')

        # Define the confidence interval function; a non-float CI is passed straight to seaborn
        ci = CI
        if isinstance(CI, float):
            ci = lambda x: measurements.CI(x, alpha=CI)

        # Create the plot
        g = sns.catplot(df2, x='Score', y='Algorithm', capsize=capsize, linestyle=linestyle,
                        kind=kind, errorbar=ci, **kwargs)

        # Set the title of the plot
        g.figure.suptitle(metric_name)

        # Display the plot
        plt.show()


def plot_difference_multiple(results_dict, CI=0.05, capsize=0.2, linestyle='none', kind='point', **kwargs):
    """
    Create multiple difference plots, one for each performance metric in the results dictionary.

    :param results_dict: The dictionary returned by difference_multiple; its 'winner' entry maps
                         metric names to the per-metric difference information plotted here.
    :param CI: Confidence interval level for error bars.
    :param capsize: Cap size for error bars.
    :param linestyle: Line style for the plot.
    :param kind: Type of the plot, e.g., 'point', 'bar'.
    :param kwargs: Additional keyword arguments forwarded to plot_difference2 (and from there to seaborn.catplot).
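
    Example usage (an illustrative sketch; it assumes the ground-truth column in the
    test file DATA is named 'y'):

    >>> from sklearn.metrics import f1_score
    >>> import pandas as pd
    >>> from CompStats import performance_multiple_metrics, difference_multiple
    >>> from CompStats.performance import plot_difference_multiple
    >>> from CompStats.tests.test_performance import DATA
    >>> df = pd.read_csv(DATA)
    >>> scores = [{"func": f1_score, "args": {"average": "weighted"}, "BiB": True}]
    >>> diffs = difference_multiple(performance_multiple_metrics(df, gold='y', scores=scores))
    >>> plot_difference_multiple(diffs, CI=0.05)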
    """
    for metric_name, metric_results in results_dict['winner'].items():
        BiB = results_dict['BiB'].get(metric_name, True)
        # Use plot_difference2 (and, underneath, seaborn's catplot) to create and show the plot
        g = plot_difference2(metric_results, BiB=BiB, CI=CI,
                             capsize=capsize, linestyle=linestyle, kind=kind, **kwargs)
        g.figure.suptitle(metric_name)
        # plt.show()




### Not used for now.
def plot_scatter_matrix(perf):
    """
    Generate a scatter plot matrix comparing the performance of the same algorithm
    across the different metrics contained in the 'perf' dictionary.

    :param perf: The dictionary returned by performance_multiple_metrics; its 'samples' entry maps
                 metric names to dictionaries with algorithm names as keys and lists of
                 performance scores as values.
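
    Example usage (an illustrative sketch; it assumes the ground-truth column in the
    test file DATA is named 'y'):

    >>> from sklearn.metrics import accuracy_score, f1_score
    >>> import pandas as pd
    >>> from CompStats import performance_multiple_metrics
    >>> from CompStats.performance import plot_scatter_matrix
    >>> from CompStats.tests.test_performance import DATA
    >>> df = pd.read_csv(DATA)
    >>> scores = [{"func": accuracy_score, "BiB": True},
    >>>           {"func": f1_score, "args": {"average": "weighted"}, "BiB": True}]
    >>> plot_scatter_matrix(performance_multiple_metrics(df, gold='y', scores=scores))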
    """
    # Convert 'perf' into a pandas DataFrame to ease the manipulation
    df_long = pd.DataFrame([
        {"Metric": metric, "Algorithm": alg, "Score": score, "Indice": i}
        for metric, alg_scores in perf['samples'].items()
        for alg, scores in alg_scores.items()
        for i, score in enumerate(scores)
        ])
    df_wide = df_long.pivot(index=['Algorithm', 'Indice'], columns='Metric', values='Score')
    df_wide = df_wide.reset_index(level=[0])
    sns.pairplot(df_wide, diag_kind='kde', hue="Algorithm", corner=True)
    plt.suptitle('Scatter Plot Matrix of Algorithms Performance Across Different Metrics', y=1.02)
    plt.show()


def all_differences_multiple(results_dict, alpha: float=0.05):
    """
    Calculate performance differences for every unique pair of algorithms for multiple metrics.
    Also computes a p-value for each difference and counts how many comparisons are not
    significant under several multiple-testing corrections.

    :param results_dict: The dictionary returned by performance_multiple_metrics; its 'samples'
                         entry maps metric names to dictionaries with algorithm names as keys
                         and lists of performance scores as values.
    :param alpha: Significance level used to count non-significant comparisons.
    :return: A copy of results_dict with an additional 'all' entry. For each metric it stores
             the differences for every unique pair of algorithms, their p-values, and the number
             of non-significant comparisons under the 'none', 'bonferroni', 'holm', and 'HB'
             corrections.
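
    Example usage (an illustrative sketch; it assumes the ground-truth column in the
    test file DATA is named 'y'):

    >>> from sklearn.metrics import f1_score
    >>> import pandas as pd
    >>> from CompStats import performance_multiple_metrics
    >>> from CompStats.performance import all_differences_multiple
    >>> from CompStats.tests.test_performance import DATA
    >>> df = pd.read_csv(DATA)
    >>> scores = [{"func": f1_score, "args": {"average": "weighted"}, "BiB": True}]
    >>> res = performance_multiple_metrics(df, gold='y', scores=scores)
    >>> all_diff = all_differences_multiple(res, alpha=0.05)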
    """
    differences_dict = results_dict.copy()
    all = {}
    for metric, results in results_dict['samples'].items():
        # Convert scores to arrays for vectorized operations
        scores_arrays = {alg: np.array(scores) for alg, scores in results.items()}
        scores_arrays = dict(sorted(scores_arrays.items(), key=lambda item: np.mean(item[1]), reverse=results_dict['BiB'][metric]))

        differences = {}
        p_value_differences = {}

        algorithms = list(scores_arrays.keys())
        # Calculate differences for unique pairs of algorithms
        for i, alg_a in enumerate(algorithms):
            for alg_b in algorithms[i+1:]:  # Start from the next algorithm to avoid duplicate comparisons
                # Calculate the difference between alg_a and alg_b
                diff = scores_arrays[alg_a] - scores_arrays[alg_b]
                differences[f"{alg_a} vs {alg_b}"] = diff

                # Compute the p-value of the difference between alg_a and alg_b
                p_value_differences[f"{alg_a} vs {alg_b}"] = measurements.difference_p_value(diff, BiB=results_dict['BiB'][metric])

        # Store the differences under the current metric
        all[metric] = {'diff': differences, 'p_value': p_value_differences,
                       'none': sum(valor > alpha for valor in p_value_differences.values()),
                       'bonferroni': sum(multipletests(list(p_value_differences.values()), method='bonferroni')[1] > alpha),
                       'holm': sum(multipletests(list(p_value_differences.values()), method='holm')[1] > alpha),
                       'HB': sum(multipletests(list(p_value_differences.values()), method='fdr_bh')[1] > alpha)}
    differences_dict['all'] = all
    return differences_dict