igmhub / picca, Pull Request #1038: "P1d nan fix" (merge a89079702 into db33e4e74, build 5878749098)
Coverage, 16 Aug 2023 12:28PM UTC: 67.212% overall (-0.02% from 67.235%); 6097 of 8712 relevant lines covered (69.98%); 1756 of 2972 branches covered (59.08%, included in aggregate %); 8 of 8 new or added lines in 1 file covered (100.0%); 2.1 hits per line.

Source file: /py/picca/pk1d/postproc_pk1d.py

"""Module defining a set of functions to postprocess files produced by compute_pk1d.py.

This module provides 3 main functions:
    - read_pk1d:
        Read all HDUs in an individual "P1D" FITS file and stack
        all data in one table
    - compute_mean_pk1d:
        Compute the mean P1D in a given (z,k) grid of bins, from individual
        "P1Ds" of individual chunks
    - run_postproc_pk1d:
        Main function, runs read_pk1d in parallel, then runs compute_mean_pk1d
See the respective documentation for details.
"""

import os
import glob
from multiprocessing import Pool

from functools import partial
import numpy as np
from scipy.stats import binned_statistic
from scipy.optimize import curve_fit
import fitsio
from astropy.table import Table, vstack
from astropy.stats import bootstrap

from picca.constants import SPEED_LIGHT
from picca.constants import ABSORBER_IGM
from picca.utils import userprint
from picca.pk1d.utils import MEANPK_FITRANGE_SNR, fitfunc_variance_pk1d

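# Overview sketch (illustrative only): the three steps that run_postproc_pk1d automates
# can also be chained by hand. The file pattern and bin edges below are assumptions made
# for this example, not values defined by this module.
#
#     files = glob.glob("pk1d_outputs/Pk1D*.fits.gz")
#     kbin_edges = np.arange(0.0, 0.02, 0.0005)   # hypothetical, Angstrom^-1
#     zbin_edges = np.arange(2.1, 4.7, 0.2)       # hypothetical
#     outputs = [read_pk1d(f, kbin_edges) for f in files]
#     outputs = [out for out in outputs if out is not None]
#     p1d_table = vstack([out[0] for out in outputs])
#     z_array = np.concatenate([out[1] for out in outputs])
#     mean_p1d, metadata, cov = compute_mean_pk1d(
#         p1d_table, z_array, zbin_edges, kbin_edges, "no_weights"
#     )
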
def read_pk1d(filename, kbin_edges, snrcut=None, zbins_snrcut=None):
    """Read Pk1D data from a single file.

    Arguments
    ---------
    filename: string
    Fits file, containing individual "P1D" for each chunk

    kbin_edges: array of floats
    Edges of the wavenumber bins to be later used, in Angstrom^-1

    snrcut: array of floats or None
    Chunks with mean SNR < snrcut are discarded. If len(snrcut)>1,
    zbins_snrcut must be set, so that the cut is made redshift dependent.

    zbins_snrcut: array of floats or None
    Required if len(snrcut)>1. List of redshifts
    associated to the list of snr cuts.

    Return
    ------
    p1d_table: Table
    One entry per mode(k) per chunk

    z_array: array of floats
    Mean z of each chunk (Nchunks entries).
    If no chunk is selected, None will be returned instead
    """
    p1d_table = []
    z_array = []
    with fitsio.FITS(filename, "r") as hdus:
        for i, hdu in enumerate(hdus[1:]):
            data = hdu.read()
            chunk_header = hdu.read_header()
            chunk_table = Table(data)
            for colname in [
                "k",
                "Pk",
                "Pk_raw",
                "Pk_noise",
                "Pk_diff",
                "cor_reso",
                "Pk_noise_miss",
            ]:
                try:
                    chunk_table.rename_column(colname.upper(), colname)
                except KeyError:
                    pass

            if np.nansum(chunk_table["Pk"]) == 0:
                chunk_table["Pk"] = (
                    chunk_table["Pk_raw"] - chunk_table["Pk_noise"]
                ) / chunk_table["cor_reso"]

            chunk_table["forest_z"] = float(chunk_header["MEANZ"])
            chunk_table["forest_snr"] = float(chunk_header["MEANSNR"])
            chunk_table["forest_id"] = int(chunk_header["LOS_ID"])
            if "CHUNK_ID" in chunk_header:
                chunk_table[
                    "sub_forest_id"
                ] = f"{chunk_header['LOS_ID']}_{chunk_header['CHUNK_ID']}"

            if snrcut is not None:
                if len(snrcut) > 1:
                    if (zbins_snrcut is None) or (len(zbins_snrcut) != len(snrcut)):
                        raise ValueError(
                            "Please provide same size for zbins_snrcut and snrcut arrays"
                        )
                    zbin_index = np.argmin(np.abs(zbins_snrcut - chunk_header["MEANZ"]))
                    snrcut_chunk = snrcut[zbin_index]
                else:
                    snrcut_chunk = snrcut[0]
                if chunk_header["MEANSNR"] < snrcut_chunk:
                    continue

            # Empirically remove very noisy chunks
            (wk,) = np.where(chunk_table["k"] < kbin_edges[-1])
            if (
                chunk_table["Pk_noise"][wk] > 1000000 * chunk_table["Pk_raw"][wk]
            ).any():
                userprint(
                    f"file {filename} hdu {i+1} has very high noise power: discarded"
                )
                continue

            p1d_table.append(chunk_table)
            z_array.append(float(chunk_header["MEANZ"]))

    if len(p1d_table) == 0:  # No chunk was selected
        return None

    p1d_table = vstack(p1d_table)
    p1d_table["Delta2"] = p1d_table["k"] * p1d_table["Pk"] / np.pi
    p1d_table["Pk_norescor"] = p1d_table["Pk_raw"] - p1d_table["Pk_noise"]
    p1d_table["Pk_nonoise"] = p1d_table["Pk_raw"] / p1d_table["cor_reso"]
    p1d_table["Pk_noraw"] = p1d_table["Pk_noise"] / p1d_table["cor_reso"]
    try:
        p1d_table["Pk_noraw_miss"] = p1d_table["Pk_noise_miss"] / p1d_table["cor_reso"]
    except KeyError:
        pass

    z_array = np.array(z_array)

    return p1d_table, z_array

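# Usage sketch for read_pk1d (illustrative only): read one P1D file with a
# redshift-dependent SNR cut. The file name and cut values are assumptions.
#
#     kbin_edges = np.arange(0.0, 0.02, 0.0005)   # Angstrom^-1, hypothetical
#     result = read_pk1d(
#         "Pk1D-0.fits.gz",
#         kbin_edges,
#         snrcut=np.array([2.0, 3.0]),
#         zbins_snrcut=np.array([2.2, 3.0]),
#     )
#     if result is not None:
#         p1d_table, z_array = result
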
def compute_mean_pk1d(
    p1d_table,
    z_array,
    zbin_edges,
    kbin_edges,
    weight_method,
    apply_z_weights=False,
    nomedians=False,
    velunits=False,
    output_snrfit=None,
    compute_covariance=False,
    compute_bootstrap=False,
    number_bootstrap=50,
    number_worker=8,
):
    """Compute mean P1D in a set of given (z,k) bins, from individual chunks P1Ds.

    Arguments
    ---------
    p1d_table: astropy.table.Table
    Individual Pk1Ds of the contributing forest chunks, stacked in one table using "read_pk1d".
    Contains 'k', 'Pk_raw', 'Pk_noise', 'Pk_diff', 'cor_reso', 'Pk', 'forest_z', 'forest_snr',
            'Delta2', 'Pk_norescor', 'Pk_nonoise', 'Pk_noraw'

    z_array: Array of float
    Mean z of each contributing chunk, stacked in one array using "read_pk1d"

    zbin_edges: Array of float
    Edges of the redshift bins we want to use

    kbin_edges: Array of float
    Edges of the wavenumber bins we want to use, either in (Angstrom)-1 or s/km

    weight_method: string
    2 possible options:
        'fit_snr': Compute mean P1D with weights estimated by fitting dispersion vs SNR
        'no_weights': Compute mean P1D without weights

    apply_z_weights: Bool
    If True, each chunk contributes to the two nearest redshift bins with a linear weighting scheme.

    nomedians: Bool
    Skip computation of median quantities

    velunits: Bool
    Compute P1D in velocity units by converting k on-the-fly from AA-1 to s/km

    output_snrfit: string
    If weight_method='fit_snr', the results of the fit can be saved to an ASCII file.
    The file contains (z k a b standard_dev_points) for the "Pk" variable, for each (z,k) point

    compute_covariance: Bool
    If True, compute statistical covariance matrix of the mean P1D.
    Requires p1d_table to contain 'sub_forest_id', since correlations are computed
    within individual forests.

    compute_bootstrap: Bool
    If True, compute statistical covariance using a simple bootstrap method.

    number_bootstrap: int
    Number of bootstrap samples used if compute_bootstrap is True.

    number_worker: int
    Calculations of mean P1Ds and covariances are run in parallel over redshift bins.

    Return
    ------
    meanP1d_table: astropy.table.Table
    One row per (z,k) bin; one column per statistic (e.g. meanPk, errorPk_noise...)
    Other columns: 'N' (nb of chunks used), 'index_zbin' (index of associated
    row in metadata_table), 'zbin'

    metadata_table: astropy.table.Table
    One row per z bin; column values z_min/max, k_min/max, N_chunks
    """
    # Initializing stats we want to compute on data
    stats_array = ["mean", "error", "min", "max"]
    if not nomedians:
        stats_array += ["median"]

    p1d_table_cols = p1d_table.colnames
    p1d_table_cols.remove("forest_id")
    if "sub_forest_id" in p1d_table_cols:
        p1d_table_cols.remove("sub_forest_id")

    # Convert data into velocity units
    if velunits:
        conversion_factor = (
            ABSORBER_IGM["LYA"] * (1.0 + p1d_table["forest_z"])
        ) / SPEED_LIGHT
        p1d_table["k"] *= conversion_factor
        for col in p1d_table_cols:
            if "Pk" in col:
                p1d_table[col] /= conversion_factor

    # Initialize mean_p1d_table of len = (nzbins * nkbins) corresponding to hdu[1] in final output
    mean_p1d_table = Table()
    nbins_z, nbins_k = len(zbin_edges) - 1, len(kbin_edges) - 1
    mean_p1d_table["zbin"] = np.zeros(nbins_z * nbins_k)
    mean_p1d_table["index_zbin"] = np.zeros(nbins_z * nbins_k, dtype=int)
    mean_p1d_table["N"] = np.zeros(nbins_z * nbins_k, dtype=int)
    for col in p1d_table_cols:
        for stats in stats_array:
            mean_p1d_table[stats + col] = np.ones(nbins_z * nbins_k) * np.nan

    # Initialize metadata_table of len = nbins_z corresponding to hdu[2] in final output
    metadata_table = Table()
    metadata_table["z_min"] = zbin_edges[:-1]
    metadata_table["z_max"] = zbin_edges[1:]
    metadata_table["k_min"] = kbin_edges[0] * np.ones(nbins_z)
    metadata_table["k_max"] = kbin_edges[-1] * np.ones(nbins_z)

    if compute_covariance or compute_bootstrap:
        if "sub_forest_id" not in p1d_table.columns:
            userprint(
                """sub_forest_id cannot be computed from individual pk files,
                but is necessary to compute the covariance. Skipping calculation"""
            )
            compute_covariance, compute_bootstrap = False, False
            cov_table = None

        elif apply_z_weights:
            userprint(
                """Covariance calculations are not compatible with redshift weighting yet.
                Skipping calculation"""
            )
            compute_covariance, compute_bootstrap = False, False
            cov_table = None

        else:
            # Initialize cov_table of len = (nzbins * nkbins * nkbins)
            # corresponding to hdu[3] in final output
            cov_table = Table()
            cov_table["zbin"] = np.zeros(nbins_z * nbins_k * nbins_k)
            cov_table["index_zbin"] = np.zeros(nbins_z * nbins_k * nbins_k, dtype=int)
            cov_table["N"] = np.zeros(nbins_z * nbins_k * nbins_k, dtype=int)
            cov_table["covariance"] = np.zeros(nbins_z * nbins_k * nbins_k)
            cov_table["k1"] = np.zeros(nbins_z * nbins_k * nbins_k)
            cov_table["k2"] = np.zeros(nbins_z * nbins_k * nbins_k)

            if compute_bootstrap:
                cov_table["boot_covariance"] = np.zeros(nbins_z * nbins_k * nbins_k)
                cov_table["error_boot_covariance"] = np.zeros(
                    nbins_z * nbins_k * nbins_k
                )

            k_index = np.full(len(p1d_table["k"]), -1, dtype=int)
            for ikbin, _ in enumerate(kbin_edges[:-1]):  # First loop 1) k bins
                select = (p1d_table["k"] < kbin_edges[ikbin + 1]) & (
                    p1d_table["k"] > kbin_edges[ikbin]
                )  # select a specific k bin
                k_index[select] = ikbin
    else:
        cov_table = None

    # Number of chunks in each redshift bin
    n_chunks, _, _ = binned_statistic(
        z_array, z_array, statistic="count", bins=zbin_edges
    )
    metadata_table["N_chunks"] = n_chunks

    zbin_centers = np.around((zbin_edges[1:] + zbin_edges[:-1]) / 2, 5)
    if weight_method == "fit_snr":
        snrfit_table = np.zeros(
            (nbins_z * nbins_k, 13)
        )  # 13 entries: z k a b + 9 SNR bins used for the fit
    else:
        snrfit_table = None

    userprint("Computing average p1d")
    # Main loop 1) z bins
    params_pool = [[izbin] for izbin, _ in enumerate(zbin_edges[:-1])]

    func = partial(
        compute_average_pk_redshift,
        p1d_table,
        p1d_table_cols,
        weight_method,
        apply_z_weights,
        nomedians,
        nbins_z,
        zbin_centers,
        zbin_edges,
        n_chunks,
        nbins_k,
        kbin_edges,
    )
    if number_worker == 1:
        output_mean = [func(p[0]) for p in params_pool]
    else:
        with Pool(number_worker) as pool:
            output_mean = pool.starmap(func, params_pool)

    fill_average_pk(
        nbins_z,
        nbins_k,
        output_mean,
        mean_p1d_table,
        p1d_table_cols,
        weight_method,
        snrfit_table,
        nomedians,
    )

    if compute_covariance:
        userprint("Computing covariance matrix")
        params_pool = []
        # Main loop 1) z bins
        for izbin in range(nbins_z):
            select_z = (p1d_table["forest_z"] < zbin_edges[izbin + 1]) & (
                p1d_table["forest_z"] > zbin_edges[izbin]
            )
            sub_forest_ids = np.unique(p1d_table["sub_forest_id"][select_z])
            params_pool.append([izbin, select_z, sub_forest_ids])

        func = partial(
            compute_cov,
            p1d_table,
            mean_p1d_table,
            zbin_centers,
            n_chunks,
            kbin_edges,
            k_index,
            nbins_k,
            weight_method,
            snrfit_table,
        )
        if number_worker == 1:
            output_cov = [func(*p) for p in params_pool]
        else:
            with Pool(number_worker) as pool:
                output_cov = pool.starmap(func, params_pool)
        # Main loop 1) z bins
        for izbin in range(nbins_z):
            (
                zbin_array,
                index_zbin_array,
                n_array,
                covariance_array,
                k1_array,
                k2_array,
            ) = (*output_cov[izbin],)
            i_min = izbin * nbins_k * nbins_k
            i_max = (izbin + 1) * nbins_k * nbins_k
            cov_table["zbin"][i_min:i_max] = zbin_array
            cov_table["index_zbin"][i_min:i_max] = index_zbin_array
            cov_table["N"][i_min:i_max] = n_array
            cov_table["covariance"][i_min:i_max] = covariance_array
            cov_table["k1"][i_min:i_max] = k1_array
            cov_table["k2"][i_min:i_max] = k2_array

    if compute_bootstrap:
        userprint("Computing covariance matrix with bootstrap method")

        params_pool = []
        # Main loop 1) z bins
        for izbin in range(nbins_z):
            select_z = (p1d_table["forest_z"] < zbin_edges[izbin + 1]) & (
                p1d_table["forest_z"] > zbin_edges[izbin]
            )

            sub_forest_ids = np.unique(p1d_table["sub_forest_id"][select_z])
            if sub_forest_ids.size > 0:
                bootid = np.array(
                    bootstrap(np.arange(sub_forest_ids.size), number_bootstrap)
                ).astype(int)
            else:
                bootid = np.full(number_bootstrap, None)

            # Main loop 2) number of bootstrap samples
            for iboot in range(number_bootstrap):
                params_pool.append([izbin, select_z, sub_forest_ids[bootid[iboot]]])

        func = partial(
            compute_cov,
            p1d_table,
            mean_p1d_table,
            zbin_centers,
            n_chunks,
            kbin_edges,
            k_index,
            nbins_k,
            weight_method,
            snrfit_table,
        )
        if number_worker == 1:
            output_cov = [func(*p) for p in params_pool]
        else:
            with Pool(number_worker) as pool:
                output_cov = pool.starmap(func, params_pool)
        # Main loop 1) z bins
        for izbin in range(nbins_z):
            boot_cov = []
            # Main loop 2) number of bootstrap samples
            for iboot in range(number_bootstrap):
                (
                    zbin_array,
                    index_zbin_array,
                    n_array,
                    covariance_array,
                    k1_array,
                    k2_array,
                ) = (*output_cov[izbin * number_bootstrap + iboot],)
                boot_cov.append(covariance_array)

            i_min = izbin * nbins_k * nbins_k
            i_max = (izbin + 1) * nbins_k * nbins_k
            cov_table["boot_covariance"][i_min:i_max] = np.mean(boot_cov, axis=0)
            cov_table["error_boot_covariance"][i_min:i_max] = np.std(boot_cov, axis=0)

    if output_snrfit is not None:
        np.savetxt(
            output_snrfit,
            snrfit_table,
            fmt="%.5e",
            header="Result of fit: Variance(Pks) vs SNR\n"
            "SNR bin edges used: 1,  2,  3,  4,  5,  6,  7,  8,  9, 10\n"
            "z k a b standard_dev_points",
        )

    return mean_p1d_table, metadata_table, cov_table

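# Usage sketch for compute_mean_pk1d (illustrative only): average a stacked table
# produced by read_pk1d. The bin edges below are hypothetical; see run_postproc_pk1d
# for the full driver.
#
#     mean_p1d_table, metadata_table, cov_table = compute_mean_pk1d(
#         p1d_table,
#         z_array,
#         zbin_edges=np.arange(2.1, 4.7, 0.2),
#         kbin_edges=np.arange(0.0, 0.02, 0.0005),
#         weight_method="fit_snr",
#         compute_covariance=True,
#         number_worker=4,
#     )
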
def fill_average_pk(
    nbins_z,
    nbins_k,
    output_mean,
    mean_p1d_table,
    p1d_table_cols,
    weight_method,
    snrfit_table,
    nomedians,
):
    """Fill the average P1D table for all redshift and k bins.

    Arguments
    ---------
    nbins_z: int,
    Number of redshift bins.

    nbins_k: int,
    Number of k bins.

    output_mean: tuple of numpy ndarray,
    Result of the mean calculation.

    mean_p1d_table: numpy ndarray,
    Table to fill.

    p1d_table_cols: List of str,
    Column names in the input table to be averaged.

    weight_method: string
    2 possible options:
        'fit_snr': Compute mean P1D with weights estimated by fitting dispersion vs SNR
        'no_weights': Compute mean P1D without weights

    snrfit_table: numpy ndarray,
    Table containing SNR fit info

    nomedians: bool,
    If True, do not use median values in the fit to the SNR.

    Return
    ------
    None

    """
    for izbin in range(nbins_z):  # Main loop 1) z bins
        (
            zbin_array,
            index_zbin_array,
            n_array,
            mean_array,
            error_array,
            min_array,
            max_array,
            median_array,
            snrfit_array,
        ) = (*output_mean[izbin],)
        i_min = izbin * nbins_k
        i_max = (izbin + 1) * nbins_k

        mean_p1d_table["zbin"][i_min:i_max] = zbin_array
        mean_p1d_table["index_zbin"][i_min:i_max] = index_zbin_array
        mean_p1d_table["N"][i_min:i_max] = n_array
        for icol, col in enumerate(p1d_table_cols):
            mean_p1d_table["mean" + col][i_min:i_max] = mean_array[icol]
            mean_p1d_table["error" + col][i_min:i_max] = error_array[icol]
            mean_p1d_table["min" + col][i_min:i_max] = min_array[icol]
            mean_p1d_table["max" + col][i_min:i_max] = max_array[icol]
            if not nomedians:
                mean_p1d_table["median" + col][i_min:i_max] = median_array[icol]
        if weight_method == "fit_snr":
            snrfit_table[i_min:i_max, :] = snrfit_array

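# Layout note (sketch, names illustrative): mean_p1d_table is flat, with nbins_k
# consecutive rows per redshift bin, so the row for (izbin, ikbin) can be recovered as:
#
#     row = mean_p1d_table[izbin * nbins_k + ikbin]
#     # row["zbin"], row["meanPk"], row["errorPk"], ...
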
def compute_average_pk_redshift(
    p1d_table,
    p1d_table_cols,
    weight_method,
    apply_z_weights,
    nomedians,
    nbins_z,
    zbin_centers,
    zbin_edges,
    n_chunks,
    nbins_k,
    kbin_edges,
    izbin,
):
    """Compute the average P1D table for the given redshift and k bins.

    The function computes the mean P1D table for each redshift and k bin.
    If there are no chunks in a given bin, the rows in the
    table for that bin will be filled with NaNs.
    The mean value for each bin is calculated using a weighting method,
    either a fit to the SNR or using weights based on the redshift.

    Arguments
    ---------
    p1d_table: numpy ndarray,
    Table containing the data to be averaged.

    p1d_table_cols: List of str,
    Column names in the input table to be averaged.

    weight_method: str,
    Method to weight the data.

    apply_z_weights: bool,
    If True, apply redshift weights.

    nomedians: bool,
    If True, do not use median values in the fit to the SNR.

    nbins_z: int,
    Number of redshift bins.

    zbin_centers: numpy ndarray,
    Centers of the redshift bins.

    zbin_edges: numpy ndarray,
    Edges of the redshift bins.

    n_chunks: numpy ndarray,
    Number of chunks in each redshift bin.

    nbins_k: int,
    Number of k bins.

    kbin_edges: numpy ndarray,
    Edges of the k bins.

    izbin: int,
    Index of the current redshift bin.

    Return
    ------
    zbin_array
    index_zbin_array
    n_array
    mean_array
    error_array
    min_array
    max_array
    median_array
    snrfit_array
    """
    n_array = np.zeros(nbins_k, dtype=int)
    zbin_array = np.zeros(nbins_k)
    index_zbin_array = np.zeros(nbins_k, dtype=int)
    mean_array = []
    error_array = []
    min_array = []
    max_array = []
    if not nomedians:
        median_array = []
    else:
        median_array = None
    if weight_method == "fit_snr":
        snrfit_array = np.zeros((nbins_k, 13))
    else:
        snrfit_array = None

    for col in p1d_table_cols:
        mean_array.append(np.zeros(nbins_k))
        error_array.append(np.zeros(nbins_k))
        min_array.append(np.zeros(nbins_k))
        max_array.append(np.zeros(nbins_k))
        if not nomedians:
            median_array.append(np.zeros(nbins_k))

    if n_chunks[izbin] == 0:  # Fill rows with NaNs
        zbin_array[:] = zbin_centers[izbin]
        index_zbin_array[:] = izbin
        n_array[:] = 0
        for icol, col in enumerate(p1d_table_cols):
            mean_array[icol][:] = np.nan
            error_array[icol][:] = np.nan
            min_array[icol][:] = np.nan
            max_array[icol][:] = np.nan
            if not nomedians:
                median_array[icol][:] = np.nan
        if weight_method == "fit_snr":
            snrfit_array[:] = np.nan

        return (
            zbin_array,
            index_zbin_array,
            n_array,
            mean_array,
            error_array,
            min_array,
            max_array,
            median_array,
            snrfit_array,
        )

    for ikbin, kbin in enumerate(kbin_edges[:-1]):  # Main loop 2) k bins
        if apply_z_weights:  # special chunk selection in that case
            delta_z = zbin_centers[1:] - zbin_centers[:-1]
            if not np.allclose(delta_z, delta_z[0], atol=1.0e-3):
                raise ValueError(
                    "z bins should have equal widths with apply_z_weights."
                )
            delta_z = delta_z[0]

            select = (p1d_table["k"] < kbin_edges[ikbin + 1]) & (
                p1d_table["k"] > kbin_edges[ikbin]
            )
            if izbin in (0, nbins_z - 1):
                # First and last bin: in order to avoid edge effects,
                #    use only chunks within the bin
                select = (
                    select
                    & (p1d_table["forest_z"] > zbin_edges[izbin])
                    & (p1d_table["forest_z"] < zbin_edges[izbin + 1])
                )
            else:
                select = (
                    select
                    & (p1d_table["forest_z"] < zbin_centers[izbin + 1])
                    & (p1d_table["forest_z"] > zbin_centers[izbin - 1])
                )

            redshift_weights = (
                1.0
                - np.abs(p1d_table["forest_z"][select] - zbin_centers[izbin]) / delta_z
            )

        else:
            select = (
                (p1d_table["forest_z"] < zbin_edges[izbin + 1])
                & (p1d_table["forest_z"] > zbin_edges[izbin])
                & (p1d_table["k"] < kbin_edges[ikbin + 1])
                & (p1d_table["k"] > kbin_edges[ikbin])
            )  # select a specific (z,k) bin

        zbin_array[ikbin] = zbin_centers[izbin]
        index_zbin_array[ikbin] = izbin

        # Counts the number of chunks in each (z,k) bin
        num_chunks = np.ma.count(p1d_table["k"][select])

        n_array[ikbin] = num_chunks

        if weight_method == "fit_snr":
            if num_chunks == 0:
                userprint(
                    "Warning: 0 chunks found in bin "
                    + str(zbin_edges[izbin])
                    + "<z<"
                    + str(zbin_edges[izbin + 1])
                    + ", "
                    + str(kbin_edges[ikbin])
                    + "<k<"
                    + str(kbin_edges[ikbin + 1])
                )
                continue
            snr_bin_edges = np.arange(
                MEANPK_FITRANGE_SNR[0], MEANPK_FITRANGE_SNR[1] + 1, 1
            )
            snr_bins = (snr_bin_edges[:-1] + snr_bin_edges[1:]) / 2

            p1d_values = p1d_table["Pk"][select]
            data_snr = p1d_table["forest_snr"][select]
            mask_nan_p1d_values = (~np.isnan(p1d_values)) & (~np.isnan(data_snr))
            data_snr, p1d_values = (
                data_snr[mask_nan_p1d_values],
                p1d_values[mask_nan_p1d_values],
            )

            standard_dev, _, _ = binned_statistic(
                data_snr, p1d_values, statistic="std", bins=snr_bin_edges
            )
            standard_dev_full = np.copy(standard_dev)
            standard_dev, snr_bins = (
                standard_dev[~np.isnan(standard_dev)],
                snr_bins[~np.isnan(standard_dev)],
            )
            coef, *_ = curve_fit(
                fitfunc_variance_pk1d,
                snr_bins,
                standard_dev**2,
                bounds=(0, np.inf),
            )
            data_snr[data_snr > MEANPK_FITRANGE_SNR[1]] = MEANPK_FITRANGE_SNR[1]
            data_snr[data_snr < 1.01] = 1.01
            variance_estimated = fitfunc_variance_pk1d(data_snr, *coef)
            weights = 1.0 / variance_estimated

        for icol, col in enumerate(p1d_table_cols):
            if num_chunks == 0:
                userprint(
                    "Warning: 0 chunks found in bin "
                    + str(zbin_edges[izbin])
                    + "<z<"
                    + str(zbin_edges[izbin + 1])
                    + ", "
                    + str(kbin_edges[ikbin])
                    + "<k<"
                    + str(kbin_edges[ikbin + 1])
                )
                continue

            if weight_method == "fit_snr":
                snr_bin_edges = np.arange(
                    MEANPK_FITRANGE_SNR[0], MEANPK_FITRANGE_SNR[1] + 1, 1
                )
                snr_bins = (snr_bin_edges[:-1] + snr_bin_edges[1:]) / 2

                data_values = p1d_table[col][select]
                data_values = data_values[mask_nan_p1d_values]
                # Fit function to observed dispersion in col:
                standard_dev_col, _, _ = binned_statistic(
                    data_snr, data_values, statistic="std", bins=snr_bin_edges
                )
                standard_dev_col, snr_bins = (
                    standard_dev_col[~np.isnan(standard_dev_col)],
                    snr_bins[~np.isnan(standard_dev_col)],
                )

                coef_col, *_ = curve_fit(
                    fitfunc_variance_pk1d,
                    snr_bins,
                    standard_dev_col**2,
                    bounds=(0, np.inf),
                )

                # Model variance from fit function
                data_snr[data_snr > MEANPK_FITRANGE_SNR[1]] = MEANPK_FITRANGE_SNR[1]
                data_snr[data_snr < 1.01] = 1.01
                variance_estimated_col = fitfunc_variance_pk1d(data_snr, *coef_col)
                weights_col = 1.0 / variance_estimated_col
                if apply_z_weights:
                    weights *= redshift_weights
                mean = np.average(data_values, weights=weights)
                if apply_z_weights:
                    # Analytic expression for the re-weighted average:
                    error = np.sqrt(np.sum(weights_col * redshift_weights)) / np.sum(
                        weights_col
                    )
                else:
                    error = np.sqrt(1.0 / np.sum(weights_col))
                if col == "Pk":
                    standard_dev = np.concatenate(
                        [
                            standard_dev,
                            np.full(len(standard_dev[np.isnan(standard_dev)]), np.nan),
                        ]
                    )
                    snrfit_array[ikbin, 0:4] = [
                        zbin_centers[izbin],
                        (kbin + kbin_edges[ikbin + 1]) / 2.0,
                        coef[0],
                        coef[1],
                    ]
                    snrfit_array[ikbin, 4:] = standard_dev_full  # also save nan values

            elif weight_method == "no_weights":
                if apply_z_weights:
                    mean = np.average(p1d_table[col][select], weights=redshift_weights)
                    # simple analytic expression:
                    error = np.std(p1d_table[col][select]) * (
                        np.sqrt(np.sum(redshift_weights**2))
                        / np.sum(redshift_weights)
                    )
                else:
                    mean = np.mean(p1d_table[col][select])
                    # unbiased estimate: num_chunks-1
                    error = np.std(p1d_table[col][select]) / np.sqrt(num_chunks - 1)

            else:
                raise ValueError("Option for 'weight_method' argument not found")

            minimum = np.min((p1d_table[col][select]))
            maximum = np.max((p1d_table[col][select]))
            mean_array[icol][ikbin] = mean
            error_array[icol][ikbin] = error
            min_array[icol][ikbin] = minimum
            max_array[icol][ikbin] = maximum
            if not nomedians:
                median = np.median((p1d_table[col][select]))
                median_array[icol][ikbin] = median
    return (
        zbin_array,
        index_zbin_array,
        n_array,
        mean_array,
        error_array,
        min_array,
        max_array,
        median_array,
        snrfit_array,
    )

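# Weighting sketch for weight_method="fit_snr" (illustrative only): the per-chunk
# scatter is modelled as a function of mean forest SNR, and each chunk is weighted by
# the inverse of that modelled variance. Variable names below are hypothetical.
#
#     std_per_snr, _, _ = binned_statistic(snr, pk, statistic="std", bins=snr_bin_edges)
#     coef, _ = curve_fit(fitfunc_variance_pk1d, snr_bin_centers, std_per_snr**2,
#                         bounds=(0, np.inf))
#     snr_clipped = np.clip(snr, 1.01, MEANPK_FITRANGE_SNR[1])
#     weights = 1.0 / fitfunc_variance_pk1d(snr_clipped, *coef)
#     mean_pk = np.average(pk, weights=weights)
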
def compute_cov(
    p1d_table,
    mean_p1d_table,
    zbin_centers,
    n_chunks,
    kbin_edges,
    k_index,
    nbins_k,
    weight_method,
    snrfit_table,
    izbin,
    select_z,
    sub_forest_ids,
):
    """Compute the covariance of a set of 1D power spectra.

    Arguments
    ---------
    p1d_table (array-like):
    Table of 1D power spectra, with columns 'Pk' and 'sub_forest_id'.

    mean_p1d_table (array-like):
    Table of mean 1D power spectra, with column 'meanPk'.

    zbin_centers (array-like):
    Array of bin centers for redshift.

    n_chunks (array-like):
    Array of the number of chunks in each redshift bin.

    kbin_edges (array-like):
    Edges of the wavenumber bins.

    k_index (array-like):
    Array of indices for k-values, with -1 indicating values outside of the k bins.

    nbins_k (int):
    Number of k bins.

    weight_method: str,
    Method to weight the data.

    snrfit_table: numpy ndarray,
    Table containing SNR fit info

    izbin (int):
    Current redshift bin being considered.

    select_z (array-like):
    Boolean array for selecting data points based on redshift.

    sub_forest_ids (array-like):
    Array of chunk ids.

    Return
    ------
    zbin_array (array-like):
    Array of redshift bin centers for each covariance coefficient.

    index_zbin_array (array-like):
    Array of redshift bin indices for each covariance coefficient.

    n_array (array-like):
    Array of the number of power spectra used to compute each covariance coefficient.

    covariance_array (array-like):
    Array of covariance coefficients.

    k1_array, k2_array (array-like):
    Arrays of the k bin centers associated with each covariance coefficient.
    """
    zbin_array = np.zeros(nbins_k * nbins_k)
    index_zbin_array = np.zeros(nbins_k * nbins_k, dtype=int)
    n_array = np.zeros(nbins_k * nbins_k)
    covariance_array = np.zeros(nbins_k * nbins_k)
    k1_array = np.zeros(nbins_k * nbins_k)
    k2_array = np.zeros(nbins_k * nbins_k)
    if weight_method == "fit_snr":
        weight_array = np.zeros(nbins_k * nbins_k)

    kbin_centers = (kbin_edges[1:] + kbin_edges[:-1]) / 2
    if n_chunks[izbin] == 0:  # Fill rows with NaNs
        zbin_array[:] = zbin_centers[izbin]
        index_zbin_array[:] = izbin
        n_array[:] = 0
        covariance_array[:] = np.nan
        k1_array[:] = np.nan
        k2_array[:] = np.nan
        return (
            zbin_array,
            index_zbin_array,
            n_array,
            covariance_array,
            k1_array,
            k2_array,
        )
    # First loop 1) id sub-forest bins
    for sub_forest_id in sub_forest_ids:
        select_id = select_z & (p1d_table["sub_forest_id"] == sub_forest_id)
        selected_pk = p1d_table["Pk"][select_id]
        selected_ikbin = k_index[select_id]

        if weight_method == "fit_snr":
            # Definition of weighted unbiased sample covariance taken from
            # "George R. Price, Ann. Hum. Genet., Lond, pp485-490,
            # Extension of covariance selection mathematics, 1972"
            # adapted with weights w_i = np.sqrt(w_ipk * w_ipk2) for covariance
            # to obtain the right definition of variance on the diagonal
            selected_snr = p1d_table["forest_snr"][select_id]
            snrfit_z = snrfit_table[izbin * nbins_k : (izbin + 1) * nbins_k, :]
            selected_variance_estimated = fitfunc_variance_pk1d(
                selected_snr, snrfit_z[selected_ikbin, 2], snrfit_z[selected_ikbin, 3]
            )
            # First loop 2) selected pk
            for ipk, _ in enumerate(selected_pk):
                ikbin = selected_ikbin[ipk]
                # First loop 3) selected pk
                for ipk2 in range(ipk, len(selected_pk)):
                    ikbin2 = selected_ikbin[ipk2]
                    if (ikbin2 != -1) & (ikbin != -1):
                        # index of the (ikbin,ikbin2) coefficient on the top of the matrix
                        index = (nbins_k * ikbin) + ikbin2
                        weight = 1 / selected_variance_estimated[ipk]
                        weight2 = 1 / selected_variance_estimated[ipk2]
                        covariance_array[index] = covariance_array[index] + selected_pk[
                            ipk
                        ] * selected_pk[ipk2] * np.sqrt(weight * weight2)
                        n_array[index] = n_array[index] + 1
                        weight_array[index] = weight_array[index] + np.sqrt(
                            weight * weight2
                        )
        else:
            # First loop 2) selected pk
            for ipk, _ in enumerate(selected_pk):
                ikbin = selected_ikbin[ipk]
                # First loop 3) selected pk
                for ipk2 in range(ipk, len(selected_pk)):
                    ikbin2 = selected_ikbin[ipk2]
                    if (ikbin2 != -1) & (ikbin != -1):
                        # index of the (ikbin,ikbin2) coefficient on the top of the matrix
                        index = (nbins_k * ikbin) + ikbin2
                        covariance_array[index] = (
                            covariance_array[index]
                            + selected_pk[ipk] * selected_pk[ipk2]
                        )
                        n_array[index] = n_array[index] + 1
    # Second loop 1) k bins
    for ikbin in range(nbins_k):
        mean_ikbin = mean_p1d_table["meanPk"][(nbins_k * izbin) + ikbin]

        # Second loop 2) k bins
        for ikbin2 in range(ikbin, nbins_k):
            mean_ikbin2 = mean_p1d_table["meanPk"][(nbins_k * izbin) + ikbin2]

            # index of the (ikbin,ikbin2) coefficient on the top of the matrix
            index = (nbins_k * ikbin) + ikbin2
            if weight_method == "fit_snr":
                covariance_array[index] = (
                    covariance_array[index] / weight_array[index]
                ) - mean_ikbin * mean_ikbin2
            else:
                covariance_array[index] = (
                    covariance_array[index] / n_array[index]
                ) - mean_ikbin * mean_ikbin2

            # Same normalization for fit_snr, equivalent to dividing by
            # weight_array[index] if weights are normalized to give
            # sum(weight_array[index]) = n_array[index]
            covariance_array[index] = covariance_array[index] / n_array[index]
            zbin_array[index] = zbin_centers[izbin]
            index_zbin_array[index] = izbin
            k1_array[index] = kbin_centers[ikbin]
            k2_array[index] = kbin_centers[ikbin2]

            if ikbin2 != ikbin:
                # index of the (ikbin,ikbin2) coefficient on the bottom of the matrix
                index_2 = (nbins_k * ikbin2) + ikbin
                covariance_array[index_2] = covariance_array[index]
                n_array[index_2] = n_array[index]

                zbin_array[index_2] = zbin_centers[izbin]
                index_zbin_array[index_2] = izbin
                k1_array[index_2] = kbin_centers[ikbin2]
                k2_array[index_2] = kbin_centers[ikbin]

    # For fit_snr method, due to the SNR fitting scheme used for weighting,
    # the diagonal of the weighted sample covariance matrix is not equal
    # to the error in mean P1D. This is tested on Ohio mocks.
    # We choose to renormalize the whole covariance matrix.
    if weight_method == "fit_snr":
        # Third loop 1) k bins
        for ikbin in range(nbins_k):
            std_ikbin = mean_p1d_table["errorPk"][(nbins_k * izbin) + ikbin]
            # Third loop 2) k bins
            for ikbin2 in range(ikbin, nbins_k):
                std_ikbin2 = mean_p1d_table["errorPk"][(nbins_k * izbin) + ikbin2]

                # index of the (ikbin,ikbin2) coefficient on the top of the matrix
                index = (nbins_k * ikbin) + ikbin2
                covariance_array[index] = (
                    covariance_array[index]
                    * (std_ikbin * std_ikbin2)
                    / np.sqrt(
                        covariance_array[(nbins_k * ikbin) + ikbin]
                        * covariance_array[(nbins_k * ikbin2) + ikbin2]
                    )
                )

                if ikbin2 != ikbin:
                    # index of the (ikbin,ikbin2) coefficient on the bottom of the matrix
                    index_2 = (nbins_k * ikbin2) + ikbin
                    covariance_array[index_2] = (
                        covariance_array[index_2]
                        * (std_ikbin * std_ikbin2)
                        / np.sqrt(
                            covariance_array[(nbins_k * ikbin) + ikbin]
                            * covariance_array[(nbins_k * ikbin2) + ikbin2]
                        )
                    )

    return zbin_array, index_zbin_array, n_array, covariance_array, k1_array, k2_array

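# Layout note (sketch, names illustrative): within one redshift bin, cov_table stores
# the full nbins_k x nbins_k matrix flattened row by row, so the (ikbin, ikbin2)
# coefficient of bin izbin can be recovered as:
#
#     row = cov_table[izbin * nbins_k * nbins_k + ikbin * nbins_k + ikbin2]
#     # row["k1"], row["k2"], row["covariance"]
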
def run_postproc_pk1d(
    data_dir,
    output_file,
    zbin_edges,
    kbin_edges,
    weight_method="no_weights",
    apply_z_weights=False,
    snrcut=None,
    zbins_snrcut=None,
    output_snrfit=None,
    nomedians=False,
    velunits=False,
    overwrite=False,
    ncpu=8,
    compute_covariance=False,
    compute_bootstrap=False,
    number_bootstrap=50,
):
    """
    Read individual Pk1D data from a set of files and compute P1D statistics.

    Arguments
    ---------
    data_dir: string
    Directory where individual P1D FITS files are located

    output_file: string
    Output file name

    overwrite: Bool
    Overwrite the output file if it already exists

    ncpu: int
    The I/O function read_pk1d() is run in parallel

    Other arguments are as defined
    in compute_mean_pk1d() or read_pk1d()
    """
    if os.path.exists(output_file) and not overwrite:
        raise RuntimeError("Output file already exists: " + output_file)

    searchstr = "*"
    files = glob.glob(os.path.join(data_dir, f"Pk1D{searchstr}.fits.gz"))

    with Pool(ncpu) as pool:
        output_readpk1d = pool.starmap(
            read_pk1d, [[f, kbin_edges, snrcut, zbins_snrcut] for f in files]
        )

    output_readpk1d = [x for x in output_readpk1d if x is not None]
    p1d_table = vstack([output_readpk1d[i][0] for i in range(len(output_readpk1d))])
    z_array = np.concatenate(
        tuple(output_readpk1d[i][1] for i in range(len(output_readpk1d)))
    )

    userprint("Individual P1Ds read, now computing statistics.")

    mean_p1d_table, metadata_table, cov_table = compute_mean_pk1d(
        p1d_table,
        z_array,
        zbin_edges,
        kbin_edges,
        weight_method,
        apply_z_weights,
        nomedians=nomedians,
        velunits=velunits,
        output_snrfit=output_snrfit,
        compute_covariance=compute_covariance,
        compute_bootstrap=compute_bootstrap,
        number_bootstrap=number_bootstrap,
    )

    result = fitsio.FITS(output_file, "rw", clobber=True)
    result.write(mean_p1d_table.as_array())
    result.write(
        metadata_table.as_array(),
        header={"VELUNITS": velunits, "NQSO": len(np.unique(p1d_table["forest_id"]))},
    )
    if cov_table is not None:
        result.write(cov_table.as_array())
    result.close()
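

# End-to-end usage sketch (illustrative only; the paths and bin edges are assumptions):
#
#     import numpy as np
#     from picca.pk1d.postproc_pk1d import run_postproc_pk1d
#
#     run_postproc_pk1d(
#         data_dir="pk1d_outputs/",
#         output_file="mean_pk1d.fits.gz",
#         zbin_edges=np.arange(2.1, 4.7, 0.2),
#         kbin_edges=np.arange(0.0, 0.02, 0.0005),   # Angstrom^-1
#         weight_method="fit_snr",
#         compute_covariance=True,
#         ncpu=8,
#         overwrite=True,
#     )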