• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

iprafols / stacking / 10923202860

18 Sep 2024 01:29PM UTC coverage: 99.247% (-0.8%) from 100.0%
10923202860

push

github

iprafols
yapfed and linted code

499 of 505 branches covered (98.81%)

Branch coverage included in aggregate %.

3 of 3 new or added lines in 3 files covered. (100.0%)

8 existing lines in 3 files now uncovered.

1345 of 1353 relevant lines covered (99.41%)

2.98 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

96.91
/stacking/stackers/split_stacker.py
1
""" This module defines the class SplitStacker to compute multiple
2
stacks splitting on one or more properties of the spectra"""
3

4
import logging
3✔
5

6
from astropy.table import Table
3✔
7
import numpy as np
3✔
8
import pandas as pd
3✔
9

10
from stacking.errors import StackerError
3✔
11
from stacking.spectrum import Spectrum
3✔
12
from stacking.stacker import Stacker
3✔
13
from stacking.stacker import defaults, accepted_options, required_options
3✔
14
from stacking.stackers.split_stacker_utils import (
3✔
15
    assign_group_multiple_cuts,
16
    assign_group_one_cut,
17
    extract_split_cut_sets,
18
    format_split_on,
19
    format_splits,
20
    retreive_group_number,
21
)
22
from stacking.utils import (update_accepted_options, update_default_options,
3✔
23
                            update_required_options)
24

25
VALID_SPLIT_TYPES = [
3✔
26
    # the split will be performed independently in the different variables,
27
    # thus, a spectrum can enter multiple splits
28
    "OR",
29
    # the split will be performed using all the different variables,
30
    # thus, a spectrum can enter only one split
31
    "AND"
32
]
33

34
accepted_options = update_accepted_options(accepted_options, [
3✔
35
    "catalogue HDU name or number", "specid name", "split catalogue name",
36
    "split on", "split cuts", "split type"
37
])
38
defaults = update_default_options(defaults, {
3✔
39
    "split type": "OR",
40
    "catalogue HDU name or number": "CATALOG",
41
})
42
required_options = update_required_options(required_options, [
3✔
43
    "catalogue HDU name or number", "specid name", "split catalogue name",
44
    "split on", "split cuts"
45
])
46

47

48
class SplitStacker(Stacker):
3✔
49
    """Abstract class to compute multiple stacks splitting on one
50
    or more properties of the spectra.
51

52
    Methods
53
    -------
54
    (see Stacker in stacking/stacker.py)
55
    __init__
56
    __parse_config
57
    assing_groups
58
    read_catalogue
59
    stack
60

61
    Attributes
62
    ----------
63
    (see Stacker in stacking/stacker.py)
64

65
    catalogue_hdu_name_or_number: str
66
    Name of the HDU in `split_catalogue_name` that contains the actual catalogue
67
    to split
68

69
    logger: logging.Logger
70
    Logger object
71

72
    groups_info: pd.DataFrame
73
    DataFrame containing the group information
74

75
    num_groups: int
76
    Number of groups the data is split on
77

78
    specid_name: str
79
    Name of the column containing the identifier SPECID
80

81
    split_catalogue: pd.DataFrame
82
    The catalogue to be split
83

84
    split_catalogue_name: str
85
    Filename of the catalogue to be split
86

87
    split_on: list of str
88
    List of column name(s) to be split
89

90
    split_type: "OR" or "AND"
91
    If "OR", then the split will be performed independently in the different
92
    variables (a spectrum can enter multiple splits). If "AND", the split will
93
    be performed using all the different variables (a spectrum can enter at most
94
    one split)
95

96
    splits: list of array of float
97
    List of intervals to perform the splits.
98
    Intervals are defined as [intervals[n], intervals[n+1]].
99
    The lower (upper) limit of the interval is included in (excluded from) the interval
100
    Values outside these intervals will be assigned a -1
101

102
    stackers: list of Stacker
103
    Stacker instances that will contain the stacked spectra for each of the groups
104
    Must be initialized by the child class
105
    """
106

107
    def __init__(self, config, groups_info=None, split_catalogue=None):
3✔
108
        """Initialize class instance
109

110
        Arguments
111
        ---------
112
        config: configparser.SectionProxy
113
        Parsed options to initialize class
114

115
        groups_info: pd.DataFrame or None - default: None
116
        If None, then the groups information will be computed upon initialization.
117
        Otherwise, this must be a pandas DataFrame with the previously computed information
118

119
        split_catalogue: pd.DataFrame or None - default: None
120
        If None, then the catalogue will be read from split_catalogue_name
121
        Otherwise, this must be a pandas DataFrame with the previously read catalogue
122
        """
123
        self.logger = logging.getLogger(__name__)
3✔
124
        super().__init__(config)
3✔
125

126
        self.catalogue_hdu_name_or_number = None
3✔
127
        self.specid_name = None
3✔
128
        self.split_catalogue_name = None
3✔
129
        self.split_on = None
3✔
130
        self.split_type = None
3✔
131
        self.splits = []
3✔
132
        self.__parse_config(config)
3✔
133

134
        # read the catalogue
135
        if split_catalogue is None:
3!
136
            self.split_catalogue = self.read_catalogue()
3✔
137
        else:
UNCOV
138
            self.split_catalogue = split_catalogue
×
139

140
        # add groups
141
        if groups_info is None:
3!
142
            self.num_groups = None
3✔
143
            self.groups_info = None
3✔
144
            self.assing_groups()
3✔
145
        else:
UNCOV
146
            self.num_groups = groups_info.shape[0]
×
UNCOV
147
            self.groups_info = groups_info
×
148

149
        # This needs to be defined in the child class
150
        self.stackers = []
3✔
151

152
    def __parse_config(self, config):
3✔
153
        """Parse the configuration options
154

155
        Arguments
156
        ---------
157
        config: configparser.SectionProxy
158
        Parsed options to initialize class
159

160
        Raise
161
        -----
162
        StackerError upon missing required variables
163
        StackerError if variables are not properly formatted
164
        StackerError if variables are not coherent
165
        """
166
        self.catalogue_hdu_name_or_number = config.get(
3✔
167
            "catalogue HDU name or number")
168
        if self.catalogue_hdu_name_or_number is None:
3✔
169
            raise StackerError(
3✔
170
                "Missing argument 'catalogue HDU name or number' required by "
171
                "SplitStacker")
172

173
        self.specid_name = config.get("specid name")
3✔
174
        if self.specid_name is None:
3✔
175
            raise StackerError("Missing argument 'specid name' required by "
3✔
176
                               "SplitStacker")
177

178
        self.split_catalogue_name = config.get("split catalogue name")
3✔
179
        if self.split_catalogue_name is None:
3✔
180
            raise StackerError(
3✔
181
                "Missing argument 'split catalogue name' required by "
182
                "SplitStacker")
183

184
        split_on = config.get("split on")
3✔
185
        if split_on is None:
3✔
186
            raise StackerError("Missing argument 'split on' required by "
3✔
187
                               "SplitStacker")
188
        # use any of the following as separators (comma semicolon space)
189
        self.split_on = format_split_on(split_on)
3✔
190

191
        self.split_type = config.get("split type")
3✔
192
        if self.split_type is None:
3✔
193
            raise StackerError("Missing argument 'split type' required by "
3✔
194
                               "SplitStacker")
195
        self.split_type = self.split_type.upper()
3✔
196
        if self.split_type not in VALID_SPLIT_TYPES:
3✔
197
            raise StackerError(
3✔
198
                "Invalid value for argument 'split on' required by SplitStacker. "
199
                "Expected one of '" + " ".join(VALID_SPLIT_TYPES) +
200
                f" Found: '{self.split_type}'")
201

202
        split_cuts = config.get("split cuts")
3✔
203
        if split_cuts is None:
3✔
204
            raise StackerError("Missing argument 'split cuts' required by "
3✔
205
                               "SplitStacker")
206
        # the splitting on the different quantities is done using ; plus
207
        # possibly spaces
208
        split_cuts_sets = extract_split_cut_sets(split_cuts)
3✔
209
        if len(split_cuts_sets) != len(self.split_on):
3✔
210
            raise StackerError(
3✔
211
                "Inconsistency found in reading the splits. The number of "
212
                f"splitting variables is {len(self.split_on)}, but I found "
213
                f"{len(split_cuts_sets)} sets of cuts. Read vaues are\n"
214
                f"'split on' = '{self.split_on}'\n'split cuts' = '{split_cuts}'. "
215
                "Splitting variables are delimited by a semicolon (;), a comma"
216
                "(,) or a white space. Cuts sets should be delimited by the "
217
                "character ';'. Cut values within a given set should be delimited "
218
                "by commas and/or whitespaces)")
219
        self.splits = format_splits(split_cuts_sets)
3✔
220

221
    def assing_groups(self):
3✔
222
        """Assign groups to the catalogue entries. Store the total number of groups
223

224
        If split_type is OR-like, then assign one group number per variable in
225
        the split. Else, if split_type is AND-like, then assign a single group
226
        number
227
        """
228
        self.logger.progress("Assigning groups")
3✔
229

230
        self.num_groups = 0
3✔
231
        if self.split_type == "OR":
3✔
232
            groups = []
3✔
233
            for index, variable in enumerate(self.split_on):
3✔
234
                self.split_catalogue[
3✔
235
                    f"GROUP_{index}"] = self.split_catalogue.apply(
236
                        assign_group_one_cut,
237
                        axis=1,
238
                        args=(variable, self.splits[index], self.num_groups),
239
                    )
240
                # keep grouping info
241
                groups += [[
3✔
242
                    variable, min_value, max_value, f"GROUP_{index}",
243
                    group_index + self.num_groups
244
                ] for group_index, (min_value, max_value) in enumerate(
245
                    zip(self.splits[index][:-1], self.splits[index][1:]))]
246
                # update num_groups
247
                self.num_groups += self.splits[index].size - 1
3✔
248

249
            self.groups_info = pd.DataFrame(data=groups,
3✔
250
                                            columns=[
251
                                                "VARIABLE", "MIN_VALUE",
252
                                                "MAX_VALUE", "COLNAME",
253
                                                "GROUP_NUM"
254
                                            ])
255
        elif self.split_type == "AND":
3✔
256
            num_intervals = np.array([
3✔
257
                self.splits[index].size - 1
258
                for index in range(len(self.split_on))
259
            ])
260

261
            self.split_catalogue["GROUP"] = self.split_catalogue.apply(
3✔
262
                assign_group_multiple_cuts,
263
                axis=1,
264
                args=(self.split_on, self.splits, num_intervals),
265
            )
266

267
            self.num_groups = np.prod(num_intervals)
3✔
268

269
            groups = []
3✔
270
            for group_number in range(self.num_groups):
3✔
271
                aux_groups = [group_number]
3✔
272
                for index, num_intervals_variable in enumerate(num_intervals):
3✔
273
                    variable_index = group_number % num_intervals_variable
3✔
274
                    aux_groups += [
3✔
275
                        self.split_on[index],
276
                        self.splits[index][variable_index],
277
                        self.splits[index][variable_index + 1]
278
                    ]
279
                    group_number = (group_number -
3✔
280
                                    variable_index) // num_intervals_variable
281
                groups.append(aux_groups)
3✔
282

283
            # columns of the data frame
284
            cols = ["GROUP_NUM"]
3✔
285
            for index in range(len(self.split_on)):
3✔
286
                cols += [
3✔
287
                    f"VARIABLE_{index}", f"MIN_VALUE_{index}",
288
                    f"MAX_VALUE_{index}"
289
                ]
290

291
            self.groups_info = pd.DataFrame(data=groups, columns=cols)
3✔
292

293
        # this should never enter unless new split types are not properly added
294
        else:  # pragma: no cover
295
            raise StackerError(
296
                f"Don't know what to do with split type {self.split_type}. "
297
                "This is one of the supported split types, maybe it "
298
                "was not properly coded. If you did the change yourself, check "
299
                "that you added the behaviour of the new mode to method `assing_groups`. "
300
                "Otherwise contact 'stacking' developpers.")
301

302
        self.logger.progress("Groups assigned")
3✔
303

304
    def read_catalogue(self):
3✔
305
        """Read the catalogue to do the splits
306

307
        Return
308
        -----
309
        split_catalogue: pd.DataFrame
310
        The catalogue to be split
311

312
        Raise
313
        -----
314
        StackerError if file is not found
315
        """
316
        self.logger.progress("Reading catalogue from %s",
3✔
317
                             self.split_catalogue_name)
318
        self.logger.progress("Reading HDU '%s'",
3✔
319
                             self.catalogue_hdu_name_or_number)
320
        try:
3✔
321
            catalogue = Table.read(self.split_catalogue_name,
3✔
322
                                   hdu=self.catalogue_hdu_name_or_number)
323
        # we are currently not accessing this as astropy reads the first HDU
324
        # when it does not find the correct key. However, we do not delete this
325
        # check as it is currently raising a DeprecationWarning that will soon
326
        # turn to an error
327
        except KeyError:  # pragma: no cover
328
            self.logger.warning(
329
                "Error reading HDU '%s'. Maybe it is was a name but rather a "
330
                "number. I will try this and come back to you",
331
                self.catalogue_hdu_name_or_number)
332
            try:
333
                catalogue = Table.read(self.split_catalogue_name,
334
                                       hdu=int(
335
                                           self.catalogue_hdu_name_or_number))
336
            except ValueError as error:
337
                raise StackerError(
338
                    "SplitStacker: Problem reading HDU "
339
                    f"{self.catalogue_hdu_name_or_number}") from error
340
            self.logger.ok_warning("Catalogue read properly")
341

342
        except FileNotFoundError as error:
3✔
343
            raise StackerError("SplitStacker: Could not find catalogue: "
3✔
344
                               f"{self.split_catalogue_name}") from error
345

346
        keep_columns = self.split_on + [self.specid_name]
3✔
347

348
        split_catalogue = catalogue[keep_columns].to_pandas()
3✔
349
        split_catalogue.rename(columns={self.specid_name: "SPECID"},
3✔
350
                               inplace=True)
351
        split_catalogue["IN_STACK"] = False
3✔
352

353
        self.logger.progress("Catalogue read")
3✔
354

355
        return split_catalogue
3✔
356

357
    def stack(self, spectra):
3✔
358
        """ Stack spectra
359

360
        Arguments
361
        ---------
362
        spectra: list of Spectrum
363
        The spectra to stack
364

365
        Raise
366
        -----
367
        StackerError if the stackers have not been initialized by the child class
368
        """
369
        if len(self.stackers) != self.num_groups:
3✔
370
            raise StackerError(
3✔
371
                f"I expected {self.num_groups} stackers but found "
372
                f"{len(self.stackers)}. Make sure the member 'stackers' is "
373
                "properly intialized in the child class")
374

375
        self.stacked_flux = np.zeros(
3✔
376
            (Spectrum.common_wavelength_grid.size, self.num_groups),
377
            dtype=float)
378
        self.stacked_weight = np.zeros_like(self.stacked_flux)
3✔
379

380
        for group_number, stacker in enumerate(self.stackers):
3✔
381

382
            # select the spectra of this particular group
383
            if self.split_type == "OR":
3✔
384
                col = self.groups_info[self.groups_info["GROUP_NUM"] ==
3✔
385
                                       group_number]["COLNAME"].values[0]
386
            elif self.split_type == "AND":
3✔
387
                col = "GROUP"
3✔
388

389
            # this should never enter unless new split types are not properly added
390
            else:  # pragma: no cover
391
                raise StackerError(
392
                    f"Don't know what to do with split type {self.split_type}. "
393
                    "This is one of the supported split types, maybe it "
394
                    "was not properly coded. If you did the change yourself, check "
395
                    "that you added the behaviour of the new mode to method `stack`. "
396
                    "Otherwise contact 'stacking' developpers.")
397

398
            selected_spectra = [
3✔
399
                spectrum for spectrum in spectra if retreive_group_number(
400
                    spectrum.specid, self.split_catalogue["SPECID"].values,
401
                    self.split_catalogue[col].values) == group_number
402
            ]
403

404
            # run the stack
405
            stacker.stack(selected_spectra)
3✔
406

407
            self.stacked_flux[:, group_number] = stacker.stacked_flux
3✔
408
            self.stacked_weight[:, group_number] = stacker.stacked_weight
3✔
409

410
            # update statistics
411
            selected_specids = [
3✔
412
                spectrum.specid for spectrum in selected_spectra
413
            ]
414
            self.split_catalogue.loc[
3✔
415
                self.split_catalogue["SPECID"].isin(selected_specids),
416
                "IN_STACK"] = True
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2026 Coveralls, Inc