18732618520

Committed 22 Oct 2025 11:21PM UTC coverage: 25.34% (-70.0%) from 95.34%

Build # 18732618520

Build Type

Pull #70

github

Committed by

miketynes

Commit Message

fix init, logging init, waiting

Pull Request #70: Academy proto

Run Details

261 of 1030 relevant lines covered (25.34%)

0.25 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

75.0

/cascade/learning/finetuning.py

"""Utilities and data models used when finetuning a model"""
from typing import Collection, Iterable, Any
from dataclasses import dataclass, field

from ase import Atoms


def filter_by_elements(atoms_gen: Iterable[Atoms], allowed_elems: Collection[str]) -> Iterable[Atoms]:
    """Lazily drop structures that contain any element outside an allowed set

    Args:
        atoms_gen: Stream of Atoms structures to screen
        allowed_elems: Chemical symbols that a structure is permitted to contain
    Yields:
        Each structure, in its original order, whose chemical symbols all appear in ``allowed_elems``
    """

    # Build the lookup set once so every membership test inside the loop is a hash lookup
    permitted = set(allowed_elems)
    for structure in atoms_gen:
        # A structure passes only when every one of its symbols is permitted;
        # a structure with no symbols at all therefore passes as well
        if permitted.issuperset(structure.get_chemical_symbols()):
            yield structure


# TODO (wardlt): Build towards more advanced methods, like https://arxiv.org/abs/2404.12526v1
@dataclass
class MultiHeadConfig:
    """Configuration used to define replay training"""

    # Defining the training data
    original_dataset: list[Atoms] = ...
    """Path to dataset containing the original training samples

    Must be in a form readable by ASE.
    """
    num_downselect: int | None = None
    """Number of points from the dataset to use for training each training round"""

    # Defining the training procedure
    epoch_frequency: int = 1
    """How often to retrain using the original dataset"""
    lr_reduction: float = 1
    """Factor by which to reduce the learning rate during replay"""
    batch_size: int | None = None
    """Batch size to use during replay"""

    learner_options: dict[str, Any] = field(default_factory=dict)
    """Options specific to a certain learner"""

1	"""Utilities and data models used when finetuning a model"""
2	from typing import Collection, Iterable, Any	1✔
3	from dataclasses import dataclass, field	1✔
4
5	from ase import Atoms	1✔
6
7
8	def filter_by_elements(atoms_gen: Iterable[Atoms], allowed_elems: Collection[str]) -> Iterable[Atoms]:	1✔
9	"""Process a stream of entries to only include those with allowed elements
10
11	Args:
12	atoms_gen: Stream of Atoms structures to be filtered
13	allowed_elems: List of elements which are allowed in the dataset
14	Yields:
15	Atoms from the stream which contain only the desired elements
16	"""
17
18	allowed_elems = set(allowed_elems)	×
19	for atoms in atoms_gen:	×
20	elems = set(atoms.get_chemical_symbols())	×
21	if any(e not in allowed_elems for e in elems):	×
22	continue	×
23	yield atoms	×
24
25
26	# TODO (wardlt): Build towards more advanced methods, like https://arxiv.org/abs/2404.12526v1
27	@dataclass	1✔
28	class MultiHeadConfig:	1✔
29	"""Configuration used to define replay training"""
30
31	# Defining the training data
32	original_dataset: list[Atoms] = ...	1✔
33	"""Path to dataset containing the original training samples	1✔
34
35	Must be in a form readable by ASE.
36	"""
37	num_downselect: int | None = None	1✔
38	"""Number of points from the dataset to use for training each training round"""	1✔
39
40	# Defining the training procedure
41	epoch_frequency: int = 1	1✔
42	"""How often to retrain using the original dataset"""	1✔
43	lr_reduction: float = 1	1✔
44	"""Factor by which to reduce the learning rate during replay"""	1✔
45	batch_size: int | None = None	1✔
46	"""Batch size to use during replay"""	1✔
47
48	learner_options: dict[str, Any] = field(default_factory=dict)	1✔
49	"""Options specific to a certain learner"""	1✔

globus-labs / cascade / 18732618520

Source File Press 'n' to go to next uncovered line, 'b' for previous

Source File
Press 'n' to go to next uncovered line, 'b' for previous