• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

sgkit-dev / bio2zarr / 14743868772

30 Apr 2025 12:16AM UTC coverage: 97.318% (-0.7%) from 98.015%
14743868772

Pull #346

github

web-flow
Merge 0c6fd5ecf into 3d8ad3ed1
Pull Request #346: Initial tskit conversion code

102 of 126 new or added lines in 1 file covered. (80.95%)

2721 of 2796 relevant lines covered (97.32%)

5.84 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

80.95
/bio2zarr/ts.py
1
import logging
6✔
2
import pathlib
6✔
3

4
import numpy as np
6✔
5
import tskit
6✔
6

7
from bio2zarr import constants, core, vcz
6✔
8

9
logger = logging.getLogger(__name__)
6✔
10

11

12
class TskitFormat(vcz.Source):
6✔
13
    def __init__(self, ts_path, contig_id=None, ploidy=None, isolated_as_missing=False):
6✔
14
        self._path = ts_path
6✔
15
        self.ts = tskit.load(ts_path)
6✔
16
        self.contig_id = contig_id if contig_id is not None else "1"
6✔
17
        self.isolated_as_missing = isolated_as_missing
6✔
18

19
        self._make_sample_mapping(ploidy)
6✔
20
        self.positions = self.ts.sites_position
6✔
21

22
    @property
6✔
23
    def path(self):
6✔
24
        return self._path
6✔
25

26
    @property
6✔
27
    def num_records(self):
6✔
28
        return self.ts.num_sites
6✔
29

30
    @property
6✔
31
    def num_samples(self):
6✔
32
        return self._num_samples
6✔
33

34
    @property
6✔
35
    def samples(self):
6✔
36
        return self._samples
6✔
37

38
    @property
6✔
39
    def root_attrs(self):
6✔
40
        return {}
6✔
41

42
    @property
6✔
43
    def contigs(self):
6✔
44
        return [vcz.Contig(id=self.contig_id)]
6✔
45

46
    def _make_sample_mapping(self, ploidy):
6✔
47
        ts = self.ts
6✔
48
        self.individual_ploidies = []
6✔
49
        self.max_ploidy = 0
6✔
50

51
        if ts.num_individuals > 0 and ploidy is not None:
6✔
NEW
52
            raise ValueError(
×
53
                "Cannot specify ploidy when individuals are present in tables"
54
            )
55

56
        # Find all sample nodes that reference individuals
57
        individuals = np.unique(ts.nodes_individual[ts.samples()])
6✔
58
        if len(individuals) == 1 and individuals[0] == tskit.NULL:
6✔
59
            # No samples refer to individuals
NEW
60
            individuals = None
×
61
        else:
62
            # np.unique sorts the argument, so if NULL (-1) is present it
63
            # will be the first value.
64
            if individuals[0] == tskit.NULL:
6✔
NEW
65
                raise ValueError(
×
66
                    "Sample nodes must either all be associated with individuals "
67
                    "or not associated with any individuals"
68
                )
69

70
        if individuals is not None:
6✔
71
            self.sample_ids = []
6✔
72
            for i in individuals:
6✔
73
                if i < 0 or i >= self.ts.num_individuals:
6✔
NEW
74
                    raise ValueError("Invalid individual IDs provided.")
×
75
                ind = self.ts.individual(i)
6✔
76
                if len(ind.nodes) == 0:
6✔
NEW
77
                    raise ValueError(f"Individual {i} not associated with a node")
×
78
                is_sample = {ts.node(u).is_sample() for u in ind.nodes}
6✔
79
                if len(is_sample) != 1:
6✔
NEW
80
                    raise ValueError(
×
81
                        f"Individual {ind.id} has nodes that are sample and "
82
                        "non-samples"
83
                    )
84
                self.sample_ids.extend(ind.nodes)
6✔
85
                self.individual_ploidies.append(len(ind.nodes))
6✔
86
                self.max_ploidy = max(self.max_ploidy, len(ind.nodes))
6✔
87
        else:
NEW
88
            if ploidy is None:
×
NEW
89
                ploidy = 1
×
NEW
90
            if ploidy < 1:
×
NEW
91
                raise ValueError("Ploidy must be >= 1")
×
NEW
92
            if ts.num_samples % ploidy != 0:
×
NEW
93
                raise ValueError("Sample size must be divisible by ploidy")
×
NEW
94
            self.individual_ploidies = np.full(
×
95
                ts.num_samples // ploidy, ploidy, dtype=np.int32
96
            )
NEW
97
            self.max_ploidy = ploidy
×
NEW
98
            self.sample_ids = np.arange(ts.num_samples, dtype=np.int32)
×
99

100
        self._num_samples = len(self.individual_ploidies)
6✔
101

102
        self._samples = [vcz.Sample(id=f"tsk_{j}") for j in range(self.num_samples)]
6✔
103

104
    def iter_contig(self, start, stop):
6✔
105
        yield from (0 for _ in range(start, stop))
6✔
106

107
    def iter_field(self, field_name, shape, start, stop):
6✔
108
        if field_name == "position":
6✔
109
            for pos in self.ts.sites_position[start:stop]:
6✔
110
                yield int(pos)
6✔
111
        else:
NEW
112
            raise ValueError(f"Unknown field {field_name}")
×
113

114
    def iter_alleles(self, start, stop, num_alleles):
6✔
NEW
115
        for variant in self.ts.variants(
×
116
            samples=self.sample_ids,
117
            isolated_as_missing=self.isolated_as_missing,
118
            left=self.positions[start],
119
            right=self.positions[stop] if stop < self.num_records else None,
120
        ):
NEW
121
            alleles = np.full(num_alleles, constants.STR_FILL, dtype="O")
×
NEW
122
            for i, allele in enumerate(variant.alleles):
×
NEW
123
                assert i < num_alleles
×
NEW
124
                alleles[i] = allele
×
NEW
125
            yield alleles
×
126

127
    def iter_alleles_and_genotypes(self, start, stop, shape, num_alleles):
6✔
128
        gt = np.zeros(shape, dtype=np.int8)
6✔
129
        phased = np.zeros(shape[:-1], dtype=bool)
6✔
130

131
        for variant in self.ts.variants(
6✔
132
            samples=self.sample_ids,
133
            isolated_as_missing=self.isolated_as_missing,
134
            left=self.positions[start],
135
            right=self.positions[stop] if stop < self.num_records else None,
136
        ):
137
            alleles = np.full(num_alleles, constants.STR_FILL, dtype="O")
6✔
138
            for i, allele in enumerate(variant.alleles):
6✔
139
                assert i < num_alleles
6✔
140
                alleles[i] = allele
6✔
141

142
            genotypes = variant.genotypes
6✔
143
            sample_index = 0
6✔
144
            for i, ploidy in enumerate(self.individual_ploidies):
6✔
145
                for j in range(ploidy):
6✔
146
                    if j < self.max_ploidy:  # Only fill up to max_ploidy
6✔
147
                        try:
6✔
148
                            gt[i, j] = genotypes[sample_index + j]
6✔
NEW
149
                        except IndexError:
×
150
                            # This can happen if the ploidy varies between individuals
NEW
151
                            gt[i, j] = -2  # Fill value
×
152

153
                # In tskit, all genotypes are considered phased
154
                phased[i] = True
6✔
155
                sample_index += ploidy
6✔
156

157
            yield alleles, (gt, phased)
6✔
158

159
    def generate_schema(
6✔
160
        self,
161
        variants_chunk_size=None,
162
        samples_chunk_size=None,
163
    ):
164
        n = self.num_samples
6✔
165
        m = self.ts.num_sites
6✔
166

167
        # Determine max number of alleles
168
        max_alleles = 0
6✔
169
        for variant in self.ts.variants():
6✔
170
            max_alleles = max(max_alleles, len(variant.alleles))
6✔
171

172
        logging.info(f"Scanned tskit with {n} samples and {m} variants")
6✔
173
        logging.info(
6✔
174
            f"Maximum ploidy: {self.max_ploidy}, maximum alleles: {max_alleles}"
175
        )
176

177
        dimensions = {
6✔
178
            "variants": vcz.VcfZarrDimension(
179
                size=m, chunk_size=variants_chunk_size or vcz.DEFAULT_VARIANT_CHUNK_SIZE
180
            ),
181
            "samples": vcz.VcfZarrDimension(
182
                size=n, chunk_size=samples_chunk_size or vcz.DEFAULT_SAMPLE_CHUNK_SIZE
183
            ),
184
            "ploidy": vcz.VcfZarrDimension(size=self.max_ploidy),
185
            "alleles": vcz.VcfZarrDimension(size=max_alleles),
186
        }
187

188
        schema_instance = vcz.VcfZarrSchema(
6✔
189
            format_version=vcz.ZARR_SCHEMA_FORMAT_VERSION,
190
            dimensions=dimensions,
191
            fields=[],
192
        )
193

194
        logger.info(
6✔
195
            "Generating schema with chunks="
196
            f"{schema_instance.dimensions['variants'].chunk_size}, "
197
            f"{schema_instance.dimensions['samples'].chunk_size}"
198
        )
199

200
        array_specs = [
6✔
201
            vcz.ZarrArraySpec(
202
                source="position",
203
                name="variant_position",
204
                dtype="i4",
205
                dimensions=["variants"],
206
                description="Position of each variant",
207
            ),
208
            vcz.ZarrArraySpec(
209
                source=None,
210
                name="variant_allele",
211
                dtype="O",
212
                dimensions=["variants", "alleles"],
213
                description="Alleles for each variant",
214
            ),
215
            vcz.ZarrArraySpec(
216
                source=None,
217
                name="variant_contig",
218
                dtype=core.min_int_dtype(0, len(self.contigs)),
219
                dimensions=["variants"],
220
                description="Contig/chromosome index for each variant",
221
            ),
222
            vcz.ZarrArraySpec(
223
                source=None,
224
                name="call_genotype_phased",
225
                dtype="bool",
226
                dimensions=["variants", "samples"],
227
                description="Whether the genotype is phased",
228
                compressor=vcz.DEFAULT_ZARR_COMPRESSOR_BOOL.get_config(),
229
            ),
230
            vcz.ZarrArraySpec(
231
                source=None,
232
                name="call_genotype",
233
                dtype="i1",
234
                dimensions=["variants", "samples", "ploidy"],
235
                description="Genotype for each variant and sample",
236
                compressor=vcz.DEFAULT_ZARR_COMPRESSOR_GENOTYPES.get_config(),
237
            ),
238
            vcz.ZarrArraySpec(
239
                source=None,
240
                name="call_genotype_mask",
241
                dtype="bool",
242
                dimensions=["variants", "samples", "ploidy"],
243
                description="Mask for each genotype call",
244
                compressor=vcz.DEFAULT_ZARR_COMPRESSOR_BOOL.get_config(),
245
            ),
246
        ]
247
        schema_instance.fields = array_specs
6✔
248
        return schema_instance
6✔
249

250

251
def convert(
6✔
252
    ts_path,
253
    zarr_path,
254
    *,
255
    contig_id=None,
256
    ploidy=None,
257
    isolated_as_missing=False,
258
    variants_chunk_size=None,
259
    samples_chunk_size=None,
260
    worker_processes=1,
261
    show_progress=False,
262
):
263
    tskit_format = TskitFormat(
6✔
264
        ts_path,
265
        contig_id=contig_id,
266
        ploidy=ploidy,
267
        isolated_as_missing=isolated_as_missing,
268
    )
269
    schema_instance = tskit_format.generate_schema(
6✔
270
        variants_chunk_size=variants_chunk_size,
271
        samples_chunk_size=samples_chunk_size,
272
    )
273
    zarr_path = pathlib.Path(zarr_path)
6✔
274
    vzw = vcz.VcfZarrWriter(TskitFormat, zarr_path)
6✔
275
    # Rough heuristic to split work up enough to keep utilisation high
276
    target_num_partitions = max(1, worker_processes * 4)
6✔
277
    vzw.init(
6✔
278
        tskit_format,
279
        target_num_partitions=target_num_partitions,
280
        schema=schema_instance,
281
    )
282
    vzw.encode_all_partitions(
6✔
283
        worker_processes=worker_processes,
284
        show_progress=show_progress,
285
    )
286
    vzw.finalise(show_progress)
6✔
287
    vzw.create_index()
6✔
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2025 Coveralls, Inc