sequana / sequana / 16568275773

28 Jul 2025 11:51AM UTC coverage: 69.115% (-0.6%) from 69.722%

push | github | web-flow
Merge pull request #870 from cokelaer/dev

add threshold parameter in G4 module

1 of 2 new or added lines in 1 file covered. (50.0%)
725 existing lines in 17 files now uncovered.
14389 of 20819 relevant lines covered (69.11%)
2.07 hits per line

Source file: /sequana/bedtools.py (92.6% covered)

#
#  This file is part of Sequana software
#
#  Copyright (c) 2016-2022 - Sequana Development Team
#
#  Distributed under the terms of the 3-clause BSD license.
#  The full license is in the LICENSE file, distributed with this software.
#
#  website: https://github.com/sequana/sequana
#  documentation: http://sequana.readthedocs.io
#
##############################################################################
"""Utilities for the genome coverage"""
import copy
import gc
import os
import random
import sys
from collections import Counter

import colorlog
from easydev import Progress, TempFile
from tqdm import tqdm

from sequana.errors import BadFileFormat
from sequana.genbank import GenBank
from sequana.gff3 import GFF3
from sequana.lazy import numpy as np
from sequana.lazy import pandas as pd
from sequana.lazy import pylab, pysam
from sequana.stats import evenness
from sequana.summary import Summary

logger = colorlog.getLogger(__name__)


__all__ = ["SequanaCoverage", "ChromosomeCov", "DoubleThresholds"]


class DoubleThresholds(object):
    """Simple structure to handle the double thresholds for the negative and
    positive sides

    Used by the :class:`SequanaCoverage` and related classes.

    ::

        dt = DoubleThresholds(-3, 4, 0.5, 0.5)

    This means the low threshold is -3 while the high threshold is 4. The two
    following values must be between 0 and 1 and are used to define the
    secondary thresholds, here set to half the value of the main thresholds.

    Internally, the main thresholds are stored in the low and high attributes.
    The secondary thresholds are derived from the main thresholds and the
    two ratios. The ratios are named ldtr and hdtr for low double threshold
    ratio and high double threshold ratio. The secondary thresholds are
    denoted low2 and high2 and are updated automatically if low, high, ldtr or
    hdtr are changed.

    """

    def __init__(self, low=-3, high=3, ldtr=0.5, hdtr=0.5):
        assert ldtr >= 0.0 and ldtr <= 1.0, "ldtr parameter (low double threshold ratio) must be in [0,1]"
        assert hdtr >= 0.0 and hdtr <= 1.0, "hdtr parameter (high double threshold ratio) must be in [0,1]"
        assert low < 0, "low threshold must be negative"
        assert high > 0, "high threshold must be positive"

        self._ldtr = ldtr
        self._hdtr = hdtr
        self._high = high
        self._low = low

    def _get_ldtr(self):
        return self._ldtr

    def _set_ldtr(self, ldtr):
        self._ldtr = ldtr
        self._low2 = self._low * self._ldtr

    ldtr = property(_get_ldtr, _set_ldtr)

    def _get_hdtr(self):
        return self._hdtr

    def _set_hdtr(self, hdtr):
        self._hdtr = hdtr
        self._high2 = self._high * self._hdtr

    hdtr = property(_get_hdtr, _set_hdtr)

    def _get_low(self):
        return self._low

    def _set_low(self, value):
        assert value < 0.0
        self._low = value
        self._low2 = self._low * self._ldtr

    low = property(_get_low, _set_low)

    def _get_high(self):
        return self._high

    def _set_high(self, value):
        assert value > 0.0
        self._high = value
        self._high2 = self._high * self._hdtr

    high = property(_get_high, _set_high)

    def _get_low2(self):
        return self._low * self._ldtr

    low2 = property(_get_low2)

    def _get_high2(self):
        return self._high * self._hdtr

    high2 = property(_get_high2)

    def get_args(self):
        return "%.2f,%.2f,%.2f,%.2f" % (self.low, self.high, self.ldtr, self.hdtr)

    def copy(self):
        thresholds = DoubleThresholds(self.low, self.high, self.ldtr, self.hdtr)
        return thresholds

    def __str__(self):
        txt = f"Low threshold: {self.low}\n"
        txt += f"High threshold: {self.high}\n"
        txt += f"double-low threshold: {self.low2}\n"
        txt += f"double-high threshold: {self.high2}"
        return txt

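# Example (illustrative sketch, not part of the original module): the
# secondary thresholds track the main ones through the ldtr/hdtr ratios::
#
#     dt = DoubleThresholds(low=-4, high=4, ldtr=0.5, hdtr=0.5)
#     dt.low2, dt.high2    # (-2.0, 2.0)
#     dt.low = -6          # low2 is recomputed as low * ldtr
#     dt.low2              # -3.0
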
class SequanaCoverage(object):
    """Create a list of dataframes to hold data from a BED file generated with
    samtools depth.

    This class can be used to plot the coverage resulting from a mapping, which
    is stored in BED format. The BED file may contain several chromosomes.
    They are handled independently and accessible as a list of
    :class:`ChromosomeCov` instances.

    Example:

    .. plot::
        :include-source:

        from sequana import SequanaCoverage, sequana_data

        filename = sequana_data('JB409847.bed')
        reference = sequana_data("JB409847.fasta")

        gencov = SequanaCoverage(filename)

        # you can change the thresholds:
        gencov.thresholds.low = -4
        gencov.thresholds.high = 4
        #gencov.compute_gc_content(reference)

        gencov = SequanaCoverage(filename)
        for chrom in gencov:
            chrom.running_median(n=3001, circular=True)
            chrom.compute_zscore()
            chrom.plot_coverage()

    Results are stored in a list of :class:`ChromosomeCov` named
    :attr:`chr_list`. For Prokaryotes and small genomes, this API
    is convenient but takes lots of memory for larger genomes.

    Computational time information: scanning 24,000,000 rows

        - constructor (scanning 40,000,000 rows): 45s
        - select contig of 24,000,000 rows: 1min20
        - running median: 16s
        - compute zscore: 9s
        - c.get_rois():

    """

    def __init__(
        self,
        input_filename,
        annotation_file=None,
        low_threshold=-4,
        high_threshold=4,
        ldtr=0.5,
        hdtr=0.5,
        force=False,
        chunksize=5000000,
        quiet_progress=False,
        chromosome_list=[],
        reference_file=None,
        gc_window_size=101,
    ):
        """.. rubric:: constructor

        :param str input_filename: the input data with results of a bedtools
            genomecov run. This is just a 3-column file. The first column is a
            string (chromosome), the second column is the base position and the
            third is the coverage.
        :param str annotation_file: annotation file of your reference (GFF3/Genbank).
        :param float low_threshold: threshold used to identify under-covered
            genomic regions of interest (ROI). Must be negative.
        :param float high_threshold: threshold used to identify over-covered
            genomic regions of interest (ROI). Must be positive.
        :param float ldtr: fraction of the low_threshold to be used to define
            the intermediate threshold in the double threshold method. Must be
            between 0 and 1.
        :param float hdtr: fraction of the high_threshold to be used to define
            the intermediate threshold in the double threshold method. Must be
            between 0 and 1.
        :param chunksize: size of segments to analyse. If a chromosome is larger
            than the chunk size, it is split into N chunks. The segments are
            analysed independently and the ROIs and summaries joined together.
            Note that GC and plotting functionalities will only use the first
            chunk.
        :param force: some constraints are set in the code to prevent unwanted
            memory issues with specific data sets or parameters. Currently,
            by default, you cannot set the thresholds below 2.5 (considered as
            noise).
        :param chromosome_list: list of chromosomes to consider (names). This is
            useful for very large input data files (hundreds of millions of
            lines) where each chromosome can be analysed one by one. Used by the
            sequana_coverage standalone. The only advantage is to speed up the
            constructor creation and could also be used by the Snakemake
            implementation.
        :param reference_file: if provided, computes the GC content
        :param int gc_window_size: size of the GC sliding window. (default 101)
        """
        # Keep various information as attributes

        self._circular = None
        self._feature_dict = None
        self._gc_window_size = gc_window_size
        self._annotation_file = None
        self._reference_file = reference_file
        self._window_size = None
        self._input_filename = input_filename

        # place holder for basic stats on all chromosomes.
        self._stats = {}
        self._basic_stats = {}
        self._rois = {}
        self._html_list = set()

        # setters
        if annotation_file:
            self.annotation_file = annotation_file

        self.chunksize = chunksize
        self.force = force
        self.quiet_progress = quiet_progress

        if high_threshold < 2.5 and self.force is False:
            raise ValueError("high threshold must be >=2.5")
        if low_threshold > -2.5 and self.force is False:
            raise ValueError("low threshold must be <=-2.5")

        self.thresholds = DoubleThresholds(low_threshold, high_threshold, ldtr, hdtr)

        if not input_filename.endswith(".bed"):
            raise Exception("Input file must be a BED file (chromosome/position/coverage columns)")

        # scan BED files to set chromosome names
        self._scan_bed()
        if chromosome_list:
            for chrom in chromosome_list:
                if chrom not in self.chrom_names:
                    raise ValueError(
                        f"You provided an incorrect chromosome name ({chrom}). Please choose one amongst {self.chrom_names}"
                    )
            self.chrom_names = chromosome_list

    def __getitem__(self, index):
        chrom = self.chrom_names[index]
        return ChromosomeCov(self, chrom, self.thresholds, self.chunksize)

    def __iter__(self):
        for chrom in self.chrom_names:
            yield ChromosomeCov(self, chrom, self.thresholds, self.chunksize)

    def __len__(self):
        return len(self.chrom_names)

    @property
    def input_filename(self):
        return self._input_filename

    @property
    def reference_file(self):
        return self._reference_file

    @property
    def circular(self):
        """Get the circularity of chromosome(s). It must be a boolean."""
        return self._circular

    @circular.setter
    def circular(self, circular):
        if isinstance(circular, bool):
            self._circular = circular
        else:
            logger.error("TypeError: circular must be a boolean. True if your genome is circular and False if not.")
            sys.exit(1)

    @property
    def feature_dict(self):
        """Get the features dictionary of the genbank."""
        return self._feature_dict

    @property
    def gc_window_size(self):
        """Get or set the window size to compute the GC content."""
        return self._gc_window_size

    @gc_window_size.setter
    def gc_window_size(self, n):
        if n % 2 == 0:
            logger.warning("Window size should be an odd number. Incrementing by one.")
            self._gc_window_size = n + 1
        else:
            self._gc_window_size = n

    @property
    def annotation_file(self):
        """Get or set the genbank filename to annotate ROI detected with
        :meth:`ChromosomeCov.get_roi`. Changing the genbank filename will
        configure the :attr:`SequanaCoverage.feature_dict`.
        """
        return self._annotation_file

    @annotation_file.setter
    def annotation_file(self, annotation_file):
        if os.path.isfile(annotation_file):
            self._annotation_file = os.path.realpath(annotation_file)

            try:
                gff = GFF3(annotation_file)
                self._feature_dict = gff.get_features_dict()
            except BadFileFormat:
                self._feature_dict = GenBank(annotation_file).genbank_features_parser()

                # just to be equivalent to the GFF, we fill genes with locus tag
                # See get_simplify_dataframe from GFF3 class. FIXME should be uniformed
                for (
                    chrom_name,
                    values,
                ) in self._feature_dict.items():  # loop over chromosomes
                    # values is a list of dictionaries
                    for i, value in enumerate(values):
                        if "gene" not in value and "locus_tag" in value:
                            self._feature_dict[chrom_name][i]["gene"] = value["locus_tag"]

        else:
            logger.error("FileNotFoundError: The annotation file doesn't exist.")
            sys.exit(1)

    @property
    def window_size(self):
        """Get or set the window size to compute the running median. Size
        must be an integer.
        """
        return self._window_size

    @window_size.setter
    def window_size(self, n):
        if n % 2 == 0:
            self._window_size = n + 1
        else:
            self._window_size = n

    def _scan_bed(self):
        # Figure out the length of the data and the unique chromosome
        # names. This is required for large data sets. We do not store
        # any data here but the chromosome names and their positions
        # in the BED file. We also store the total length.
        # Attributes filled:
        #  - chrom_names: list of contig/chromosome names
        #  - positions: dictionary with contig names and the starting and ending
        #               positions. Starting is zero in general, but not
        #               compulsory
        #  - total_length: number of rows in the BED file

        N = 0

        logger.info("Scanning input file (chunk of {} rows)".format(self.chunksize))
        positions = {}
        self.chrom_names = []

        # rough estimate of the number of rows for the progress bar
        logger.info("Estimating number of chunks to read")
        fullsize = os.path.getsize(self.input_filename)
        data = pd.read_table(self.input_filename, header=None, sep="\t", nrows=self.chunksize)
        with TempFile() as fh:
            data.to_csv(fh.name, index=None, sep="\t")
            smallsize = os.path.getsize(fh.name)
        del data

        Nchunk = int(fullsize / smallsize)
        i = 0
        for chunk in tqdm(
            pd.read_table(
                self.input_filename,
                header=None,
                sep="\t",
                usecols=[0],
                chunksize=self.chunksize,
                dtype="string",
            ),
            total=Nchunk,
            disable=self.quiet_progress,
        ):
            # accumulate length
            N += len(chunk)

            # keep unique names (ordered)
            contigs = chunk[0].unique()
            for contig in contigs:
                if contig not in self.chrom_names:
                    self.chrom_names.append(contig)

            # group by names (unordered)
            chunk = chunk.groupby(0)
            for contig in contigs:
                if contig not in positions:
                    positions[contig] = {
                        "start": chunk.groups[contig].min(),
                        "end": chunk.groups[contig].max(),
                    }
                else:
                    positions[contig]["end"] = chunk.groups[contig].max()
            i += 1
            i = min(i, Nchunk)
        del chunk
        self.total_length = N

        # Used in the main standalone
        for k in positions.keys():
            positions[k]["N"] = positions[k]["end"] - positions[k]["start"] + 1
        self.positions = positions

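    # Descriptive note (not in the original): Nchunk above only sizes the tqdm
    # progress bar. The first chunk is written back to a temporary file and
    # the ratio fullsize/smallsize approximates the number of chunks. With
    # hypothetical numbers, a 5,000,000-row chunk weighing ~80 MB inside an
    # 800 MB BED file gives Nchunk ~ 10.
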
    def get_stats(self):
        """Return basic statistics for each chromosome

        :return: dictionary with chromosome names as keys
            and statistics as values.

        .. seealso:: :class:`ChromosomeCov`.

        .. note:: used in sequana_summary standalone
        """
        stats = {}
        for chrom_name in tqdm(self.chrom_names):
            if chrom_name in self._stats:
                stats[chrom_name] = self._stats[chrom_name]
            else:
                chrom = ChromosomeCov(self, chrom_name, self.thresholds, self.chunksize)
                stats[chrom.chrom_name] = chrom.get_stats()
                self._stats[chrom_name] = stats[chrom_name].copy()
        return stats


class ChromosomeCov(object):
    """Factory to manipulate coverage and extract regions of interest.

    Example:

    .. plot::
        :include-source:

        from sequana import SequanaCoverage, sequana_data
        filename = sequana_data("virus.bed")

        gencov = SequanaCoverage(filename)

        chrcov = gencov[0]
        chrcov.running_median(n=3001)
        chrcov.compute_zscore()
        chrcov.plot_coverage()

        df = chrcov.get_rois().get_high_rois()

    The *df* variable contains a dataframe with the high regions of interest
    (over-covered).

    If the data is large, the input data set is split into chunks. See
    :attr:`chunksize`, which is 5,000,000 by default.

    If your data is larger, then you should use the :meth:`run` method.

    .. seealso:: sequana_coverage standalone application
    """

    def __init__(self, genomecov, chrom_name, thresholds=None, chunksize=5000000):
        """.. rubric:: constructor

        :param genomecov: the parent :class:`SequanaCoverage` instance. The
            underlying dataframe must contain the columns ["pos", "cov"].
        :param chrom_name: to save space, no need to store the chrom name in the
            dataframe.
        :param thresholds: a data structure :class:`DoubleThresholds` that holds
            the double threshold values.
        :param chunksize: if the data is large, it is split and analysed by
            chunk. In such situations, you should use the :meth:`run` method
            instead of calling the running_median and compute_zscore functions.

        """
        self._rois = None
        self.binning = 1

        # keep track of the chunksize user argument.
        self.chunksize = chunksize

        # and keep a link to the original SequanaCoverage instance.
        self._bed = genomecov

        # placeholder for GC vector
        self._gc_content = None

        # the chromosome of interest
        self.chrom_name = chrom_name

        # full data in memory or split into chunks ?
        self._mode = None

        self.thresholds = thresholds.copy()

        # open the file as an iterator (may be huge), set _df to None and
        # all attributes to None.
        self.init()

    def init(self):
        # jump to the file position corresponding to the chrom name.
        N = self.bed.positions[self.chrom_name]["N"]
        toskip = self.bed.positions[self.chrom_name]["start"]

        self.iterator = pd.read_table(
            self.bed.input_filename,
            skiprows=toskip,
            nrows=N,
            header=None,
            sep="\t",
            chunksize=self.chunksize,
            dtype={0: "string"},
        )

        if N <= self.chunksize:
            # we can load all data into memory:
            self._mode = "memory"
        else:
            self._mode = "chunks"

        self._df = None
        self._reset_metrics()

    def _reset_metrics(self):
        # attributes for stats
        self._evenness = None
        self._DOC = None
        self._BOC = None
        self._CV = None
        self._STD = None
        self._C3 = None
        self._C4 = None
        self._rois = None

    def __str__(self):
        N = self.bed.positions[self.chrom_name]["N"]
        if self._mode == "memory":
            stats = self.get_stats()
            txt = "\nChromosome length: {:>10}".format(N)
            txt += "\nSequencing depth (DOC):                {:>10.2f} ".format(self.DOC)
            txt += "\nSequencing depth (median):             {:>10.2f} ".format(stats["Median"])
            txt += "\nBreadth of coverage (BOC) (percent):   {:>10.2f} ".format(self.BOC)
            txt += "\nCoverage standard deviation:    {:>10.2f}".format(self.STD)
            txt += "\nCoverage coefficient variation: {:>10.2f}".format(self.CV)
        else:
            stats = self.get_stats()
            txt = "\nChromosome length: {:>10}".format(N)
            txt += "\n!!!! Information based on a sample of {} points".format(self.chunksize)
            txt += "\nSequencing depth (DOC):                {:>10.2f} ".format(self.DOC)
            txt += "\nSequencing depth (median):             {:>10.2f} ".format(stats["Median"])
            txt += "\nBreadth of coverage (BOC) (percent):   {:>10.2f} ".format(self.BOC)
            txt += "\nCoverage standard deviation:    {:>10.2f}".format(self.STD)
            txt += "\nCoverage coefficient variation: {:>10.2f}".format(self.CV)
            txt += "\nExact values will be available in the summary file (json)"
        return txt

    def __len__(self):
        # binning may be used, so len is not the length of the
        # dataframe, but the min/max positions.
        if self.binning > 1:
            return self._length
        else:
            return int(self.bed.positions[self.chrom_name]["N"])

    def _set_chunk(self, chunk):
        chunk.rename(columns={0: "chr", 1: "pos", 2: "cov", 3: "mapq0"}, inplace=True)
        assert set(chunk["chr"].unique()) == set([self.chrom_name])
        chunk = chunk.set_index("chr", drop=True)
        chunk = chunk.set_index("pos", drop=False)
        self._df = chunk
        self._reset_metrics()

        # set GC if available
        if self.bed.reference_file:
            if self._gc_content is None:
                self._compute_gc_content()
            i1 = self._df.index[0] - 1
            i2 = self._df.index[-1]

            try:
                self._df["gc"] = self._gc_content[i1:i2]
            except TypeError:
                logger.warning(
                    f"Could not access the GC content of {self.chrom_name}. Check that your reference is the one used to build the BAM/BED file"
                )

    def next(self):
        try:
            chunk = next(self.iterator)
            self._set_chunk(chunk)
        except Exception as err:
            print(err)
            self._df = None

    def _check_window(self, W):
        if W * 2 > len(self.df):
            msg = "W ({}) is too large compared to the contig ({})".format(W, len(self.df))
            logger.error(msg)
            raise Exception(msg)

    def run(self, W, k=2, circular=False, binning=None, cnv_delta=None):

        self.init()
        # for the coverage snakemake pipeline
        if binning == -1:
            binning = None

        # for the coverage snakemake pipeline
        if cnv_delta == -1:
            cnv_delta = None

        if binning:
            logger.debug("binning={}".format(binning))
        if cnv_delta:
            logger.debug("cnv_delta={}".format(cnv_delta))
        if self.chunksize < 5 * W:
            logger.warning("chunksize is small compared to W. Not recommended!")
        if binning:
            assert binning > 1 and isinstance(binning, int), "binning must be an integer > 1"

        self.chunk_rois = []

        # Get the number of chunks
        num = self.bed.positions[self.chrom_name]["N"]
        denom = self.chunksize
        if num % denom == 0:
            N = num // denom
        else:
            N = num // denom + 1

        if binning is None or binning == 1:
            self.binning = 1
            if N > 1:
                pb = Progress(N)
                pb.animate(0)
            for i, chunk in enumerate(self.iterator):
                logger.debug("Analysing chunk {}".format(i + 1))
                self._set_chunk(chunk)

                logger.debug("running median computation")
                self.running_median(W, circular=circular)
                logger.debug("zscore computation")
                self.compute_zscore(k=k, verbose=False)  # avoid repetitive warning

                rois = self.get_rois()
                if cnv_delta is not None and cnv_delta > 1:
                    rois.merge_rois_into_cnvs(delta=cnv_delta)
                summary = self.get_summary()
                self.chunk_rois.append([summary, rois])
                if N > 1:
                    pb.animate(i + 1)
            if N > 1:
                print()
        else:
            # Need to store the length
            total_length = 0
            binned_df = None
            if N > 1:
                pb = Progress(N)
            for i, chunk in enumerate(self.iterator):
                # we group the data by small chunks of "binning" values
                self._set_chunk(chunk)
                NN = len(self._df)
                total_length += NN

                n = int(NN / binning)
                a = [i for i in range(n) for j in range(binning)]
                # if non-integer n, we need to fill the last ID manually
                # with the remaining points

                if len(a) < NN:
                    a = a + [n] * (NN - len(a))
                # Now, we groupby on the ID, and aggregate the cov column (mean)
                # while the pos is aggregated by simply taking the minimum
                # (starting position).
                df = self._df.copy()
                df["id"] = a
                groups = df.groupby("id")
                # Don't understand why but this is ultra slow
                # when called from this method, whereas it is
                # fast in a shell, when calling statement by statement.
                # Tried to use np.mean, pylab.mean, from np import mean,
                # the "mean" string command, etc...
                # df = groups.agg({
                #        "pos":lambda x: x.min(),
                #        "cov": np.mean
                #    })
                df = pd.DataFrame(
                    {
                        "pos": groups.min()["pos"].astype(int),
                        "cov": groups.mean()["cov"],
                    }
                )

                if "gc" in self._df.columns:
                    df["gc"] = groups.mean()["gc"]

                df.index = df["pos"]
                # append
                if binned_df is not None:
                    binned_df = pd.concat([binned_df, df])
                else:
                    binned_df = df
                if N > 1:
                    pb.animate(i + 1)
            if N > 1:
                print()
            # used by __len__
            self._length = total_length
            self._df = binned_df.copy()
            self.binning = binning

            self.running_median(int(W / binning), circular=circular)
            self.compute_zscore(k=k, verbose=False)  # avoid repetitive warning
            # Only one set of ROIs, but we use the same logic as in the chunk
            # case, and store the rois/summary in the ChromosomeCovMultiChunk
            # structure
            rois = self.get_rois()
            if cnv_delta is not None and cnv_delta > 1:
                rois.merge_rois_into_cnvs(delta=cnv_delta)
            summary = self.get_summary()
            self.chunk_rois.append([summary, rois])

        results = ChromosomeCovMultiChunk(self.chunk_rois)

        self._rois = results.get_rois()

        self.bed._basic_stats[self.chrom_name] = {
            "DOC": self.DOC,
            "CV": self.CV,
            "length": len(self),
        }
        # duplicated call to results.get_rois() FIXME
        self.bed._rois[self.chrom_name] = results.get_rois()

        # self._html_list = self._html_list.union(f"{sample}/{sample}.cov.html")

        return results

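    # Typical chunked analysis (illustrative sketch, not part of the original
    # module; "large.bed" is a placeholder filename)::
    #
    #     gencov = SequanaCoverage("large.bed", chunksize=1_000_000)
    #     chrom = gencov[0]
    #     results = chrom.run(W=20001, k=2, circular=False)
    #     rois = results.get_rois()
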
    @property
    def df(self):
        if self._df is None:
            self.next()
        return self._df

    @property
    def rois(self):
        if self._rois is None:
            self._rois = self.get_rois()
        return self._rois

    @property
    def bed(self):
        return self._bed

    def get_gaussians(self):
        return "{0}: {1}".format(self.chrom_name, self.gaussians_params)

    def moving_average(self, n, circular=False):
        """Compute moving average of the genome coverage

        :param n: window's size. Must be odd
        :param bool circular: is the chromosome circular or not

        Store the results in the :attr:`df` attribute (dataframe) with a
        column named *ma*.

        """
        N = len(self.df["cov"])
        assert n < N / 2
        from sequana.stats import moving_average

        ret = np.cumsum(np.array(self.df["cov"]), dtype=float)
        ret[n:] = ret[n:] - ret[:-n]
        ma = ret[n - 1 :] / n
        mid = int(n / 2)
        self._df["ma"] = pd.Series(ma, index=np.arange(start=mid, stop=(len(ma) + mid)))

        if circular:
            # FIXME: shift of +-1 as compared to non circular case...
            # shift the data and compute the moving average
            self.data = (
                list(self.df["cov"].values[N - n :]) + list(self.df["cov"].values) + list(self.df["cov"].values[0:n])
            )
            ma = moving_average(self.data, n)
            self.ma = ma[n // 2 + 1 : -n // 2]
            self._df["ma"] = pd.Series(self.ma, index=self.df["cov"].index)

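    # The cumulative-sum trick above yields an n-point moving average in O(N):
    # ret[i] - ret[i-n] is the sum of the last n values, so dividing by n
    # gives each window's mean. Illustrative check (not part of the module)::
    #
    #     x = np.array([1, 2, 3, 4], dtype=float)    # n = 2
    #     ret = np.cumsum(x)
    #     ret[2:] = ret[2:] - ret[:-2]
    #     ret[1:] / 2                                # [1.5, 2.5, 3.5]
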
    def running_median(self, n, circular=False):
        """Compute running median of genome coverage

        :param int n: window's size.
        :param bool circular: if a mapping is circular (e.g. bacteria
            whole genome sequencing), set to True

        Store the results in the :attr:`df` attribute (dataframe) with a
        column named *rm*.

        .. versionchanged:: 0.1.21
            Use Pandas rolling function to speed up computation.

        """
        self._check_window(n)
        self.circular = circular
        self.window_size = n

        # in py2/py3 the division (integer or not) has no impact
        mid = int(self.window_size / 2)
        self.range = [None, None]
        try:
            if self.circular:
                # BASED on running_median pure implementation, could be much
                # slower than pure pandas rolling function. Keep those 4 lines
                # for book keeping though.
                # cover = list(self.df["cov"])
                # cover = cover[-mid:] + cover + cover[:mid]
                # rm = running_median.RunningMedian(cover, n).run()
                # self.df["rm"] = rm[mid:-mid]
                rm = (
                    pd.concat([self.df["cov"][-mid:], self.df["cov"], self.df["cov"][:mid]])
                    .rolling(n, center=True)
                    .median()
                )
                self._df["rm"] = rm[mid:-mid]

            else:
                rm = self.df["cov"].rolling(n, center=True).median()
                # Like in RunningMedian, we replace the NaN with real data
                rm[0:mid] = self.df["cov"][0:mid]
                rm[-mid:] = self.df["cov"][-mid:]
                # rm = running_median.RunningMedian(cover, n).run()

                self._df["rm"] = rm
                # set up slice for gaussian prediction
                self.range = [mid, -mid]
        except:
            self._df["rm"] = self.df["cov"]

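    # Descriptive note (not in the original): for circular genomes, `mid`
    # values from each end are prepended/appended before the rolling median so
    # that windows wrap around the origin; the padding is sliced off again
    # with rm[mid:-mid]. For linear genomes, the first and last `mid`
    # positions have no complete window and the raw coverage is copied there,
    # while self.range marks the trustworthy central slice.
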
    @property
    def DOC(self):
        """depth of coverage"""
        if self._DOC is None:
            self._DOC = self.df["cov"].mean()
        return self._DOC

    @property
    def STD(self):
        """standard deviation of the depth of coverage"""
        if self._STD is None:
            self._STD = self.df["cov"].std()
        return self._STD

    @property
    def CV(self):
        """The coefficient of variation (CV) is defined as sigma / mu"""
        if self._CV is None:
            if self.DOC == 0:
                self._CV = np.nan
            else:
                self._CV = self.STD / self.DOC
        return self._CV

    @property
    def BOC(self):
        """breadth of coverage"""
        if self._BOC is None:
            self._BOC = 100 * (1 - (self.df["cov"] == 0).sum() / float(len(self.df)))
        return self._BOC

    @property
    def C3(self):
        if self._C3 is None:
            self._C3 = self.get_centralness(3)
        return self._C3

    @property
    def C4(self):
        if self._C4 is None:
            self._C4 = self.get_centralness(4)
        return self._C4

    @property
    def evenness(self):
        """Return evenness of the coverage

        :Reference: Konrad Oexle, Journal of Human Genetics 2016, Evaluation
            of the evenness score in NGS.

        Can be computed before or after normalisation, but the two lead to
        different results.

        """
        if self._evenness is None:
            try:
                self._evenness = evenness(self.df["cov"])
            except:
                self._evenness = 0
        return self._evenness

    def _coverage_scaling(self):
        """Normalize data with moving average of coverage

        Store the results in the :attr:`df` attribute (dataframe) with a
        column named *scale*.

        .. note:: Needs to call :meth:`running_median`

        """
        if "rm" not in self.df.columns:
            txt = "Column rm (running median) is missing.\n" + self.__doc__
            print(txt)
            raise KeyError
        else:
            self._df["scale"] = self.df["cov"] / self.df["rm"]
        self._df = self.df.replace(np.inf, np.nan)
        self._df = self.df.replace(-np.inf, np.nan)

    def _get_best_gaussian(self):
        results_pis = [model["pi"] for model in self.gaussians_params]
        indice = np.argmax(results_pis)
        return self.gaussians_params[indice]

    def compute_zscore(self, k=2, use_em=True, clip=4, verbose=True, force_models=None):
        """Compute zscore of coverage and normalized coverage.

        :param int k: number of Gaussians in the fitted mixture (default = 2)
        :param use_em: use the Expectation-Maximization (EM) algorithm
        :param float clip: ignore values above the clip threshold
        :param bool force_models: if set, the fitted models are ignored and
            replaced with 2 Gaussian models where the main model has a mean of 1
            and represents 90% of the data. Useful to override the normal
            behavior.

        Store the results in the :attr:`df` attribute (dataframe) with a
        column named *zscore*.

        .. note:: needs to call :meth:`running_median` beforehand.

        """
        # here for lazy import
        from sequana import mixture

        # normalize coverage
        self._coverage_scaling()

        # ignore start and end (corrupted due to running median window)
        # the slice does not seem to work as a copy, hence the copy()
        data = self.df["scale"][self.range[0] : self.range[1]].copy()

        # remove zero, nan and inf values and ignore values above 4 that would
        # bias the estimation of the central distribution
        data[data > 4] = 0
        data = data.replace(0, np.nan)
        data = data.dropna()

        if data.empty:  # pragma: no cover
            self._df["scale"] = np.ones(len(self.df), dtype=int)
            self._df["zscore"] = np.zeros(len(self.df), dtype=int)
            # define arbitrary values
            self.gaussians_params = [
                {"mu": 0.5, "pi": 0.15, "sigma": 0.1},
                {"mu": 1, "pi": 0.85, "sigma": 0.1},
            ]
            self.best_gaussian = self._get_best_gaussian()

            return

        # if len(data) > 100,000, select 100,000 data points randomly
        if len(data) > 100000:
            indices = random.sample(range(len(data)), 100000)
            data = [data.iloc[i] for i in indices]

        if force_models:
            self.gaussians_params = [{"mu": 1, "sigma": 0.2, "pi": 0.9}, {"mu": 2, "sigma": 0.2, "pi": 0.1}]
            self.best_gaussian = self._get_best_gaussian()
            # dummy value not equal to zero
            self.gaussians = {"sigmas": [0.2, 0.2], "mus": [1, 2], "pis": [0.9, 0.1]}
        else:
            if use_em:
                self.mixture_fitting = mixture.EM(data)
                self.mixture_fitting.estimate(k=k)
            else:
                self.mixture_fitting = mixture.GaussianMixtureFitting(data, k=k)
                self.mixture_fitting.estimate()

            # keep gaussians information
            self.gaussians = self.mixture_fitting.results
            params_key = ("mus", "sigmas", "pis")
            self.gaussians_params = [{key[:-1]: self.gaussians[key][i] for key in params_key} for i in range(k)]
            self.best_gaussian = self._get_best_gaussian()

        # warning when sigma is equal to 0
        if self.best_gaussian["sigma"] == 0:
            logger.warning("A problem related to the gaussian prediction was detected. Be careful, sigma is equal to 0.")
            self._df["zscore"] = np.zeros(len(self.df), dtype=int)
        else:
            self._df["zscore"] = (self.df["scale"] - self.best_gaussian["mu"]) / self.best_gaussian["sigma"]

        # Naive check that the 2 models are sensible (mus are different)
        if k == 2:
            mus = self.gaussians["mus"]
            sigmas = self.gaussians["sigmas"]

            index0 = mus.index(self.best_gaussian["mu"])
            if index0 == 0:
                mu1 = mus[1]
                s1 = sigmas[1]
                s0 = sigmas[0]
                mu0 = mus[0]
            else:
                mu1 = mus[0]
                s1 = sigmas[0]
                s0 = sigmas[1]
                mu0 = mus[1]
            if abs(mu0 - mu1) < s0:
                if verbose:
                    logger.warning(
                        (
                            "\nFound mixture model parameters (k=2) where "
                            "|mu0-mu1| < sigma0. k=1 could be a better choice. "
                            "mu0={}, mu1={}, sigma0={}, sigma1={}".format(mu0, mu1, s0, s1)
                        )
                    )

        # If coverage is zero, clearly, this is the lowest bound.
        # Yet, for low depth of coverage (e.g. around 5), a deleted
        # gene may have a zscore between 0 and 3, which is considered noise.
        # If we have deleted areas, we want to make sure that the zscore is
        # large enough that it can be considered as an event.

        # Here, we set the zscore value to at least the value of the threshold

        self._df["zscore"] = self._df["zscore"].fillna(0)

        floor = self.df["cov"] == 0  # faster than query
        newvalues = self.df.loc[floor, "zscore"].apply(lambda x: min(self.thresholds.low - 0.01, x))
        self._df.loc[floor, "zscore"] = newvalues

        # finally, since we computed the zscore, rois must be recomputed
        self._rois = None

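    # Descriptive note (not in the original): the zscore follows
    # z = (cov/rm - mu) / sigma, with (mu, sigma) taken from the dominant
    # Gaussian of the mixture fitted to the normalised coverage. With
    # hypothetical values mu=1 and sigma=0.1, a position covered at twice its
    # running median (scale=2) gets z = (2 - 1) / 0.1 = 10.
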
    def get_centralness(self, threshold=3):
        """Proportion of central (normal) genome coverage

        This is 1 - (number of non normal data) / (total length)

        .. note:: depends on the thresholds attribute being used.
        .. note:: depends slightly on :math:`W` the running median window
        """
        # keep original thresholds
        temp_low = self.thresholds.low
        temp_high = self.thresholds.high

        self.thresholds.low = -threshold
        self.thresholds.high = threshold

        try:
            filtered = self.get_rois()
        except Exception as err:  # pragma: no cover
            raise (err)
        finally:
            # restore the original thresholds
            self.thresholds.low = temp_low
            self.thresholds.high = temp_high
        Cplus = (filtered.get_high_rois()["size"]).sum()
        Cminus = (filtered.get_low_rois()["size"]).sum()
        Centralness = 1 - (Cplus + Cminus) / float(len(self))

        return Centralness

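    # Worked example (hypothetical numbers, not from the original): on a
    # 1,000,000 bp chromosome with 20,000 bp in high ROIs and 30,000 bp in
    # low ROIs at threshold 3, the centralness is
    # 1 - (20000 + 30000) / 1000000 = 0.95.
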
    def get_rois(self):
        """Keep positions with zscore outside of the thresholds range.

        :return: a dataframe from :class:`FilteredGenomeCov`

        .. note:: depends on the :attr:`thresholds` low and high values.
        """
        if "zscore" not in self.df.columns:
            logger.critical(
                (
                    "you must call running_median and compute_zscore. "
                    "Alternatively, the run() method performs the two steps"
                    " at once"
                )
            )
            raise Exception
        features = self.bed.feature_dict

        try:
            second_high = self.thresholds.high2
            second_low = self.thresholds.low2
            query = "zscore > @second_high or zscore < @second_low"

            # in the genbank, the names appear as e.g. JB12345
            # but in the fasta or BED files, it may be something like
            # gi|269939526|emb|FN433596.1|
            # so they do not match. We can try to guess it
            alternative = None

            if features:
                if self.chrom_name not in features.keys():

                    alternative = [x for x in self.chrom_name.split("|") if x]
                    alternative = alternative[-1]  # assume the accession is last
                    alternative = alternative.split(".")[0]  # remove version
                    if alternative not in features.keys():
                        msg = """Chromosome name (%s) not found
                            in the genbank. Make sure the chromosome names in
                            the BAM/BED files are compatible with the genbank
                            content. The genbank file contains the following keys """
                        for this in features.keys():
                            msg += f"\n                        - {this}"
                        logger.warning(msg % self.chrom_name)

            data = self.df.query(query)
            data.insert(0, "chr", self.chrom_name)

            if features:
                if alternative:
                    return FilteredGenomeCov(data, self.thresholds, features[alternative], step=self.binning)
                else:
                    return FilteredGenomeCov(
                        data,
                        self.thresholds,
                        features[self.chrom_name],
                        step=self.binning,
                    )
            else:
                return FilteredGenomeCov(data, self.thresholds, step=self.binning)
        except KeyError:  # pragma: no cover
            logger.error(
                "Column zscore is missing in the data frame.\n"
                "You must run compute_zscore before extracting the low coverage regions."
                "\n\n",
                self.__doc__,
            )
            sys.exit(1)

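    # Descriptive note (not in the original): the query above filters with the
    # *secondary* thresholds (low2/high2) so that candidate positions around a
    # main-threshold crossing are retained; the stricter main thresholds are
    # presumably applied downstream when FilteredGenomeCov assembles events
    # (the double-threshold method, cf. DoubleThresholds).
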
1179
    def plot_rois(
3✔
1180
        self,
1181
        x1,
1182
        x2,
1183
        set_ylimits=False,
1184
        rois=None,
1185
        fontsize=16,
1186
        color_high="r",
1187
        color_low="g",
1188
        clf=True,
1189
    ):
1190
        if rois is None:
3✔
1191
            rois = self.rois
3✔
1192

1193
        high = rois.get_high_rois().query("end>@x1 and start<@x2")
3✔
1194
        low = rois.get_low_rois().query("end>@x1 and start<@x2")
3✔
1195

1196
        self.plot_coverage(
3✔
1197
            x1=x1,
1198
            x2=x2,
1199
            set_ylimits=set_ylimits,
1200
            sample=False,
1201
            fontsize=fontsize,
1202
            clf=clf,
1203
        )
1204

1205
        for start, end, cov in zip(high.start, high.end, high.mean_cov):
3✔
1206
            pylab.plot([start, end], [cov, cov], lw=2, color=color_high, marker="o")
3✔
1207

1208
        for start, end, cov in zip(low.start, low.end, low.mean_cov):
3✔
1209
            pylab.plot([start, end], [cov, cov], lw=2, color=color_low, marker="o")
3✔
1210
        pylab.xlim([x1, x2])
3✔
1211

1212
    def plot_coverage(
3✔
1213
        self,
1214
        filename=None,
1215
        fontsize=16,
1216
        rm_lw=1,
1217
        rm_color="#0099cc",
1218
        rm_label="Running median",
1219
        th_lw=1,
1220
        th_color="r",
1221
        th_ls="--",
1222
        main_color="k",
1223
        main_lw=1,
1224
        main_kwargs={},
1225
        sample=True,
1226
        set_ylimits=True,
1227
        x1=None,
1228
        x2=None,
1229
        clf=True,
1230
    ):
1231
        """Plot coverage as a function of base position.
1232

1233
        :param filename:
1234
        :param rm_lw: line width of the running median
1235
        :param rm_color: line color of the running median
1236
        :param rm_color: label for the running median
1237
        :param th_lw: line width of the thresholds
1238
        :param th_color: line color of the thresholds
1239
        :param main_color: line color of the coverage
1240
        :param main_lw: line width of the coverage
1241
        :param sample: if there are more than 1 000 000 points, we
1242
            use an integer step to skip data points. We can still plot
1243
            all points at your own risk by setting this option to False
1244

1245
        :param set_ylimits: we want to focus on the "normal" coverage ignoring
1246
            unsual excess. To do so, we set the yaxis range between 0 and a
1247
            maximum value. This maximum value is set to the minimum between the
1248
            10 times the mean coverage and 1.5 the maximum of the high coverage
1249
            threshold curve. If you want to let the ylimits free, set this
1250
            argument to False
1251
        :param x1: restrict lower x value to x1
1252
        :param x2: restrict lower x value to x2 (x2 must be greater than x1)
1253

1254
        .. note:: if there are more than 1,000,000 points, we show only
1255
            1,000,000 by points. For instance for 5,000,000 points,
1256

1257
        In addition to the coverage, the running median and coverage confidence
1258
        corresponding to the lower and upper  zscore thresholds are shown.
1259

1260
        .. note:: uses the thresholds attribute.
1261
        """
1262
        # z = (X/rm - \mu ) / sigma
1263
        # some view to restrict number of points to look at:
1264
        if x1 is None:
3✔
1265
            x1 = self.df.iloc[0].name
3✔
1266
        if x2 is None:
3✔
1267
            x2 = self.df.iloc[-1].name
3✔
1268
        assert x1 < x2
3✔
1269

1270
        df = self.df.loc[x1:x2]
3✔
1271

1272
        high_zcov = (self.thresholds.high * self.best_gaussian["sigma"] + self.best_gaussian["mu"]) * df["rm"]
3✔
1273
        low_zcov = (self.thresholds.low * self.best_gaussian["sigma"] + self.best_gaussian["mu"]) * df["rm"]
3✔
1274

1275
        if clf:
3✔
1276
            pylab.clf()
3✔
1277
        ax = pylab.gca()
3✔
1278
        ax.set_facecolor("#eeeeee")
3✔
1279
        try:
3✔
1280
            pylab.xlim(df["pos"].iloc[0], df["pos"].iloc[-1])
3✔
UNCOV
1281
        except IndexError:
×
1282
            pass
×
1283
        axes = []
3✔
1284
        labels = []
3✔
1285

1286
        # 1,000,000 points is already a lot for matplotlib. Let us restrict
1287
        # the number of points to a million for now.
1288
        if len(df) > 1000000 and sample is True:
3✔
UNCOV
1289
            logger.info("sub sample data for plotting the coverage")
×
UNCOV
1290
            NN = int(len(df) / 1000000)
×
1291
        else:  # pragma: no cover
1292
            NN = 1
1293

1294
        # the main coverage plot
1295
        (p1,) = pylab.plot(
3✔
1296
            df["cov"][::NN],
1297
            color=main_color,
1298
            label="Coverage",
1299
            linewidth=main_lw,
1300
            **main_kwargs,
1301
        )
1302
        axes.append(p1)
3✔
1303
        labels.append("Coverage")
3✔
1304

1305
        # The running median plot
1306
        if rm_lw > 0:
3✔
1307
            (p2,) = pylab.plot(df["rm"][::NN], color=rm_color, linewidth=rm_lw, label=rm_label)
3✔
1308
            axes.append(p2)
3✔
1309
            labels.append(rm_label)
3✔
1310

1311
        # The threshold curves
1312
        if th_lw > 0:
3✔
1313
            (p3,) = pylab.plot(
3✔
1314
                high_zcov[::NN],
1315
                linewidth=th_lw,
1316
                color=th_color,
1317
                ls=th_ls,
1318
                label="Thresholds",
1319
            )
1320
            (p4,) = pylab.plot(
3✔
1321
                low_zcov[::NN],
1322
                linewidth=th_lw,
1323
                color=th_color,
1324
                ls=th_ls,
1325
                label="_nolegend_",
1326
            )
1327
            axes.append(p3)
3✔
1328
            labels.append("Thresholds")
3✔
1329

1330
        pylab.legend(axes, labels, loc="best")
3✔
1331
        pylab.xlabel("Position", fontsize=fontsize)
3✔
1332
        pylab.ylabel("Per-base coverage", fontsize=fontsize)
3✔
1333
        pylab.grid(True)
3✔
1334

1335
        # sometimes there are large coverage value that squeeze the plot.
1336
        # Let us restrict it. We can say that 10 times the average coverage is
1337
        # the maximum. We can save the original plot and the squeezed one.
1338

1339
        if set_ylimits is True:
3✔
1340
            # m1 = high_zcov.max(skipna=True)
1341
            m4 = high_zcov[high_zcov > 0]
3✔
1342
            if len(m4) == 0:
3✔
UNCOV
1343
                m4 = 3
×
1344
            else:
1345
                m4 = m4.max(skipna=True)
3✔
1346
            # ignore values equal to zero to compute mean average
1347
            m3 = df[df["cov"] > 0]["cov"].mean()
3✔
1348

1349
            pylab.ylim([0, min([m4 * 2, m3 * 10])])
3✔
1350
        else:
1351
            pylab.ylim([0, pylab.ylim()[1]])
3✔
1352

1353
        try:
3✔
1354
            pylab.gcf().set_layout_engine("tight")
3✔
1355
        except:  # pragma: no cover
1356
            pass
1357

1358
        if filename:
3✔
1359
            pylab.savefig(filename)
3✔
1360

    def _set_bins(self, df, binwidth):
        try:
            bins = np.arange(min(df), max(df) + binwidth, binwidth)
        except ValueError:
            return 100
        if bins.any():
            return bins
        return 100

    def plot_hist_zscore(self, fontsize=16, filename=None, max_z=6, binwidth=0.5, **hist_kwargs):
        """Histogram of the z-score values"""
        pylab.clf()
        bins = self._set_bins(self.df["zscore"][self.range[0] : self.range[1]], binwidth)
        self.df["zscore"][self.range[0] : self.range[1]].hist(grid=True, bins=bins, **hist_kwargs)
        pylab.xlabel("Z-score", fontsize=fontsize)
        try:
            pylab.gcf().set_layout_engine("tight")
        except Exception:  # pragma: no cover
            pass
        pylab.xlim([-max_z, max_z])
        if filename:
            pylab.savefig(filename)

    def plot_hist_normalized_coverage(self, filename=None, binwidth=0.05, max_z=3):
        """Histogram of the normalized coverage with Gaussian fitting"""
        pylab.clf()
        # if there is a NaN, the binning cannot be set up
        d = self.df["scale"][self.range[0] : self.range[1]].dropna()
        # remove outliers; the plot crashes if the range between min and max is too large
        d = d[np.abs(d - d.mean()) <= (4 * d.std())]
        bins = self._set_bins(d, binwidth)
        try:
            self.mixture_fitting.data = d
            self.mixture_fitting.plot(self.gaussians_params, bins=bins, Xmin=0, Xmax=max_z)
        except (AttributeError, ZeroDivisionError):  # pragma: no cover
            return

        pylab.grid(True)
        pylab.xlim([0, max_z])
        pylab.xlabel("Normalised per-base coverage")

        try:
            pylab.gcf().set_layout_engine("tight")
        except Exception:  # pragma: no cover
            pass
        if filename:
            pylab.savefig(filename)

    def plot_hist_coverage(
        self,
        logx=True,
        logy=True,
        fontsize=16,
        N=25,
        fignum=1,
        hold=False,
        alpha=0.8,
        ec="k",
        filename=None,
        zorder=10,
        **kw_hist,
    ):
        """Histogram of the per-base coverage values.

        :param bool logx: use log-spaced bins and a logarithmic x-axis.
        :param bool logy: use a logarithmic y-axis.
        :param int N: number of bins.
        :param ec: edge color of the bars (any matplotlib color).
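
        A minimal usage sketch (``chrom`` being a :class:`ChromosomeCov`
        instance on which the coverage analysis has been run; the filename is
        illustrative)::

            chrom.plot_hist_coverage(logx=True, logy=True, N=50,
                filename="hist_coverage.png")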
        """
        if hold is False:
            pylab.figure(fignum)
            pylab.clf()
        ax = pylab.gca()
        ax.set_facecolor("#eeeeee")

        data = self.df["cov"].dropna().values

        maxcov = data.max()
        if logx is True and logy is True:
            bins = pylab.logspace(0, pylab.log10(maxcov), N)
            pylab.hist(
                data,
                bins=bins,
                log=True,
                label=self.chrom_name,
                alpha=alpha,
                ec=ec,
                zorder=zorder,
                **kw_hist,
            )
            pylab.semilogx()
            pylab.xlabel("Coverage (log scale)", fontsize=fontsize)
            pylab.ylabel("Count (log scale)", fontsize=fontsize)
        elif logx is False and logy is True:
            pylab.hist(
                data,
                bins=N,
                log=True,
                label=self.chrom_name,
                alpha=alpha,
                ec=ec,
                zorder=zorder,
                **kw_hist,
            )
            pylab.xlabel("Coverage", fontsize=fontsize)
            pylab.ylabel("Count (log scale)", fontsize=fontsize)
        elif logx is True and logy is False:
            bins = pylab.logspace(0, pylab.log10(maxcov), N)
            pylab.hist(
                data,
                bins=bins,  # use the log-spaced bins computed above
                label=self.chrom_name,
                alpha=alpha,
                zorder=zorder,
                ec=ec,
                **kw_hist,
            )
            pylab.xlabel("Coverage (log scale)", fontsize=fontsize)
            pylab.ylabel("Count", fontsize=fontsize)
            pylab.semilogx()
        else:
            pylab.hist(
                data,
                bins=N,
                label=self.chrom_name,
                alpha=alpha,
                zorder=zorder,
                ec=ec,
                **kw_hist,
            )
            pylab.xlabel("Coverage", fontsize=fontsize)
            pylab.ylabel("Count", fontsize=fontsize)
        pylab.grid(True)
        if filename:
            pylab.savefig(filename)

    def to_csv(self, filename=None, start=None, stop=None, **kwargs):
        """Write the dataframe to a CSV file.

        :param str filename: CSV output filename. If None, return the CSV as a string.
        :param int start: start row index.
        :param int stop: stop row index.

        Parameters forwarded to :meth:`pandas.DataFrame.to_csv`:

        :param list columns: columns you want to write.
        :param bool header: determine if the header is written.
        :param bool index: determine if the index is written.
        :param str float_format: determine the float format.
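
        A minimal usage sketch (``chrom`` being a :class:`ChromosomeCov`
        instance; the filename and columns are illustrative)::

            chrom.to_csv("coverage.csv", columns=["pos", "cov", "zscore"])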
        """
        # Create the output directory to avoid errno 2
        if filename:
            directory = os.path.dirname(os.path.realpath(filename))
            try:
                os.makedirs(directory)
            except FileExistsError:
                if os.path.isdir(directory):
                    pass
                else:
                    msg = f"{directory} exists but is not a directory"
                    logger.error(msg)
                    raise FileExistsError(msg)
        return self.df.loc[start:stop].to_csv(filename, **kwargs)

    def plot_gc_vs_coverage(
        self,
        filename=None,
        bins=None,
        Nlevels=None,
        fontsize=20,
        norm="log",
        ymin=0,
        ymax=100,
        contour=True,
        cmap="BrBG",
        **kwargs,
    ):
        """Plot a 2D histogram of the GC content versus the coverage.
        """

        if Nlevels is None or Nlevels == 0:
            contour = False

        data = self.df[["cov", "gc"]].copy()
        data["gc"] *= 100
        data = data.dropna()
        if bins is None:
            bins = [
                100,
                min(
                    int(data["gc"].max() - data["gc"].min() + 1),
                    max(5, self.bed.gc_window_size - 10),
                ),
            ]
            bins[0] = max(10, int(min(bins[0], self.df["cov"].max())))

        # FIXME jan 2018 there is currently an annoying warning in Hist2D
        import warnings

        with warnings.catch_warnings():
            # warnings.simplefilter("ignore")
            from sequana.viz import Hist2D

            h2 = Hist2D(data)

            try:
                h2.plot(
                    bins=bins,
                    xlabel="Per-base coverage",
                    ylabel=r"GC content (%)",
                    Nlevels=Nlevels,
                    contour=contour,
                    norm=norm,
                    fontsize=fontsize,
                    cmap=cmap,
                    **kwargs,
                )
            except Exception:  # pragma: no cover
                h2.plot(
                    bins=bins,
                    xlabel="Per-base coverage",
                    ylabel=r"GC content (%)",
                    cmap=cmap,
                    Nlevels=Nlevels,
                    contour=False,
                    norm=norm,
                    fontsize=fontsize,
                    **kwargs,
                )

        pylab.ylim([ymin, ymax])
        try:
            pylab.gcf().set_layout_engine("tight")
        except Exception:  # pragma: no cover
            pass

        if filename:
            pylab.savefig(filename)

    def get_gc_correlation(self):
        """Return the correlation between the coverage and the GC content.

        The GC content is the one computed in
        :meth:`SequanaCoverage.compute_gc_content` (default window size is 101).
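
        A minimal usage sketch (assuming the reference FASTA was provided so
        that the GC content could be computed)::

            corr = chrom.get_gc_correlation()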
        """
        self.df["gc"] = self._gc_content[self.df.index[0] - 1 : self.df.index[-1]]
        C = self.df[["cov", "gc"]].corr().iloc[0, 1]
        del self.df["gc"]
        gc.collect()
        return C

    def _get_hist_data(self, bins=30):
        data = self.df["cov"].dropna()
        m = data.quantile(0.01)
        M = data.quantile(0.99)
        # start with a bin width of 1 and double it until we get a
        # reasonable number of bins (at most 150)
        step = 1
        bins = pylab.arange(m, M, step)

        while len(bins) > 150:
            step *= 2
            bins = pylab.arange(m, M, step)

        try:
            Y, X = np.histogram(data, bins=bins, density=True)
            return {"X": list(X[1:]), "Y": list(Y)}
        except Exception:  # pragma: no cover
            return {"X": [], "Y": []}

    def get_summary(self, C3=None, C4=None, stats=None, caller="sequana.bedtools"):
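        """Build a :class:`~sequana.summary.Summary` of the coverage analysis.

        :param C3: not used in this implementation; the :attr:`C3` attribute
            is used instead.
        :param C4: not used in this implementation; the :attr:`C4` attribute
            is used instead.
        :param stats: a dictionary as returned by :meth:`get_stats`;
            computed on the fly if None.
        :param str caller: name of the caller stored in the summary.
        """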
        if stats is None:
            stats = self.get_stats()

        ROI = len(self.rois)
        ROIlow = len(self.rois.get_low_rois())
        ROIhigh = len(self.rois.get_high_rois())

        fit = self._get_best_gaussian()

        d = {
            "evenness": round(self.evenness, 4),
            "C3": round(self.C3, 4),
            "C4": round(self.C4, 4),
            "BOC": self.BOC,
            "length": len(self),
            "DOC": self.DOC,
            "CV": self.CV,
            "chrom_name": self.chrom_name,
            "ROI": ROI,
            "ROI(low)": ROIlow,
            "ROI(high)": ROIhigh,
            "fit_mu": fit["mu"],
            "fit_sigma": fit["sigma"],
            "fit_pi": fit["pi"],
            "hist_coverage": self._get_hist_data(),
        }
        if "gc" in stats:
            d["GC"] = stats["gc"]

        # The sample name is the filename; the chrom name is the chromosome
        # or contig name. Since v0.8.0, the .bed extension is stripped.
        sample_name = os.path.basename(self._bed.input_filename.replace(".bed", ""))
        summary = Summary("coverage", sample_name=sample_name, data=d, caller=caller)

        summary.data_description = {
            "BOC": "Breadth of Coverage",
            "DOC": "Depth of Coverage",
            "chrom_name": "name of the contig/chromosome",
            "CV": "coefficient of variation of the DOC",
            "ROI": "number of regions of interest found",
            "C3": "Centralness (1 - ratio outliers by genome length) using zscore of 3",
            "C4": "Centralness (1 - ratio outliers by genome length) using zscore of 4",
            "length": "length of the chromosome",
        }
        if "gc" in stats:
            summary.data_description["GC"] = "GC content in %"

        return summary

    def get_stats(self):
        """Return basic statistics about the coverage data.

        Only the "cov" column is required.

        :return: dictionary with the DOC, its standard deviation and median,
            the BOC, CV, MAD, the GC content (if available) and the
            chromosome length.
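
        A minimal usage sketch (``chrom`` being a :class:`ChromosomeCov`
        instance after the coverage analysis has been run)::

            stats = chrom.get_stats()
            print(stats["DOC"], stats["BOC"])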
        """
        MedianCOV = self.df["cov"].median()

        stats = {
            "DOC": self.DOC,  # depth of coverage
            "STD": self.STD,  # standard deviation of the DOC
            "Median": MedianCOV,  # median of the DOC
            "BOC": self.BOC,  # breadth of coverage
        }

        # coefficient of variation
        stats["CV"] = self.CV

        # median absolute deviation (MAD)
        stats["MAD"] = np.median(abs(MedianCOV - self.df["cov"]).dropna())

        # GC content
        if "gc" in self.df.columns:
            stats["GC"] = self.df["gc"].mean() * 100

        stats["length"] = len(self)

        return stats

    def _compute_gc_content(self, ws=None):
        if self.bed.reference_file is None:
            return

        fasta = pysam.FastxFile(self.bed.reference_file)
        chrom_gc_content = dict()

        if ws is None:
            gc_window_size = self.bed.gc_window_size
        else:
            gc_window_size = ws

        mid = int(gc_window_size / 2.0)

        for chrom in fasta:
            if chrom.name == self.chrom_name or chrom.name.split(".")[0] == self.chrom_name:
                # Create gc_content array
                gc_content = np.empty(len(chrom.sequence))
                gc_content[:] = np.nan
                if self.bed.circular:
                    chrom.sequence = chrom.sequence[-mid:] + chrom.sequence + chrom.sequence[:mid]
                    # Does not shift index of array
                    mid = 0

                # Count first window content
                counter = Counter(chrom.sequence[0:gc_window_size])
                gc_count = 0
                for letter in "GCgc":
                    gc_count += counter[letter]

                gc_content[mid] = gc_count

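                # Sliding-window update: rather than recounting the whole
                # window at every position, drop the leftmost base and add the
                # incoming one, keeping the computation linear in the sequence
                # length.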
                for i in range(1, len(chrom.sequence) - gc_window_size + 1):
                    if chrom.sequence[i - 1] in "GCgc":
                        gc_count -= 1
                    if chrom.sequence[i + gc_window_size - 1] in "GCgc":
                        gc_count += 1
                    gc_content[i + mid] = gc_count
                chrom_gc_content[chrom.name] = gc_content / gc_window_size

        # if the accession was processed by snpeff, the trailing version may be missing
        try:
            self._gc_content = chrom_gc_content[self.chrom_name]
        except KeyError:
            logger.warning(
                f"chromosome name {self.chrom_name} from the BAM/BED does not match the chromosome names in the reference"
            )


class FilteredGenomeCov(object):
    """Select a subset of :class:`ChromosomeCov`

    :target: developers only
    """

    _feature_not_wanted = {"regulatory", "source"}

    def __init__(
        self,
        df,
        threshold,
        feature_list=None,
        step=1,
        apply_threshold_after_merging=True,
    ):
        """.. rubric:: constructor

        :param df: dataframe of filtered positions used within
            :class:`SequanaCoverage`. Must contain the following columns:
            ["pos", "cov", "rm", "zscore"]
        :param threshold: a :class:`~sequana.bedtools.DoubleThresholds`
            instance.
        :param apply_threshold_after_merging: see :meth:`merge_rois_into_cnvs`.
        """
        self.rawdf = df.copy()
        self.thresholds = threshold
        self.apply_threshold_after_merging = apply_threshold_after_merging

        if isinstance(feature_list, list) and len(feature_list) == 0:
            feature_list = None

        self.feature_list = feature_list

        self.step = step
        region_list = self._merge_region()

        if self.feature_list:
            region_list = self._add_annotation(region_list, self.feature_list)

        self.df = self._dict_to_df(region_list, self.feature_list)

        def func(x):
            try:
                return x.split(".")[0]
            except AttributeError:
                return x

        for column in ["gene_end", "gene_start"]:
            if column in self.df.columns:
                self.df[column] = self.df[column].astype(str)
                self.df[column] = self.df[column].apply(func)

    def __iadd__(self, other):
        # DataFrame.append was removed in pandas 2.x; concatenate instead
        self.df = pd.concat([self.df, other.df])
        return self

    def __str__(self):
        return self.df.__str__()

    def __len__(self):
        return self.df.__len__()

    def _merge_row(self, start, stop, chrom=None):
        df = self.rawdf

        chrom = df["chr"][start]

        cov = np.mean(df["cov"].loc[start:stop])
        max_cov = np.max(df["cov"].loc[start:stop])
        rm = np.mean(df["rm"].loc[start:stop])
        zscore = np.mean(df["zscore"].loc[start:stop])
        if zscore >= 0:
            max_zscore = df["zscore"].loc[start:stop].max()
        else:
            max_zscore = df["zscore"].loc[start:stop].min()
        size = stop - start + 1

        def log2ratio(mean_cov, rm):
            if rm != 0 and mean_cov != 0:
                return np.log2(mean_cov / rm)
            else:
                return np.nan

        return {
            "chr": chrom,
            "start": start,
            "end": stop + 1,
            "size": size,
            "mean_cov": cov,
            "mean_rm": rm,
            "mean_zscore": zscore,
            "log2_ratio": log2ratio(cov, rm),
            "max_zscore": max_zscore,
            "max_cov": max_cov,
        }

    def _merge_region(self, zscore_label="zscore"):
        """Cluster regions within a dataframe.

        Uses the double-threshold method defined by :attr:`thresholds`.
        """
        region_start = None
        region_stop = None
        start = 1
        stop = 1
        prev = 1
        # handle the case where, for example, position n-1 has a zscore of -5
        # and position n has a zscore of 5: these are two different regions
        region_zscore = 0

        merge_df = []
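        # Double-threshold walk: self.rawdf is expected to contain only the
        # positions beyond the secondary thresholds; a region is opened and
        # extended by positions beyond the main thresholds, while contiguous
        # positions are grouped together only if their zscores share the
        # same sign.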
        for pos, zscore in zip(self.rawdf["pos"], self.rawdf[zscore_label]):
            stop = pos
            if stop - self.step == prev and zscore * region_zscore >= 0:
                prev = stop
            else:
                if region_start:
                    merge_df.append(self._merge_row(region_start, region_stop))
                    region_start = None
                start = stop
                prev = stop
                region_zscore = zscore

            if zscore > 0 and zscore > self.thresholds.high:
                if not region_start:
                    region_start = pos
                    region_stop = pos
                else:
                    region_stop = pos
            elif zscore < 0 and zscore < self.thresholds.low:
                if not region_start:
                    region_start = pos
                    region_stop = pos
                else:
                    region_stop = pos

        if start < stop and region_start:
            merge_df.append(self._merge_row(region_start, region_stop))
        return merge_df

    def _add_annotation(self, region_list, feature_list):
        """Add annotations from a dictionary generated by the parsers in
        sequana.tools.
        """
        region_ann = []
        # an iterator over the features
        iter_feature = iter(feature_list)
        feature = next(iter_feature)
        # skip unwanted features such as "source"
        while feature["type"] in FilteredGenomeCov._feature_not_wanted:
            try:
                feature = next(iter_feature)
            except StopIteration:  # pragma: no cover
                print(
                    "Feature types ({0}) are not present in the annotation"
                    " file. Please change the requested types".format(feature["type"])
                )
                return region_ann

        # merge regions and annotations
        for region in region_list:
            feature_exist = False
            while feature["gene_end"] <= region["start"]:
                try:
                    feature = next(iter_feature)
                except StopIteration:
                    break
            while feature["gene_start"] < region["end"]:
                # a feature exists for the detected ROI
                feature_exist = True
                for item in ["gene", "product", "gene_name", "gene_id"]:
                    if item not in feature:
                        feature[item] = "not avail."

                region_ann.append(dict(region, **feature))
                try:
                    feature = next(iter_feature)
                except StopIteration:
                    break

            if not feature_exist:
                region_ann.append(
                    dict(
                        region,
                        **{
                            "gene_start": None,
                            "gene_end": None,
                            "type": None,
                            "gene": None,
                            "strand": None,
                            "product": None,
                        },
                    )
                )
        return region_ann

    def _dict_to_df(self, region_list, annotation):
        """Convert a list of region dictionaries into a dataframe."""
        colnames = [
            "chr",
            "start",
            "end",
            "size",
            "mean_cov",
            "max_cov",
            "mean_rm",
            "mean_zscore",
            "max_zscore",
            "log2_ratio",
            "gene_start",
            "gene_end",
            "type",
            "gene",
            "gene_name",
            "gene_id",
            "strand",
            "product",
        ]
        if not annotation:
            colnames = colnames[:10]
        merge_df = pd.DataFrame(region_list, columns=colnames)
        int_column = ["start", "end", "size"]
        merge_df[int_column] = merge_df[int_column].astype(int)
        if annotation:
            # maybe let the user decide which feature types to keep
            return merge_df.loc[~merge_df["type"].isin(FilteredGenomeCov._feature_not_wanted)]
        return merge_df

    def _get_sub_range(self, seq_range):
        try:
            return self.df[(self.df["end"] > seq_range[0]) & (self.df["start"] < seq_range[1])]
        except TypeError:
            return self.df

    def get_low_rois(self, seq_range=None):
        df = self._get_sub_range(seq_range)
        return df.loc[df["max_zscore"] < 0]

    def get_high_rois(self, seq_range=None):
        df = self._get_sub_range(seq_range)
        return df.loc[df["max_zscore"] >= 0]

    def merge_rois_into_cnvs(self, delta=1000):
        """Merge ROIs that are close to each other into CNV-like events.

        ::

            E-E-E---------E-------E-E

        should be clustered as::

            <EEE>---------E-------<EE>

        where the new, larger events combine the information of the small
        events they contain.

        When merging two events, we recompute the max_zscore and mean_zscore
        and apply the thresholds again.

        To illustrate, imagine two marginal events at positions 1 and 1000,
        each with a zscore of 4. When they are merged, their mean_zscore is
        recomputed and should be smaller than 4. Such events are just noise
        but would appear as significant (because of their large size), so we
        remove them by checking that the new mean_zscore is still beyond the
        thresholds.
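
        A minimal usage sketch (``rois`` being the :class:`FilteredGenomeCov`
        instance holding the detected ROIs; ``delta`` is the largest gap, in
        bases, allowed between two ROIs for them to be merged)::

            rois.merge_rois_into_cnvs(delta=1000)
            print(rois.df.head())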
        """
        logger.info("Merging ROIs into CNV-like events")

        # merge using "CNV" clustering
        rois = self.df.copy()
        logger.info("Found {} ROIs".format(len(rois)))
        data = self._merge_rois_into_cnvs(rois, delta=delta)
        logger.info("Merged into {} ROIs".format(len(data)))

        # sort data by chr and start
        self.df = data.sort_values(by=["chr", "start"])

        # some rows have been merged, let us reset the index
        self.df.reset_index(inplace=True, drop=True)

    def _merge_rois_into_cnvs(self, rois, delta=1000):
        # rois is a copy, it can be changed

        clusterID = [-1] * len(rois)
        cluster_counter = 0
        for i in range(0, len(rois) - 1):  # examine each pair of consecutive ROIs
            start = rois.iloc[i : i + 2]["start"].values
            end = rois.iloc[i : i + 2]["end"].values

            # for the next ROI to be added as an item of the cluster, it
            # should be close and similar; similar means, for now, that the
            # zscores have the same sign
            d1 = start[1] - end[0]
            z1 = rois.iloc[i]["max_zscore"]
            z2 = rois.iloc[i + 1]["max_zscore"]

            if d1 < delta and z1 * z2 >= 0:
                clusterID[i] = cluster_counter
                clusterID[i + 1] = cluster_counter
            else:
                cluster_counter += 1

        # Now, we can cluster the events
        rois["cluster"] = clusterID
        self.clusterID = clusterID
        self.rois = rois

        # just get the start and end positions of the clusters;
        # the special case of unclustered rows (-1) is dropped
        new_regions = rois.groupby("cluster").agg(
            {
                # min of ("A", "A") should return "A"
                # "chr": lambda x: x.min(),
                "start": lambda x: x.min(),
                "end": lambda x: x.max(),
            }
        )
        new_regions.drop(-1, inplace=True, errors="ignore")

        # Now we build back the entire ROI dataframe: one row per cluster,
        # plus the unclustered rows
        region_list = []

        for _, row in new_regions.iterrows():
            this_cluster = self._merge_row(row.start, row.end)
            region_list.append(this_cluster)

        for _, row in rois.query("cluster == -1").iterrows():
            this_cluster = self._merge_row(row.start, row.end)
            region_list.append(this_cluster)

        merge_df = self._dict_to_df(region_list, self.feature_list)

        # finally, remove merged events whose mean zscore falls back within
        # the thresholds
        if self.apply_threshold_after_merging:
            merge_df = merge_df.query("mean_zscore>@self.thresholds.high or mean_zscore<@self.thresholds.low")

        return merge_df


class ChromosomeCovMultiChunk(object):
    """Contains several chunks of :class:`ChromosomeCov` results.

    If a genome/chromosome is too large, it cannot fit into memory, so we
    implemented a way to analyse the chromosome in chunks.

    This is done in :class:`ChromosomeCov`. The results, which are summaries
    and ROIs, are then stored in lists. To each chunk corresponds one summary
    and one list of Regions of Interest. This is not convenient and needs
    extra processing.

    For instance, to get the final depth of coverage (DOC), we need to
    retrieve the DOC of each chunk.

    Individual results are stored in :attr:`data` as a list of lists, where
    each item contains one summary and one data structure for the ROIs.

    To retrieve all summaries, one would iterate through the data as
    follows::

        cc = ChromosomeCovMultiChunk(data)
        summaries = [item[0] for item in cc.data]

    and to retrieve all ROIs::

        rois = [item[1] for item in cc.data]

    but what one really wants to extract from this data structure is the
    overall summary::

        cc.get_summary()

    and the concatenated list of ROIs::

        cc.get_rois()

    """

    def __init__(self, chunk_rois):
        self.data = chunk_rois

    def get_summary(self, caller="sequana.bedtools"):
        # get all summaries
        summaries = [this[0].as_dict() for this in self.data]

        # extract the metadata from the first one
        data = summaries[0]
        sample_name = data["sample_name"]
        summary = Summary("coverage", sample_name=sample_name, data=data["data"].copy(), caller=caller)
        summary.data_description = data["data_description"].copy()

        # now replace the data fields with proper values
        N = sum([this["data"]["length"] for this in summaries])
        summary.data["length"] = N

        # BOC and DOC are means over a chunk, so we multiply each value back
        # by the chunk length to get a sum, and finally divide by the total
        # length
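        # i.e. a length-weighted mean: DOC = sum_i(DOC_i * L_i) / sum_i(L_i)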
        for this in ["BOC", "DOC"]:
            summary.data[this] = sum([d["data"][this] * d["data"]["length"] for d in summaries]) / float(N)

        # For CV, centralness (C3, C4) and evenness, we simply take the
        # grand mean for now
        for this in ["C3", "C4", "evenness"]:
            summary.data[this] = np.mean([d["data"][this] for d in summaries])

        # For the ROI counts, just take the sum
        for this in ["ROI", "ROI(high)", "ROI(low)"]:
            summary.data[this] = sum([d["data"][this] for d in summaries])

        return summary

    def get_rois(self):
        # all individual ROIs
        data = [item[1] for item in self.data]

        # copy the first one, then concatenate all the dataframes
        rois = copy.deepcopy(data[0])
        rois.df = pd.concat([this.df for this in data], ignore_index=True)
        return rois