• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

jlab / rust-debruijn / 22665993689

04 Mar 2026 10:46AM UTC coverage: 87.475% (+0.3%) from 87.212%
22665993689

Pull #37

github

web-flow
Merge 5063984f1 into 0b848af95
Pull Request #37: Feat/phred scores

717 of 788 new or added lines in 7 files covered. (90.99%)

6 existing lines in 3 files now uncovered.

8213 of 9389 relevant lines covered (87.47%)

4439203.1 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

80.57
/src/graph.rs
1
// Copyright 2017 10x Genomics
2

3
//! Containers for path-compressed De Bruijn graphs
4

5
use bimap::BiHashMap;
6
use bio::io::fasta;
7
use bit_set::BitSet;
8
use indicatif::ProgressBar;
9
use indicatif::ProgressIterator;
10
use indicatif::ProgressStyle;
11
use itertools::enumerate;
12
use log::warn;
13
use log::{debug, trace};
14
use rayon::prelude::*;
15
use rayon::current_num_threads;
16
use serde_derive::{Deserialize, Serialize};
17
use smallvec::SmallVec;
18
use std::borrow::Borrow;
19

20
use std::collections::HashSet;
21
use std::collections::VecDeque;
22
use std::f32;
23
use std::fmt::{self, Debug, Display};
24
use std::fs::{remove_file, File};
25
use std::hash::Hash;
26
use std::io::BufWriter;
27
use std::io::{BufReader, Error, Read};
28
use std::io::Write;
29
use std::iter::FromIterator;
30
use std::marker::PhantomData;
31
use std::path::Path;
32

33
use boomphf::hashmap::BoomHashMap;
34

35
use serde_json;
36
use serde_json::Value;
37

38
type SmallVec4<T> = SmallVec<[T; 4]>;
39
type SmallVec8<T> = SmallVec<[T; 8]>;
40

41
use crate::BaseQuality;
42
use crate::bits_to_base;
43
use crate::colors::ColorMode;
44
use crate::colors::Colors;
45
use crate::compression::CompressionSpec;
46
use crate::dna_string::{DnaString, DnaStringSlice, PackedDnaStringSet};
47
use crate::summarizer::SummaryConfig;
48
use crate::summarizer::SummaryData;
49
use crate::summarizer::Translator;
50
use crate::summarizer::ID;
51
use crate::BUF;
52
use crate::PROGRESS_STYLE;
53
use crate::{Dir, Exts, Kmer, Mer, Vmer};
54

55
/// A compressed DeBruijn graph carrying auxiliary data on each node of type `D`.
56
/// This type does not carry the sorted index arrays the allow the graph
57
/// to be walked efficiently. The `DeBruijnGraph` type wraps this type and add those
58
/// vectors.
59
#[derive(Serialize, Deserialize, Clone, Debug)]
60
pub struct BaseGraph<K, D> {
61
    pub sequences: PackedDnaStringSet,
62
    pub exts: Vec<Exts>,
63
    pub data: Vec<D>,
64
    pub stranded: bool,
65
    phantom: PhantomData<K>,
66
}
67

68
impl<K, D> BaseGraph<K, D> {
69
    pub fn new(stranded: bool) -> Self {
2,351✔
70
        BaseGraph {
2,351✔
71
            sequences: PackedDnaStringSet::new(),
2,351✔
72
            exts: Vec::new(),
2,351✔
73
            data: Vec::new(),
2,351✔
74
            phantom: PhantomData,
2,351✔
75
            stranded,
2,351✔
76
        }
2,351✔
77
    }
2,351✔
78

79
    pub fn len(&self) -> usize {
272,212✔
80
        self.sequences.len()
272,212✔
81
    }
272,212✔
82

83
    pub fn is_empty(&self) -> bool {
1✔
84
        self.sequences.is_empty()
1✔
85
    }
1✔
86

87
    pub fn combine<I: Iterator<Item = BaseGraph<K, D>>>(graphs: I) -> Self {
12✔
88
        let mut sequences = PackedDnaStringSet::new();
12✔
89
        let mut exts = Vec::new();
12✔
90
        let mut data = Vec::new();
12✔
91
        let mut stranded = Vec::new();
12✔
92

93
        for g in graphs {
1,535✔
94
            for s in 0..g.sequences.len() {
7,514✔
95
                sequences.add(&g.sequences.get(s));
7,514✔
96
            }
7,514✔
97

98
            exts.extend(g.exts);
1,535✔
99
            data.extend(g.data);
1,535✔
100
            stranded.push(g.stranded);
1,535✔
101
        }
102

103
        let out_stranded = stranded.iter().all(|x| *x);
12✔
104

105
        if !out_stranded && !stranded.iter().all(|x| !*x) {
1,535✔
106
            panic!("attempted to combine stranded and unstranded graphs");
×
107
        }
12✔
108

109
        BaseGraph {
12✔
110
            sequences,
12✔
111
            stranded: out_stranded,
12✔
112
            exts,
12✔
113
            data,
12✔
114
            phantom: PhantomData,
12✔
115
        }
12✔
116
    }
12✔
117
}
118

119
impl<K: Kmer, D> BaseGraph<K, D> {
120
    pub fn add<R: Borrow<u8>, S: IntoIterator<Item = R>>(
2,221,953✔
121
        &mut self,
2,221,953✔
122
        sequence: S,
2,221,953✔
123
        exts: Exts,
2,221,953✔
124
        data: D,
2,221,953✔
125
    ) {
2,221,953✔
126
        self.sequences.add(sequence);
2,221,953✔
127
        self.exts.push(exts);
2,221,953✔
128
        self.data.push(data);
2,221,953✔
129
    }
2,221,953✔
130
}
131

132
impl<K: Kmer + Send + Sync, D> BaseGraph<K, D> {
133
    pub fn finish(self) -> DebruijnGraph<K, D> {
705✔
134
        let indices: Vec<u32> = (0..self.len() as u32).collect();
705✔
135

136
        let left_order = {
705✔
137
            let mut kmers: Vec<K> = Vec::with_capacity(self.len());
705✔
138
            for idx in &indices {
1,514,214✔
139
                kmers.push(self.sequences.get(*idx as usize).first_kmer());
1,514,214✔
140
                
1,514,214✔
141
            }
1,514,214✔
142
            
143
            BoomHashMap::new_parallel(kmers, indices.clone())
705✔
144
        };
145

146
        let right_order = {
705✔
147
            let mut kmers: Vec<K> = Vec::with_capacity(self.len());
705✔
148
            for idx in &indices {
1,514,214✔
149
                kmers.push(self.sequences.get(*idx as usize).last_kmer());
1,514,214✔
150
            }
1,514,214✔
151
            
152
            BoomHashMap::new_parallel(kmers, indices)
705✔
153
        };
154
        debug!("finish graph loops: 2x {}", self.len());
705✔
155

156
        DebruijnGraph {
705✔
157
            base: self,
705✔
158
            left_order,
705✔
159
            right_order,
705✔
160
        }
705✔
161
    }
705✔
162
}
163

164
impl<K: Kmer, D> BaseGraph<K, D> {
165
    pub fn finish_serial(self) -> DebruijnGraph<K, D> {
111✔
166
        let indices: Vec<u32> = (0..self.len() as u32).collect();
111✔
167

168
        let left_order = {
111✔
169
            let mut kmers: Vec<K> = Vec::with_capacity(self.len());
111✔
170
            let mut sequences: Vec<String> = Vec::new();
111✔
171
            for idx in &indices {
705,971✔
172
                kmers.push(self.sequences.get(*idx as usize).first_kmer());
705,971✔
173
                sequences.push(self.sequences.get(*idx as usize).to_dna_string());
705,971✔
174
            }
705,971✔
175
            println!("left kmers: {:?}", kmers);
111✔
176
            println!("left seqs: {:?}", sequences);
111✔
177
            BoomHashMap::new(kmers, indices.clone())
111✔
178
        };
179

180
        let right_order = {
111✔
181
            let mut kmers: Vec<K> = Vec::with_capacity(self.len());
111✔
182
            let mut sequences: Vec<String> = Vec::new();
111✔
183
            for idx in &indices {
705,971✔
184
                kmers.push(self.sequences.get(*idx as usize).last_kmer());
705,971✔
185
                sequences.push(self.sequences.get(*idx as usize).to_dna_string());
705,971✔
186
            }
705,971✔
187
            println!("right kmers: {:?}", kmers);
111✔
188
            println!("right seqs: {:?}", sequences);
111✔
189
            BoomHashMap::new(kmers, indices)
111✔
190
        };
191

192
        DebruijnGraph {
111✔
193
            base: self,
111✔
194
            left_order,
111✔
195
            right_order,
111✔
196
        }
111✔
197
    }
111✔
198
}
199

200
/// A compressed DeBruijn graph carrying auxiliary data on each node of type `D`.
201
/// The struct carries sorted index arrays the allow the graph
202
/// to be walked efficiently.
203
#[derive(Serialize, Deserialize, Debug)]
204
pub struct DebruijnGraph<K: Hash, D> {
205
    pub base: BaseGraph<K, D>,
206
    left_order: BoomHashMap<K, u32>,
207
    right_order: BoomHashMap<K, u32>,
208
}
209

210
impl<K: Kmer, D: Debug> DebruijnGraph<K, D> {
211
    /// Total number of nodes in the DeBruijn graph
212
    pub fn len(&self) -> usize {
269,740✔
213
        self.base.len()
269,740✔
214
    }
269,740✔
215

216
    pub fn is_empty(&self) -> bool {
1✔
217
        self.base.is_empty()
1✔
218
    }
1✔
219

220
    /// Get a node given it's `node_id`
221
    pub fn get_node(&'_ self, node_id: usize) -> Node<'_, K, D> {
5,603,971✔
222
        Node {
5,603,971✔
223
            node_id,
5,603,971✔
224
            graph: self,
5,603,971✔
225
        }
5,603,971✔
226
    }
5,603,971✔
227

228
    /// Get a node given it's `node_id`
229
    pub fn get_node_kmer(&'_ self, node_id: usize) -> NodeKmer<'_, K, D> {
×
230
        let node = self.get_node(node_id);
×
231
        let node_seq = node.sequence();
×
232

233
        NodeKmer {
×
234
            node_id,
×
235
            node_seq_slice: node_seq,
×
236
            phantom_d: PhantomData,
×
237
            phantom_k: PhantomData,
×
238
        }
×
239
    }
×
240

241
    /// Return an iterator over all nodes in the graph
242
    pub fn iter_nodes(&'_ self) -> NodeIter<'_, K, D> {
130✔
243
        NodeIter {
130✔
244
            graph: self,
130✔
245
            node_id: 0,
130✔
246
        }
130✔
247
    }
130✔
248

249
    /// Find the edges leaving node `node_id` in direction `Dir`. Should generally be
250
    /// accessed via a Node wrapper object
251
    fn find_edges(&self, node_id: usize, dir: Dir) -> SmallVec4<(u8, usize, Dir, bool)> {
441,918✔
252
        let exts = self.base.exts[node_id];
441,918✔
253
        let sequence = self.base.sequences.get(node_id);
441,918✔
254
        let kmer: K = sequence.term_kmer(dir);
441,918✔
255
        let mut edges = SmallVec4::new();
441,918✔
256

257
        for i in 0..4 {
1,767,672✔
258
            if exts.has_ext(dir, i) {
1,767,672✔
259
                let link = self.find_link(kmer.extend(i, dir), dir).expect("missing link");
827,198✔
260
                edges.push((i, link.0, link.1, link.2));
827,198✔
261
            }
940,474✔
262
        }
263

264
        edges
441,918✔
265
    }
441,918✔
266

267
    /// Find the edges leaving node `node_id` in direction `Dir`. Should generally be
268
    /// accessed via a Node wrapper object
269
    /// 
270
    /// allows missing links
271
    fn _find_edges_sharded(&self, node_id: usize, dir: Dir) -> SmallVec4<(u8, usize, Dir, bool)> {
×
272
        let exts = self.base.exts[node_id];
×
273
        let sequence = self.base.sequences.get(node_id);
×
274
        let kmer: K = sequence.term_kmer(dir);
×
275
        let mut edges = SmallVec4::new();
×
276

277
        for i in 0..4 {
×
278
            if exts.has_ext(dir, i) {
×
279
                let link = self.find_link(kmer.extend(i, dir), dir); //.expect("missing link");
×
280
                if let Some(l) = link {
×
281
                    edges.push((i, l.0, l.1, l.2));
×
282
                }
×
283
                // Otherwise, this edge doesn't exist within this shard, so ignore it.
284
                // NOTE: this should be allowed in a 'complete' DBG
285
            }
×
286
        }
287

288
        edges
×
289
    }
×
290

291
    /// Seach for the kmer `kmer`, appearing at the given `side` of a node sequence.
292
    fn search_kmer(&self, kmer: K, side: Dir) -> Option<usize> {
6,413,586✔
293
        match side {
6,413,586✔
294
            Dir::Left => self.left_order.get(&kmer).map(|pos| *pos as usize),
3,203,673✔
295
            Dir::Right => self.right_order.get(&kmer).map(|pos| *pos as usize),
3,209,913✔
296
        }
297
    }
6,413,586✔
298

299
    /// Find a link in the graph, possibly handling a RC switch.
300
    pub fn find_link(&self, kmer: K, dir: Dir) -> Option<(usize, Dir, bool)> {
4,474,912✔
301
        // Only test self-consistent paths through
302
        // the edges
303
        // Avoids problems due to single kmer edges
304
        // (dir, next_side_incoming, rc)
305
        // (Dir::Left, Dir::Right, false) => true,
306
        // (Dir::Left, Dir::Left,  true) => true,
307
        // (Dir::Right, Dir::Left, false) => true,
308
        // (Dir::Right, Dir::Right, true) => true,
309

310
        let rc = kmer.rc();
4,474,912✔
311

312
        match dir {
4,474,912✔
313
            Dir::Left => {
314
                if let Some(idx) = self.search_kmer(kmer, Dir::Right) {
2,231,428✔
315
                    return Some((idx, Dir::Right, false));
1,271,239✔
316
                }
960,189✔
317

318
                if !self.base.stranded {
960,189✔
319
                    if let Some(idx) = self.search_kmer(rc, Dir::Left) {
960,189✔
320
                        return Some((idx, Dir::Left, true));
959,963✔
321
                    }
226✔
322
                }
×
323
            }
324

325
            Dir::Right => {
326
                if let Some(idx) = self.search_kmer(kmer, Dir::Left) {
2,243,484✔
327
                    return Some((idx, Dir::Left, false));
1,279,147✔
328
                }
964,337✔
329

330
                if !self.base.stranded {
964,337✔
331
                    if let Some(idx) = self.search_kmer(rc, Dir::Right) {
964,337✔
332
                        return Some((idx, Dir::Right, true));
964,114✔
333
                    }
223✔
334
                }
×
335
            }
336
        }
337

338
        None
449✔
339
    }
4,474,912✔
340

341
    /// Check whether the graph is fully compressed. Return `None` if it's compressed,
342
    /// otherwise return `Some(node1, node2)` representing a pair of node that could
343
    /// be collapsed. Probably only useful for testing.
344
    pub fn is_compressed<S: CompressionSpec<D>>(&self, spec: &S) -> Option<(usize, usize)> {
789✔
345
        for i in 0..self.len() {
123,210✔
346
            let n = self.get_node(i);
123,210✔
347

348
            for dir in &[Dir::Left, Dir::Right] {
246,420✔
349
                let dir_edges = n.edges(*dir);
246,420✔
350
                if dir_edges.len() == 1 {
246,420✔
351
                    let (_, next_id, return_dir, _) = dir_edges[0];
174,497✔
352
                    let next = self.get_node(next_id);
174,497✔
353

354
                    let ret_edges = next.edges(return_dir);
174,497✔
355
                    if ret_edges.len() == 1 {
174,497✔
356
                        // Test for us being a palindrome: this makes it OK
357
                        if n.len() == K::k() && n.sequence().first_kmer::<K>().is_palindrome() {
24✔
358
                            continue;
16✔
359
                        }
8✔
360

361
                        // Test for a neighbor being a palindrome: this makes it OK
362
                        if next.len() == K::k() && next.sequence().first_kmer::<K>().is_palindrome()
8✔
363
                        {
364
                            continue;
8✔
365
                        }
×
366

367
                        // Test for this edge representing a smooth circle (biting it's own tail):
368
                        if n.node_id == next_id {
×
369
                            continue;
×
370
                        }
×
371

372
                        if spec.join_test(n.data(), next.data()) {
×
373
                            // Found a unbranched edge that should have been eliminated
374
                            return Some((i, next_id));
×
375
                        }
×
376
                    }
174,473✔
377
                }
71,923✔
378
            }
379
        }
380

381
        None
789✔
382
    }
789✔
383

384
    /// Remove non-existent extensions that may be created due to filtered kmers
385
    /// 
386
    /// if `valid_nodes` if `None`, all nodes are valid
387
    pub fn fix_exts(&mut self, valid_nodes: Option<&BitSet>) {
358✔
388
        for i in 0..self.len() {
1,439,128✔
389
            let valid_exts = self.get_valid_exts(i, valid_nodes);
1,439,128✔
390
            self.base.exts[i] = valid_exts;
1,439,128✔
391
        }
1,439,128✔
392
    }
358✔
393

394
    pub fn get_valid_exts(&self, node_id: usize, valid_nodes: Option<&BitSet>) -> Exts {
1,439,128✔
395
        let mut new_exts = Exts::empty();
1,439,128✔
396
        let node = self.get_node(node_id);
1,439,128✔
397
        let exts = node.exts();
1,439,128✔
398
        let l_kmer: K = node.sequence().first_kmer();
1,439,128✔
399
        let r_kmer: K = node.sequence().last_kmer();
1,439,128✔
400

401
        let check_node = |id| match valid_nodes {
2,925,453✔
402
            Some(bs) => bs.contains(id),
1,443,568✔
403
            None => true,
1,481,885✔
404
        };
2,925,453✔
405

406
        for i in 0..4 {
5,756,512✔
407
            if exts.has_ext(Dir::Left, i) {
5,756,512✔
408
                match self.find_link(l_kmer.extend_left(i), Dir::Left) {
1,462,719✔
409
                    Some((target, _, _)) if check_node(target) => {
1,462,493✔
410
                        new_exts = new_exts.set(Dir::Left, i)
1,462,493✔
411
                    }
412
                    _ => (),
226✔
413
                }
414
            }
4,293,793✔
415

416
            if exts.has_ext(Dir::Right, i) {
5,756,512✔
417
                match self.find_link(r_kmer.extend_right(i), Dir::Right) {
1,463,183✔
418
                    Some((target, _, _)) if check_node(target) => {
1,462,960✔
419
                        new_exts = new_exts.set(Dir::Right, i)
1,462,960✔
420
                    }
421
                    _ => (),
223✔
422
                }
423
            }
4,293,329✔
424
        }
425

426
        new_exts
1,439,128✔
427
    }
1,439,128✔
428

429
    /// mutable reference to the auxiliary data of the node node_id
430
    pub fn mut_data(&mut self, node_id: usize) -> &mut D {
791✔
431
        &mut self.base.data[node_id]
791✔
432
    }
791✔
433

434
    /// Find the highest-scoring, unambiguous path in the graph. Each node get a score
435
    /// given by `score`. Any node where `solid_path(node) == True` are valid paths -
436
    /// paths will be terminated if there are multiple valid paths emanating from a node.
437
    pub fn max_path<F, F2>(&self, score: F, solid_path: F2) -> Vec<(usize, Dir)>
×
438
    where
×
439
        F: Fn(&D) -> f32,
×
440
        F2: Fn(&D) -> bool,
×
441
    {
442
        if self.is_empty() {
×
443
            return Vec::default();
×
444
        }
×
445

446
        let mut best_node = 0;
×
447
        let mut best_score = f32::MIN;
×
448
        for i in 0..self.len() {
×
449
            let node = self.get_node(i);
×
450
            let node_score = score(node.data());
×
451

452
            if node_score > best_score {
×
453
                best_node = i;
×
454
                best_score = node_score;
×
455
            }
×
456
        }
457

458
        let oscore = |state| match state {
×
459
            None => 0.0,
×
460
            Some((id, _)) => score(self.get_node(id).data()),
×
461
        };
×
462

463
        let osolid_path = |state| match state {
×
464
            None => false,
×
465
            Some((id, _)) => solid_path(self.get_node(id).data()),
×
466
        };
×
467

468
        // Now expand in each direction, greedily taking the best path. Stop if we hit a node we've
469
        // already put into the path
470
        let mut used_nodes = HashSet::new();
×
471
        let mut path = VecDeque::new();
×
472

473
        // Start w/ initial state
474
        used_nodes.insert(best_node);
×
475
        path.push_front((best_node, Dir::Left));
×
476

477
        for init in [(best_node, Dir::Left, false), (best_node, Dir::Right, true)].iter() {
×
478
            let &(start_node, dir, do_flip) = init;
×
479
            let mut current = (start_node, dir);
×
480

481
            loop {
482
                let mut next = None;
×
483
                let (cur_id, incoming_dir) = current;
×
484
                let node = self.get_node(cur_id);
×
485
                let edges = node.edges(incoming_dir.flip());
×
486

487
                let mut solid_paths = 0;
×
488
                for (_, id, dir, _) in edges {
×
489
                    let cand = Some((id, dir));
×
490
                    if osolid_path(cand) {
×
491
                        solid_paths += 1;
×
492
                    }
×
493

494
                    if oscore(cand) > oscore(next) {
×
495
                        next = cand;
×
496
                    }
×
497
                }
498

499
                if solid_paths > 1 {
×
500
                    break;
×
501
                }
×
502

503
                match next {
×
504
                    Some((next_id, next_incoming)) if !used_nodes.contains(&next_id) => {
×
505
                        if do_flip {
×
506
                            path.push_front((next_id, next_incoming.flip()));
×
507
                        } else {
×
508
                            path.push_back((next_id, next_incoming));
×
509
                        }
×
510

511
                        used_nodes.insert(next_id);
×
512
                        current = (next_id, next_incoming);
×
513
                    }
514
                    _ => break,
×
515
                }
516
            }
517
        }
518

519
        Vec::from_iter(path)
×
520
    }
×
521

522

523
    /// Find the highest-scoring, unambiguous path in the graph. Each node get a score
524
    /// given by `score`. Any node where `solid_path(node) == True` are valid paths -
525
    /// paths will be terminated if there are multiple valid paths emanating from a node.
526
    /// Returns vec with path for each component
527
    pub fn max_path_comp<F, F2>(&self, score: F, solid_path: F2) -> Vec<VecDeque<(usize, Dir)>>
1✔
528
    where
1✔
529
        F: Fn(&D) -> f32,
1✔
530
        F2: Fn(&D) -> bool,
1✔
531
    {
532
        if self.is_empty() {
1✔
533
            let vec: Vec<VecDeque<(usize, Dir)>> = Vec::new();
×
534
            return vec;
×
535
        }
1✔
536

537
        let components = self.iter_components();
1✔
538
        let mut paths: Vec<VecDeque<(usize, Dir)>> = Vec::new();
1✔
539

540
        for component in components {
2✔
541

542
            let current_comp = &component;
2✔
543
            
544

545
            let mut best_node = current_comp[0];
2✔
546
            let mut best_score = f32::MIN;
2✔
547
            for c in current_comp.iter() {
136✔
548
                let node = self.get_node(*c);
136✔
549
                let node_score = score(node.data());
136✔
550

551
                if node_score > best_score {
136✔
552
                    best_node = *c;
2✔
553
                    best_score = node_score;
2✔
554
                }
134✔
555
            }
556

557
            let oscore = |state| match state {
162✔
558
                None => 0.0,
80✔
559
                Some((id, _)) => score(self.get_node(id).data()),
82✔
560
            };
162✔
561

562
            let osolid_path = |state| match state {
81✔
563
                None => false,
×
564
                Some((id, _)) => solid_path(self.get_node(id).data()),
81✔
565
            };
81✔
566

567
            // Now expand in each direction, greedily taking the best path. Stop if we hit a node we've
568
            // already put into the path
569
            let mut used_nodes = HashSet::new();
2✔
570
            let mut path = VecDeque::new();
2✔
571

572
            // Start w/ initial state
573
            used_nodes.insert(best_node);
2✔
574
            path.push_front((best_node, Dir::Left));
2✔
575

576
            for init in [(best_node, Dir::Left, false), (best_node, Dir::Right, true)].iter() {
4✔
577
                let &(start_node, dir, do_flip) = init;
4✔
578
                let mut current = (start_node, dir);
4✔
579

580
                loop {
581
                    let mut next = None;
83✔
582
                    let (cur_id, incoming_dir) = current;
83✔
583
                    let node = self.get_node(cur_id);
83✔
584
                    let edges = node.edges(incoming_dir.flip());
83✔
585

586
                    let mut solid_paths = 0;
83✔
587
                    for (_, id, dir, _) in edges {
83✔
588
                        let cand = Some((id, dir));
81✔
589
                        if osolid_path(cand) {
81✔
590
                            solid_paths += 1;
81✔
591
                        }
81✔
592

593
                        if oscore(cand) > oscore(next) {
81✔
594
                            next = cand;
80✔
595
                        }
80✔
596
                    }
597

598
                    if solid_paths > 1 {
83✔
599
                        break;
1✔
600
                    }
82✔
601

602
                    match next {
79✔
603
                        Some((next_id, next_incoming)) if !used_nodes.contains(&next_id) => {
79✔
604
                            if do_flip {
79✔
605
                                path.push_front((next_id, next_incoming.flip()));
44✔
606
                            } else {
44✔
607
                                path.push_back((next_id, next_incoming));
35✔
608
                            }
35✔
609

610
                            used_nodes.insert(next_id);
79✔
611
                            current = (next_id, next_incoming);
79✔
612
                        }
613
                        _ => break,
3✔
614
                    }
615
                }
616
            }
617
            
618
            paths.push(path);
2✔
619
            //paths.push(Vec::from_iter(path));
620
        }
621

622
        paths
1✔
623
    
624
    }
1✔
625

626
    pub fn iter_max_path_comp<F, F2>(&'_ self, score: F, solid_path: F2) -> PathCompIter<'_, K, D, F, F2> 
3✔
627
    where 
3✔
628
    F: Fn(&D) -> f32,
3✔
629
    F2: Fn(&D) -> bool
3✔
630
    {
631
        let component_iterator = self.iter_components();
3✔
632
        PathCompIter { graph: self, component_iterator, graph_pos: 0, score, solid_path }
3✔
633
    }
3✔
634

635
    /// write the paths from `iter_max_path_comp` to a fasta file
636
    pub fn path_to_fasta<F, F2>(&self, f: &mut dyn std::io::Write, path_iter: PathCompIter<K, D, F, F2>, return_lens: bool) -> (Vec<usize>, Vec<usize>)
1✔
637
    where 
1✔
638
    F: Fn(&D) -> f32,
1✔
639
    F2: Fn(&D) -> bool
1✔
640
    {
641
        // width of fasta lines
642
        let columns = 80;
1✔
643

644
        // sizes of components and of paths
645
        let mut comp_sizes = Vec::new();
1✔
646
        let mut path_lens = Vec::new();
1✔
647

648
        for (seq_counter, (component, path)) in path_iter.enumerate() {
2✔
649
            // get dna sequence from path
650
            let seq = self.sequence_of_path(path.iter());
2✔
651

652
            //write header with length & start node
653
            writeln!(f, ">path{} len={} start_node={}", seq_counter, seq.len(), path[0].0).unwrap();
2✔
654

655
            // calculate how sequence has to be split up
656
            let slices = (seq.len() / columns) + 1;
2✔
657
            let mut ranges = Vec::with_capacity(slices);
2✔
658

659
            let mut start = 0;
2✔
660
            while start < seq.len() {
5✔
661
                ranges.push(start..start + columns);
3✔
662
                start += columns;
3✔
663
            }
3✔
664

665
            let last_start = ranges.pop().expect("no kmers in parallel ranges").start;
2✔
666
            ranges.push(last_start..seq.len());
2✔
667

668
            // split up sequence and write to file accordingly
669
            for range in ranges {
3✔
670
                writeln!(f, "{:?}", seq.slice(range.start, range.end)).unwrap();
3✔
671
            }
3✔
672

673
            if return_lens {
2✔
674
                comp_sizes.push(component.len());
×
675
                path_lens.push(path.len());
×
676
            }
2✔
677
        }    
678

679
        (comp_sizes, path_lens)
1✔
680
        
681
    }
1✔
682

683

684
    /// Get the sequence of a path through the graph. The path is given as a sequence of node_id integers
685
    pub fn sequence_of_path<'a, I: 'a + Iterator<Item = &'a (usize, Dir)>>(
19,764✔
686
        &self,
19,764✔
687
        path: I,
19,764✔
688
    ) -> DnaString {
19,764✔
689
        let mut seq = DnaString::new();
19,764✔
690

691
        for (idx, &(node_id, dir)) in path.enumerate() {
713,736✔
692
            let start = if idx == 0 { 0 } else { K::k() - 1 };
713,736✔
693

694
            let node_seq = match dir {
713,736✔
695
                Dir::Left => self.get_node(node_id).sequence(),
370,101✔
696
                Dir::Right => self.get_node(node_id).sequence().rc(),
343,635✔
697
            };
698

699
            for p in start..node_seq.len() {
1,369,734✔
700
                seq.push(node_seq.get(p))
1,369,734✔
701
            }
702
        }
703

704
        seq
19,764✔
705
    }
19,764✔
706

707
    /// map sequences from a fasta file to a **completely uncompressed** and **stranded** debruijn graph
708
    pub fn map_transcripts<P>(&self, path: P, translator: &mut Translator) -> Result<Vec<Box<[ID]>>, String> 
1✔
709
    where 
1✔
710
        P: AsRef<Path>
1✔
711
    {
712
        // return err if not stranded
713
        if !self.base.stranded { return Err("graph has to be stranded".to_string()) };
1✔
714

715
        let reader = fasta::Reader::new(BufReader::new(File::open(path).unwrap()));
1✔
716
        let mut node_transcript_ids = vec![Vec::new(); self.len()];
1✔
717

718
        let mut backup_id_tr = BiHashMap::new();
1✔
719

720
        let id_tr = if let Some(id_tr) = translator.mut_id_translator() {
1✔
721
            id_tr
1✔
722
        } else {
723
            &mut backup_id_tr
×
724
        };
725

726
        // go through each transcript and map to graph
727
        for result in reader.records() {
15✔
728
            let record = result.expect("error parsing transcripts fasta");
15✔
729

730
            // get gene id or make new id
731
            let gene_id = match id_tr.get_by_left(&record.id().to_string()) {
15✔
732
                Some(id) => *id,
×
733
                None => {
734
                    let new_id = id_tr.len() as ID;
15✔
735
                    id_tr.insert(record.id().to_string(), new_id);
15✔
736
                    new_id
15✔
737
                }
738
            };
739

740
            // iterate over k-mers in transcript and find each one in the graph
741
            let sequence = DnaString::from_acgt_bytes(record.seq());
15✔
742
            for kmer in sequence.iter_kmers::<K>() {
14,148✔
743
                if let Some(node) = self.search_kmer(kmer, Dir::Right) {
14,148✔
744
                    node_transcript_ids[node].push(gene_id);
13,284✔
745
                }
13,284✔
746
            }
747
        }
748

749
        // change to boxed slices to reduce memory
750
        let boxed_transcripts = node_transcript_ids.into_iter().map(|vec| vec.into()).collect();
21,993✔
751

752
        Ok(boxed_transcripts)
1✔
753
    }
1✔
754

755
    /// write a node to a dot file
756
    /// 
757
    /// ### Arguments: 
758
    /// 
759
    /// * `node`: [`Node<K, D>`] which will be written to a dot file
760
    /// * `node_label`: closure taking [`Node<K, D>`] and returning a string containing commands for dot nodes 
761
    /// * `edge_label`: closure taking [`Node<K, D>`], the base as a [`u8`], the incoming [`Dir`] of the edge 
762
    ///   and if the neighbor is flipped - returns a string containing commands for dot edges, 
763
    /// * `f`: writer
764
    fn node_to_dot<FN: Fn(&Node<K, D>) -> String, FE: Fn(&Node<K, D>, u8, Dir, bool) -> String>(
1,756✔
765
        &self,
1,756✔
766
        node: &Node<'_, K, D>,
1,756✔
767
        node_label: &FN,
1,756✔
768
        edge_label: &FE,
1,756✔
769
        f: &mut dyn Write,
1,756✔
770
    ) {
1,756✔
771
        writeln!(f, "n{} {}", node.node_id, node_label(node)).unwrap();
1,756✔
772
        assert_eq!(node.exts().val.count_ones() as usize, node.l_edges().len() + node.r_edges().len());
1,756✔
773

774
        for (base, id, incoming_dir, flipped) in node.l_edges() {
1,756✔
775
            writeln!(f, "n{} -> n{} {}", id, node.node_id, edge_label(node, base, incoming_dir, flipped)).unwrap();
1,212✔
776
        }
1,212✔
777

778
        for (base, id, incoming_dir, flipped) in node.r_edges() {
1,756✔
779
            writeln!(f, "n{} -> n{} {}", node.node_id, id, edge_label(node, base, incoming_dir, flipped)).unwrap();
1,092✔
780
        }
1,092✔
781
    }
1,756✔
782

783
    /// Write the graph to a dot file
784
    /// 
785
    /// ### Arguments: 
786
    /// 
787
    /// * `path`: path to the output file
788
    /// * `node_label`: closure taking [`Node<K, D>`] and returning a string containing commands for dot nodes, e.g. [`Node::node_dot_default`]
789
    /// * `edge_label`: closure taking [`Node<K, D>`], the base as a [`u8`], the incoming [`Dir`] of the edge, e.g. [`Node::edge_dot_default`]
790
    ///   and if the neighbor is flipped - returns a string containing commands for dot edges, 
791
    pub fn to_dot<P, FN, FE>(&self, path: P, node_label: &FN, edge_label: &FE) 
1✔
792
    where 
1✔
793
    P: AsRef<Path>,
1✔
794
    FN: Fn(&Node<K, D>) -> String,
1✔
795
    FE: Fn(&Node<K, D>, u8, Dir, bool) -> String,
1✔
796
    {
797
        let mut f = BufWriter::with_capacity(BUF, File::create(path).expect("error creating dot file"));
1✔
798

799
        let pb = ProgressBar::new(self.len() as u64);
1✔
800
        pb.set_style(ProgressStyle::with_template(PROGRESS_STYLE).unwrap().progress_chars("#/-"));
1✔
801
        pb.set_message(format!("{:<32}", "writing graph to DOT file"));
1✔
802

803
        writeln!(&mut f, "digraph {{\nrankdir=\"LR\"\nmodel=subset").unwrap();
1✔
804
        for i in (0..self.len()).progress_with(pb) {
584✔
805
            self.node_to_dot(&self.get_node(i), node_label, edge_label, &mut f);
584✔
806
        }
584✔
807
        writeln!(&mut f, "}}").unwrap();
1✔
808
        
809
        f.flush().unwrap();
1✔
810
        debug!("large to dot loop: {}", self.len());
1✔
811
    }
1✔
812

813
    /// Write the graph to a dot file, highlight the nodes which form the 
814
    /// "best" path, according to [`PathCompIter`], with the number of occurences 
815
    /// as the score and `solid_path` always `true`.
816
    /// The nodes are formatted according to [`Node::node_dot_default`].
817
    /// 
818
    /// ### Arguments: 
819
    /// 
820
    /// * `path`: path to the output file
821
    /// * `edge_label`: closure taking [`Node<K, D>`], the base as a [`u8`], the incoming [`Dir`] of the edge, e.g. [`Node::edge_dot_default`]
822
    ///   and if the neighbor is flipped - returns a string containing commands for dot edges, 
823
    /// * `colors`: a [`Colors`] with the color settings for the graph
824
    /// * `translator`: a [`Translator`] which translates tags or IDs to strings
825
    /// * `config`: a [`SummaryConfig`] which contains settings for the graph
826
    pub fn to_dot_with_path<P, FE, DI>(&self, path: P, edge_label: &FE, colors: &Colors<'_, D, DI>, translator: &Translator, config: &SummaryConfig, translate_id_groups: bool)
1✔
827
    where 
1✔
828
    P: AsRef<Path>,
1✔
829
    D: SummaryData<DI>,
1✔
830
    FE: Fn(&Node<K, D>, u8, Dir, bool) -> String,
1✔
831
    {
832
        let mut f = BufWriter::with_capacity(BUF, File::create(path).expect("error creating dot file"));
1✔
833

834
        writeln!(&mut f, "digraph {{\nrankdir=\"LR\"\nmodel=subset").unwrap();
1✔
835

836
        // iterate over components
837
        for (component, path) in self.iter_max_path_comp(|d| d.sum().unwrap_or(1) as f32, |_| true) {
974✔
838
            let hashed_path = path.into_iter().map(|(id, _)| id).collect::<HashSet<usize>>();
233✔
839
            for node_id in component {
584✔
840
                self.node_to_dot(
584✔
841
                    &self.get_node(node_id),
584✔
842
                    &|node| node.node_dot_default(colors, config, translator, hashed_path.contains(&node_id), translate_id_groups), 
584✔
843
                    edge_label, 
584✔
844
                    &mut f
584✔
845
                );
846
            }
847
        }
848

849
        writeln!(&mut f, "}}").unwrap();
1✔
850
        
851
        f.flush().unwrap();
1✔
852
        debug!("large to dot loop: {}", self.len());
1✔
853
    }
1✔
854

855
    /// Write the graph to a dot file in parallel
856
    /// Will write in to n_threads files simultaniously,
857
    /// then go though the files and add the contents to a larger file, 
858
    /// and delete the small files.
859
    /// 
860
    /// ### Arguments: 
861
    /// 
862
    /// * `path`: path to the output file
863
    /// * `node_label`: closure taking [`Node<K, D>`] and returning a string containing commands for dot nodes 
864
    /// * `edge_label`: closure taking [`Node<K, D>`], the base as a [`u8`], the incoming [`Dir`] of the edge 
865
    ///   and if the neighbor is flipped - returns a string containing commands for dot edges, 
866
    pub fn to_dot_parallel<P, FN, FE>(&self, path: P, node_label: &FN, edge_label: &FE) 
1✔
867
    where 
1✔
868
        D: Sync,
1✔
869
        K: Sync,
1✔
870
        P: AsRef<Path> + Display + Sync,
1✔
871
        FN: Fn(&Node<K, D>) -> String + Sync,
1✔
872
        FE: Fn(&Node<K, D>, u8, Dir, bool) -> String + Sync,
1✔
873
    {        
874
        let slices = current_num_threads();
1✔
875
        let n_nodes = self.len();
1✔
876
        let sz = n_nodes / slices + 1;
1✔
877

878
        debug!("n_nodes: {}", n_nodes);
1✔
879
        debug!("sz: {}", sz);
1✔
880

881
        let mut parallel_ranges = Vec::with_capacity(slices);
1✔
882
        let mut start = 0;
1✔
883
        while start < n_nodes {
5✔
884
            parallel_ranges.push(start..start + sz);
4✔
885
            start += sz;
4✔
886
        }
4✔
887

888
        let last_start = parallel_ranges.pop().expect("no kmers in parallel ranges").start;
1✔
889
        parallel_ranges.push(last_start..n_nodes);
1✔
890
        debug!("parallel ranges: {:?}", parallel_ranges);
1✔
891

892
        let mut files = Vec::with_capacity(current_num_threads());
1✔
893

894
        for i in 0..parallel_ranges.len() {
4✔
895
            files.push(format!("{}-{}.dot", path, i));
4✔
896
        } 
4✔
897

898
        let pb = ProgressBar::new(self.len() as u64);
1✔
899
        pb.set_style(ProgressStyle::with_template(PROGRESS_STYLE).unwrap().progress_chars("#/-"));
1✔
900
        pb.set_message(format!("{:<32}", "writing graph to DOT files"));
1✔
901
    
902
        parallel_ranges.into_par_iter().enumerate().for_each(|(i, range)| {
4✔
903
            let mut f = BufWriter::with_capacity(BUF, File::create(&files[i]).expect("error creating parallel dot file"));
4✔
904

905
            for i in range {
584✔
906
                self.node_to_dot(&self.get_node(i), node_label, edge_label, &mut f);
584✔
907
                pb.inc(1);
584✔
908
            }
584✔
909

910
            f.flush().unwrap();
4✔
911
        });
4✔
912
        pb.finish_and_clear();
1✔
913

914
        let mut out_file = BufWriter::with_capacity(BUF, File::create(path).expect("error creating combined dot file"));
1✔
915

916
        writeln!(&mut out_file, "digraph {{\nrankdir=\"LR\"\nmodel=subset").unwrap();
1✔
917

918
        let pb = ProgressBar::new(files.len() as u64);
1✔
919
        pb.set_style(ProgressStyle::with_template(PROGRESS_STYLE).unwrap().progress_chars("#/-"));
1✔
920
        pb.set_message(format!("{:<32}", "combining files"));
1✔
921

922
        for file in files.iter().progress_with(pb) {
4✔
923
            let open_file = File::open(file).expect("error opening parallel dot file");
4✔
924
            let mut reader = BufReader::new(open_file);
4✔
925
            let mut buffer = [0; BUF];
4✔
926

927
            loop {
928
                let linecount = reader.read(&mut buffer).unwrap();
12✔
929
                if linecount == 0 { break }
12✔
930
                out_file.write_all(&buffer[..linecount]).unwrap();
8✔
931
            }
932

933
            remove_file(file).unwrap();
4✔
934
        }
935

936
        writeln!(&mut out_file, "}}").unwrap();
1✔
937

938
        out_file.flush().unwrap();
1✔
939

940

941
    }
1✔
942

943

944
    /// Write part of the graph to a dot file
945
    /// 
946
    /// ### Arguments: 
947
    /// 
948
    /// * `path`: path to the output file
949
    /// * `node_label`: closure taking [`Node<K, D>`] and returning a string containing commands for dot nodes 
950
    /// * `edge_label`: closure taking [`Node<K, D>`], the base as a [`u8`], the incoming [`Dir`] of the edge 
951
    ///   and if the neighbor is flipped - returns a string containing commands for dot edges, 
952
    /// * `nodes`: [`Vec<usize>`] listing all IDs of nodes which should be included
953
    pub fn to_dot_partial<P, FN, FE>(&self, path: P, node_label: &FN, edge_label: &FE, nodes: &[usize]) 
1✔
954
    where 
1✔
955
        P: AsRef<Path>,
1✔
956
        FN: Fn(&Node<K, D>) -> String,
1✔
957
        FE: Fn(&Node<K, D>, u8, Dir, bool) -> String,
1✔
958
    {
959
        let mut f = BufWriter::with_capacity(BUF, File::create(path).expect("error creating dot file"));
1✔
960

961
        let pb = ProgressBar::new(nodes.len() as u64);
1✔
962
        pb.set_style(ProgressStyle::with_template(PROGRESS_STYLE).unwrap().progress_chars("#/-"));
1✔
963
        pb.set_message(format!("{:<32}", "writing graph to DOT file"));
1✔
964

965
        writeln!(&mut f, "digraph {{\nrankdir=\"LR\"\nmodel=subset").unwrap();
1✔
966
        for i in nodes.iter().progress_with(pb) {
4✔
967
            self.node_to_dot(&self.get_node(*i), node_label, edge_label, &mut f);
4✔
968
        }
4✔
969
        writeln!(&mut f, "}}").unwrap();
1✔
970

971
        f.flush().unwrap();
1✔
972

973
        debug!("large to dot loop: {}", self.len());
1✔
974
    }
1✔
975

976
    fn node_to_gfa<F: Fn(&Node<'_, K, D>) -> String>(
1,756✔
977
        &self,
1,756✔
978
        node: &Node<'_, K, D>,
1,756✔
979
        w: &mut dyn Write,
1,756✔
980
        tag_func: Option<&F>,
1,756✔
981
    ) -> Result<(), Error> {
1,756✔
982
        match tag_func {
1,756✔
983
            Some(f) => {
1,172✔
984
                let tags = (f)(node);
1,172✔
985
                writeln!(
1,172✔
986
                    w,
1,172✔
987
                    "S\t{}\t{}\t{}",
988
                    node.node_id,
989
                    node.sequence().to_dna_string(),
1,172✔
990
                    tags
991
                )?;
×
992
            }
993
            _ => writeln!(
584✔
994
                w,
584✔
995
                "S\t{}\t{}",
996
                node.node_id,
997
                node.sequence().to_dna_string()
584✔
998
            )?,
×
999
        }
1000

1001
        for (_, target, dir, _) in node.l_edges() {
1,756✔
1002
            if target >= node.node_id {
1,212✔
1003
                let to_dir = match dir {
612✔
1004
                    Dir::Left => "+",
306✔
1005
                    Dir::Right => "-",
306✔
1006
                };
1007
                writeln!(
612✔
1008
                    w,
612✔
1009
                    "L\t{}\t-\t{}\t{}\t{}M",
1010
                    node.node_id,
1011
                    target,
1012
                    to_dir,
1013
                    K::k() - 1
612✔
1014
                )?;
×
1015
            }
600✔
1016
        }
1017

1018
        for (_, target, dir, _) in node.r_edges() {
1,756✔
1019
            if target > node.node_id {
1,092✔
1020
                let to_dir = match dir {
540✔
1021
                    Dir::Left => "+",
294✔
1022
                    Dir::Right => "-",
246✔
1023
                };
1024
                writeln!(
540✔
1025
                    w,
540✔
1026
                    "L\t{}\t+\t{}\t{}\t{}M",
1027
                    node.node_id,
1028
                    target,
1029
                    to_dir,
1030
                    K::k() - 1
540✔
1031
                )?;
×
1032
            }
552✔
1033
        }
1034

1035
        Ok(())
1,756✔
1036
    }
1,756✔
1037

1038
    /// Write the graph to GFA format
1039
    pub fn to_gfa<P: AsRef<Path>>(&self, gfa_out: P) -> Result<(), Error> {
1✔
1040
        let wtr = BufWriter::with_capacity(BUF, File::create(gfa_out).expect("error creating gfa file"));
1✔
1041
        self.write_gfa(&mut std::io::BufWriter::new(wtr))
1✔
1042
    }
1✔
1043

1044
    pub fn write_gfa(&self, wtr: &mut impl Write) -> Result<(), Error> {
1✔
1045
        writeln!(wtr, "H\tVN:Z:debruijn-rs")?;
1✔
1046

1047
        type DummyFn<K, D> = fn(&Node<'_, K, D>) -> String;
1048
        let dummy_opt: Option<&DummyFn<K, D>> = None;
1✔
1049

1050
        let pb = ProgressBar::new(self.len() as u64);
1✔
1051
        pb.set_style(ProgressStyle::with_template(PROGRESS_STYLE).unwrap().progress_chars("#/-"));
1✔
1052
        pb.set_message(format!("{:<32}", "writing graph to GFA file"));
1✔
1053

1054
        for i in (0..self.len()).progress_with(pb) {
584✔
1055
            let n = self.get_node(i);
584✔
1056
            self.node_to_gfa(&n, wtr, dummy_opt)?;
584✔
1057
        }
1058

1059
        wtr.flush().unwrap();
1✔
1060

1061
        Ok(())
1✔
1062
    }
1✔
1063

1064
    /// Write the graph to GFA format
1065
    pub fn to_gfa_with_tags<P: AsRef<Path>, F: Fn(&Node<'_, K, D>) -> String>(
1✔
1066
        &self,
1✔
1067
        gfa_out: P,
1✔
1068
        tag_func: F,
1✔
1069
    ) -> Result<(), Error> {
1✔
1070
        let mut wtr = BufWriter::with_capacity(BUF, File::create(gfa_out).expect("error creatinf gfa file"));
1✔
1071
        writeln!(wtr, "H\tVN:Z:debruijn-rs")?;
1✔
1072

1073
        let pb = ProgressBar::new(self.len() as u64);
1✔
1074
        pb.set_style(ProgressStyle::with_template(PROGRESS_STYLE).unwrap().progress_chars("#/-"));
1✔
1075
        pb.set_message(format!("{:<32}", "writing graph to GFA file"));
1✔
1076

1077
        for i in (0..self.len()).progress_with(pb) {
584✔
1078
            let n = self.get_node(i);
584✔
1079
            self.node_to_gfa(&n, &mut wtr, Some(&tag_func))?;
584✔
1080
        }
1081

1082
        wtr.flush().unwrap();
1✔
1083

1084
        Ok(())
1✔
1085
    }
1✔
1086

1087
    /// Write the graph to GFA format, with multithreading, 
1088
    /// pass `tag_func=None` to write without tags
1089
    pub fn to_gfa_otags_parallel<P: AsRef<Path> + Display + Sync, F: Fn(&Node<'_, K, D>) -> String + Sync>(
1✔
1090
        &self,
1✔
1091
        gfa_out: P,
1✔
1092
        tag_func: Option<&F>,
1✔
1093
    ) -> Result<(), Error> 
1✔
1094
    where 
1✔
1095
    K: Sync,
1✔
1096
    D: Sync,
1✔
1097
    {
1098
        // split into ranges according to thread count
1099
        let slices = current_num_threads();
1✔
1100
        let n_nodes = self.len();
1✔
1101
        let sz = n_nodes / slices + 1;
1✔
1102

1103
        debug!("n_nodes: {}", n_nodes);
1✔
1104
        debug!("sz: {}", sz);
1✔
1105

1106
        let mut parallel_ranges = Vec::with_capacity(slices);
1✔
1107
        let mut start = 0;
1✔
1108
        while start < n_nodes {
5✔
1109
            parallel_ranges.push(start..start + sz);
4✔
1110
            start += sz;
4✔
1111
        }
4✔
1112

1113
        let last_start = parallel_ranges.pop().expect("no kmers in parallel ranges").start;
1✔
1114
        parallel_ranges.push(last_start..n_nodes);
1✔
1115
        debug!("parallel ranges: {:?}", parallel_ranges);
1✔
1116

1117
        let mut files = Vec::with_capacity(current_num_threads());
1✔
1118

1119
        for i in 0..parallel_ranges.len() {
4✔
1120
            files.push(format!("{}-{}.gfa", gfa_out, i));
4✔
1121
        } 
4✔
1122

1123
        let pb = ProgressBar::new(self.len() as u64);
1✔
1124
        pb.set_style(ProgressStyle::with_template(PROGRESS_STYLE).unwrap().progress_chars("#/-"));
1✔
1125
        pb.set_message(format!("{:<32}", "writing graph to GFA file"));
1✔
1126
        
1127
        
1128
        parallel_ranges.into_par_iter().enumerate().for_each(|(i, range)| {
4✔
1129
            let mut wtr = BufWriter::with_capacity(BUF, File::create(&files[i]).expect("error creating parallel gfa file"));
4✔
1130

1131
            for i in range {
584✔
1132
                let n = self.get_node(i);
584✔
1133
                self.node_to_gfa(&n, &mut wtr, tag_func).unwrap();
584✔
1134
                pb.inc(1);
584✔
1135
            }
584✔
1136

1137
            wtr.flush().unwrap();
4✔
1138
        });
4✔
1139

1140
        pb.finish_and_clear();
1✔
1141

1142
        // combine files
1143
        let mut out_file = BufWriter::with_capacity(BUF, File::create(format!("{}.gfa", gfa_out)).expect("error creating combined gfa file"));
1✔
1144
        writeln!(out_file, "H\tVN:Z:debruijn-rs")?;
1✔
1145

1146
        let pb = ProgressBar::new(files.len() as u64);
1✔
1147
        pb.set_style(ProgressStyle::with_template(PROGRESS_STYLE).unwrap().progress_chars("#/-"));
1✔
1148
        pb.set_message(format!("{:<32}", "combining files"));
1✔
1149

1150
        for file in files.iter() {
4✔
1151
            let open_file = File::open(file).expect("error opening parallel gfa file");
4✔
1152
            let mut reader = BufReader::new(open_file);
4✔
1153
            let mut buffer = [0; BUF];
4✔
1154

1155
            loop {
1156
                let linecount = reader.read(&mut buffer).unwrap();
8✔
1157
                if linecount == 0 { break }
8✔
1158
                out_file.write_all(&buffer[..linecount]).unwrap();
4✔
1159
            }
1160

1161
            remove_file(file).unwrap();
4✔
1162
        }
1163

1164
        out_file.flush().unwrap();
1✔
1165

1166
        Ok(())
1✔
1167
    }
1✔
1168

1169
    /// Write the graph to GFA format
1170
    pub fn to_gfa_partial<P: AsRef<Path>, F: Fn(&Node<'_, K, D>) -> String>(&self, gfa_out: P, tag_func: Option<&F>, nodes: Vec<usize>) -> Result<(), Error> {
1✔
1171
        let mut wtr = BufWriter::with_capacity(BUF, File::create(gfa_out).expect("error creating gfa file"));
1✔
1172
        writeln!(wtr, "H\tVN:Z:debruijn-rs")?;
1✔
1173

1174
        let pb = ProgressBar::new(self.len() as u64);
1✔
1175
        pb.set_style(ProgressStyle::with_template(PROGRESS_STYLE).unwrap().progress_chars("#/-"));
1✔
1176
        pb.set_message(format!("{:<32}", "writing graph to GFA file"));
1✔
1177

1178
        for i in nodes.into_iter().progress_with(pb) {
4✔
1179
            let n = self.get_node(i);
4✔
1180
            self.node_to_gfa(&n, &mut wtr, tag_func)?;
4✔
1181
        }
1182

1183
        wtr.flush().unwrap();
1✔
1184

1185
        Ok(())    
1✔
1186
    }
1✔
1187

1188
    fn node_to_tsv<W: Write, F>(&self, writer: &mut W, node_id: usize, data_format: F) -> Result<(), Box<dyn std::error::Error>> 
536✔
1189
    where 
536✔
1190
        F: Fn(&Node<'_, K, D>) -> String,
536✔
1191
    {
1192

1193
        let node = self.get_node(node_id);
536✔
1194
        let l_e = node.l_edges();
536✔
1195
        let r_e = node.r_edges();
536✔
1196

1197
        
1198
        if self.base.stranded {
536✔
1199
            // format: node id    l nb    r nb    seq    data
1200
            let l_nb = l_e.iter().map(|(_b, nb, _d, _f)| *nb).collect::<Vec<_>>();
209✔
1201
            let r_nb = r_e.iter().map(|(_b, nb, _d, _f)| *nb).collect::<Vec<_>>();
209✔
1202
            writeln!(writer, "{node_id}\t{:?}\t{:?}\t{}\t{}", l_nb, r_nb, node.sequence(), data_format(&node))?
209✔
1203
        } else {
1204
            // format: node id    l nb    l inc dir    r nb    r inc dir    seq    data
1205
            let l_nb = l_e.iter().map(|(_b, nb, _d, _f)| *nb).collect::<Vec<_>>();
327✔
1206
            let r_nb = r_e.iter().map(|(_b, nb, _d, _f)| *nb).collect::<Vec<_>>();
327✔
1207
            let l_nb_dirs = l_e.iter().map(|(_b, _nb, dir, _f)| *dir).collect::<Vec<_>>();
327✔
1208
            let r_nb_dirs = r_e.iter().map(|(_b, _nb, dir, _f)| *dir).collect::<Vec<_>>();
327✔
1209
            writeln!(writer, "{node_id}\t{:?}\t{:?}\t{:?}\t{:?}\t{}\t{}", l_nb, l_nb_dirs, r_nb, r_nb_dirs, node.sequence(), data_format(&node))?
327✔
1210
        }
1211

1212
        Ok(())
536✔
1213
    }
536✔
1214

1215
    /// save the graph as a tsv file with custom formatting for the node data
1216
    pub fn to_tsv<P, F>(&self, path: P, data_format: F) -> Result<(), Box<dyn std::error::Error>> 
2✔
1217
    where 
2✔
1218
        F: Fn(&Node<'_, K, D>) -> String,
2✔
1219
        P: AsRef<Path> + Display,
2✔
1220
    { 
1221
        let mut writer = BufWriter::new(File::create(path)?);
2✔
1222

1223
        // different format if stranded vs unstranded
1224
        if self.base.stranded {
2✔
1225
            writeln!(writer, "node id\tleft neighbors\tright neighbors\tsequence\tdata")?;
1✔
1226
        } else {
1227
            writeln!(writer, "node id\tleft neighbors\tleft nb incoming dirs\tright neighbors\tright nb incoming dirs\tsequence\tdata")?;
1✔
1228
        }
1229
            
1230
        for i in 0..self.len() {
536✔
1231
            self.node_to_tsv(&mut writer, i, &data_format)?
536✔
1232
        }
1233
        
1234
        Ok(())
2✔
1235
    }
2✔
1236

1237
    pub fn to_json_rest<W: Write, F: Fn(&D) -> Value>(
×
1238
        &self,
×
1239
        fmt_func: F,
×
1240
        mut writer: &mut W,
×
1241
        rest: Option<Value>,
×
1242
    ) {
×
1243
        writeln!(writer, "{{\n\"nodes\": [").unwrap();
×
1244
        for i in 0..self.len() {
×
1245
            let node = self.get_node(i);
×
1246
            node.to_json(&fmt_func, writer);
×
1247
            if i == self.len() - 1 {
×
1248
                writeln!(writer).unwrap();
×
1249
            } else {
×
1250
                writeln!(writer, ",").unwrap();
×
1251
            }
×
1252
        }
1253
        writeln!(writer, "],").unwrap();
×
1254

1255
        writeln!(writer, "\"links\": [").unwrap();
×
1256
        for i in 0..self.len() {
×
1257
            let node = self.get_node(i);
×
1258
            match node.edges_to_json(writer) {
×
1259
                true => {
1260
                    if i == self.len() - 1 {
×
1261
                        writeln!(writer).unwrap();
×
1262
                    } else {
×
1263
                        writeln!(writer, ",").unwrap();
×
1264
                    }
×
1265
                }
1266
                _ => continue,
×
1267
            }
1268
        }
1269
        writeln!(writer, "]").unwrap();
×
1270

1271
        match rest {
×
1272
            Some(Value::Object(v)) => {
×
1273
                for (k, v) in v.iter() {
×
1274
                    writeln!(writer, ",").expect("io error");
×
1275
                    write!(writer, "\"{}\": ", k).expect("io error");
×
1276
                    serde_json::to_writer(&mut writer, v).expect("io error");
×
1277
                    writeln!(writer).expect("io error");
×
1278
                }
×
1279
            }
1280
            _ => {
×
1281
                writeln!(writer).expect("io error");
×
1282
            }
×
1283
        }
1284

1285
        writeln!(writer, "}}").expect("io error");
×
1286
    }
×
1287

1288
    /// Write the graph to JSON
1289
    pub fn to_json<W: Write, F: Fn(&D) -> Value, RF: Fn(&mut W)>(
×
1290
        &self,
×
1291
        fmt_func: F,
×
1292
        writer: &mut W,
×
1293
    ) {
×
1294
        self.to_json_rest(fmt_func, writer, None);
×
1295
    }
×
1296

1297
    /// Print a text representation of the graph.
1298
    pub fn print(&self) {
3✔
1299
        println!("DebruijnGraph {{ len: {}, K: {} }} :", self.len(), K::k());
3✔
1300
        for node in self.iter_nodes() {
140✔
1301
            println!("{:?}", node);
140✔
1302
        }
140✔
1303
    }
3✔
1304

1305
    pub fn print_with_data(&self) {
×
1306
        println!("DebruijnGraph {{ len: {}, K: {} }} :", self.len(), K::k());
×
1307
        for node in self.iter_nodes() {
×
1308
            println!("{:?} ({:?})", node, node.data());
×
1309
        }
×
1310
    }
×
1311

1312
    pub fn max_path_beam<F, F2>(&self, beam: usize, score: F, _solid_path: F2) -> Vec<(usize, Dir)>
×
1313
    where
×
1314
        F: Fn(&D) -> f32,
×
1315
        F2: Fn(&D) -> bool,
×
1316
    {
1317
        if self.is_empty() {
×
1318
            return Vec::default();
×
1319
        }
×
1320

1321
        let mut states = Vec::new();
×
1322

1323
        for i in 0..self.len() {
×
1324
            let node = self.get_node(i);
×
1325

1326
            // Initialize beam search on terminal nodes
1327
            if node.exts().num_exts_l() == 0 || node.exts().num_exts_r() == 0 {
×
1328
                let dir = if node.exts().num_exts_l() > 0 {
×
1329
                    Dir::Right
×
1330
                } else {
1331
                    Dir::Left
×
1332
                };
1333

1334
                let status = if node.exts().num_exts_l() == 0 && node.exts().num_exts_r() == 0 {
×
1335
                    Status::End
×
1336
                } else {
1337
                    Status::Active
×
1338
                };
1339

1340
                let mut path = SmallVec8::new();
×
1341
                path.push((i as u32, dir));
×
1342

1343
                let s = State {
×
1344
                    path,
×
1345
                    status,
×
1346
                    score: score(node.data()),
×
1347
                };
×
1348
                states.push(s);
×
1349
            }
×
1350
        }
1351

1352
        // No end nodes -- just start on the first node
1353
        if states.is_empty() {
×
1354
            // Make a start
×
1355
            let node = self.get_node(0);
×
1356
            let mut path = SmallVec8::new();
×
1357
            path.push((0, Dir::Left));
×
1358
            states.push(State {
×
1359
                path,
×
1360
                status: Status::Active,
×
1361
                score: score(node.data()),
×
1362
            });
×
1363
        }
×
1364

1365
        // Beam search until we can't find any more expansions
1366
        let mut active = true;
×
1367
        while active {
×
1368
            let mut new_states = Vec::with_capacity(states.len());
×
1369
            active = false;
×
1370

1371
            for s in states {
×
1372
                if s.status == Status::Active {
×
1373
                    active = true;
×
1374
                    let expanded = self.expand_state(&s, &score);
×
1375
                    new_states.extend(expanded);
×
1376
                } else {
×
1377
                    new_states.push(s)
×
1378
                }
1379
            }
1380

1381
            // workaround to sort by descending score - will panic if there are NaN scores
1382
            new_states.sort_by(|a, b| (-(a.score)).partial_cmp(&-(b.score)).unwrap());
×
1383
            new_states.truncate(beam);
×
1384
            states = new_states;
×
1385
        }
1386

1387
        for (i, state) in states.iter().take(5).enumerate() {
×
1388
            trace!("i:{}  -- {:?}", i, state);
×
1389
        }
1390

1391
        // convert back to using usize for node_id
1392
        states[0]
×
1393
            .path
×
1394
            .iter()
×
1395
            .map(|&(node, dir)| (node as usize, dir))
×
1396
            .collect()
×
1397
    }
×
1398

1399
    fn expand_state<F>(&self, state: &State, score: &F) -> SmallVec4<State>
×
1400
    where
×
1401
        F: Fn(&D) -> f32,
×
1402
    {
1403
        if state.status != Status::Active {
×
1404
            panic!("only attempt to expand active states")
×
1405
        }
×
1406

1407
        let (node_id, dir) = state.path[state.path.len() - 1];
×
1408
        let node = self.get_node(node_id as usize);
×
1409
        let mut new_states = SmallVec4::new();
×
1410

1411
        for (_, next_node_id, incoming_dir, _) in node.edges(dir.flip()) {
×
1412
            let next_node = self.get_node(next_node_id);
×
1413
            let new_score = state.score + score(next_node.data());
×
1414

1415
            let cycle = state
×
1416
                .path
×
1417
                .iter()
×
1418
                .any(|&(prev_node, _)| prev_node == (next_node_id as u32));
×
1419

1420
            let status = if cycle {
×
1421
                Status::Cycle
×
1422
            } else if next_node.edges(incoming_dir.flip()).is_empty() {
×
1423
                Status::End
×
1424
            } else {
1425
                Status::Active
×
1426
            };
1427

1428
            let mut new_path = state.path.clone();
×
1429
            new_path.push((next_node_id as u32, incoming_dir));
×
1430

1431
            let next_state = State {
×
1432
                path: new_path,
×
1433
                score: new_score,
×
1434
                status,
×
1435
            };
×
1436

1437
            new_states.push(next_state);
×
1438
        }
1439

1440
        new_states
×
1441
    }
×
1442

1443

1444
    pub fn iter_components(&'_ self) -> IterComponents<'_, K, D> {
5✔
1445
        let mut visited: Vec<bool> = Vec::with_capacity(self.len());
5✔
1446
        let pos = 0;
5✔
1447

1448
        for _i in 0..self.len() {
1,128✔
1449
            visited.push(false);
1,128✔
1450
        }
1,128✔
1451

1452
        IterComponents { 
5✔
1453
            graph: self, 
5✔
1454
            visited, 
5✔
1455
            pos }
5✔
1456
    }
5✔
1457

1458

1459
    /// iteratively returns 2D Vec with node_ids grouped according to the connected components they form
1460
    pub fn components_i(&self) -> Vec<Vec<usize>> {
1✔
1461
        let mut components: Vec<Vec<usize>> = Vec::with_capacity(self.len());
1✔
1462
        let mut visited: Vec<bool> = Vec::with_capacity(self.len());
1✔
1463

1464
        for _i in 0..self.len() {
136✔
1465
            visited.push(false);
136✔
1466
        }
136✔
1467

1468
        for i in 0..self.len() {
136✔
1469
            if !visited[i] {
136✔
1470
                let comp = self.component_i(&mut visited, i);
2✔
1471
                components.push(comp);
2✔
1472
            }
134✔
1473
        }
1474

1475
        components
1✔
1476
    }
1✔
1477

1478
    /// recursively detects which nodes form separate graph components
1479
    /// returns 2D vector with node ids per component
1480
    /// (may lead to stack overflow)
1481
    pub fn components_r(&self) -> Vec<Vec<usize>> {
2✔
1482
        let mut components: Vec<Vec<usize>> = Vec::with_capacity(self.len());
2✔
1483
        let mut visited: Vec<bool> = Vec::with_capacity(self.len());
2✔
1484

1485
        for _i in 0..self.len() {
138✔
1486
            visited.push(false);
138✔
1487
        }
138✔
1488

1489
        for i in 0..self.len() {
138✔
1490
            if !visited[i] {
138✔
1491
                let mut comp: Vec<usize> = Vec::new();
4✔
1492
                self.component_r(&mut visited, i, &mut comp);
4✔
1493
                components.push(comp);
4✔
1494
            }
134✔
1495
        }
1496

1497
        components
2✔
1498

1499
    }
2✔
1500

1501
    fn component_r<'a>(&'a self, visited: &'a mut Vec<bool>, i: usize, comp: &'a mut Vec<usize>) {
138✔
1502
        
1503
        visited[i] = true;
138✔
1504
        comp.push(i);
138✔
1505
        let mut edges = self.find_edges(i, Dir::Left);
138✔
1506
        let mut r_edges = self.find_edges(i, Dir::Right);
138✔
1507

1508
        edges.append(&mut r_edges);
138✔
1509

1510
        for (_, edge, _, _) in edges.iter() {
268✔
1511
            if !visited[*edge] {
268✔
1512
                self.component_r(visited, *edge, comp);
134✔
1513
            }
134✔
1514
        }
1515
    }
138✔
1516

1517
    fn component_i<'a>(&'a self, visited: &'a mut [bool], i: usize) -> Vec<usize> {
243✔
1518
        let mut edges: Vec<usize> = Vec::new();
243✔
1519
        let mut comp: Vec<usize> = Vec::new();
243✔
1520

1521
        edges.push(i);
243✔
1522

1523
        while let Some(current_edge) = edges.pop() {
1,538✔
1524
            if !visited[current_edge] { 
1,295✔
1525
                comp.push(current_edge);
1,264✔
1526
                visited[current_edge] = true;
1,264✔
1527

1528
                let mut l_edges = self.find_edges(current_edge, Dir::Left);
1,264✔
1529
                let mut r_edges = self.find_edges(current_edge, Dir::Right);
1,264✔
1530

1531
                l_edges.append(&mut r_edges);
1,264✔
1532

1533
                for (_, new_edge, _, _) in l_edges.into_iter() {
2,108✔
1534
                    if !visited[new_edge] {
2,108✔
1535
                        edges.push(new_edge);
1,052✔
1536
                    }
1,056✔
1537
                }
1538
            }
31✔
1539
        }
1540
        comp
243✔
1541
    }
243✔
1542

1543
    /// iterate over all edges of the graph, item: (node, ext base, ext dir, target node)
1544
    pub fn iter_edges(&self) -> EdgeIter<'_, K, D> {
17✔
1545
        EdgeIter::new(self)
17✔
1546
    }
17✔
1547

1548
    pub fn find_bad_nodes<F: Fn(&Node<'_, K, D>) -> bool>(&self, valid: F) -> Vec<usize> {
×
1549
        let mut bad_nodes = Vec::new();
×
1550

1551
        for (i, node) in enumerate(self.iter_nodes()) {
×
1552
            if !valid(&node) { bad_nodes.push(i); }
×
1553
        }
1554
        
1555
        bad_nodes
×
1556
    }
×
1557
}
1558

1559
impl<K: Kmer, SD: Debug> DebruijnGraph<K, SD> {
1560
    pub fn create_colors<'a, 'b: 'a, DI>(&'a self, config: &SummaryConfig, color_mode: ColorMode<'b>) -> Colors<'b, SD, DI> 
×
1561
    where 
×
1562
    SD: SummaryData<DI>,
×
1563
    {
1564
        Colors::new(self, config, color_mode)
×
1565
    }
×
1566
    
1567
    /// [`crate::EdgeMult`] will contain hanging edges if the nodes were filtered
1568
    pub fn fix_edge_mults<DI>(&mut self) 
1✔
1569
    where 
1✔
1570
        SD: SummaryData<DI>
1✔
1571
    {
1572
        if self.get_node(0).data().edge_mults().is_some() {
1✔
1573
            for i in 0..self.len() {
136✔
1574
                self.base.data[i].fix_edge_mults(self.base.exts[i]);
136✔
1575
            }
136✔
1576
        }
×
1577
    }
1✔
1578

1579
    /// if there are [`crate::EdgeMult`]s in the data, prune the graph by removing edges that have a low coverage
1580
    pub fn filter_edges<DI>(&mut self, min: u32) -> Result<(), String>
1✔
1581
    where 
1✔
1582
        SD: SummaryData<DI>
1✔
1583
    {
1584
        // return if there is no edge coverage available
1585
        if self.get_node(0).data().edge_mults().is_none() { return Err(String::from("no edge mults available")) };
1✔
1586

1587
        for i in 0..self.len() {
136✔
1588
            let em = self.get_node(i).data().edge_mults().expect("shold have em").clone();
136✔
1589
            let edges = [(Dir::Left, 0), (Dir::Left, 1), (Dir::Left, 2), (Dir::Left, 3), 
136✔
1590
                (Dir::Right, 0), (Dir::Right, 1), (Dir::Right, 2), (Dir::Right, 3)];
136✔
1591
            
1592
            for (dir, base) in edges {
1,088✔
1593
                if min > em.edge_mult(base, dir) {
1,088✔
1594
                    // remove invalid ext from node
820✔
1595
                    let ext = self.base.exts[i].remove(dir, base);
820✔
1596
                    self.base.exts[i] = ext;
820✔
1597
                }
820✔
1598
            }
1599
        }
1600

1601
        Ok(())
1✔
1602
    }
1✔
1603

1604
    /// if a node has a connection to a high quality node and low quality nodes 
1605
    /// in the same direction, remove the connections to the low quality ndoes
1606
    pub fn remove_lq_splits<DI>(&mut self, min_quality: BaseQuality) -> Result<(), String>
2✔
1607
    where 
2✔
1608
        SD: SummaryData<DI>
2✔
1609
    {
1610
        // check we do indeed have quality and graph is stranded
1611
        if self.get_node(0).data().quality().is_none() { return Err(String::from("no quality scores available")); }
2✔
1612
        if !self.base.stranded { return Err(String::from("graph must be stranded to remove ladders")) };
2✔
1613

1614
        // iterate over all nodes and look in both directions if there is a low quality node splitting off
1615
        for (node_id, out_dir) in (0..self.len()).flat_map(|id| [(id, Dir::Right), (id, Dir::Left)]) {
582✔
1616
            let out_edges = self.get_node(node_id).edges(out_dir);
582✔
1617

1618
            // check for split
1619
            if out_edges.len() < 2 {
582✔
1620
                // no split, move on
1621
                continue;
556✔
1622
            }
26✔
1623

1624
            // get qualities 
1625
            let nb_qualities = out_edges
26✔
1626
                .iter()
26✔
1627
                .map(|(_, nb_id, _, _)| (*nb_id, self
52✔
1628
                    .get_node(*nb_id)
52✔
1629
                    .data()
52✔
1630
                    .quality()
52✔
1631
                    .unwrap()
52✔
1632
                )).collect::<Vec<_>>();
26✔
1633

1634
            // check for good neighbor
1635
            let has_good_nb = nb_qualities.iter()
26✔
1636
                .any(|(_, quality)| *quality >= min_quality);
40✔
1637

1638
            if !has_good_nb {
26✔
1639
                // split but no good path, move on
1640
                continue;
4✔
1641
            }
22✔
1642

1643
            // remove split with low quality
1644
            for (nb_id, _) in nb_qualities.iter().filter(|(_, quality)| *quality < min_quality ) {
44✔
1645
                let path = vec![node_id, *nb_id];
22✔
1646
                if self.remove_path(path, out_dir).is_err() {
22✔
NEW
1647
                    warn!("lq tip path could not be removed")
×
1648
                }
22✔
1649
            }
1650
        }
1651

1652
        Ok(())
2✔
1653
    }
2✔
1654

1655
    /// remove bubbles/ladders and tips in which one path has a quality lower than the given `min_quality`
1656
    pub fn remove_lq_ladders_tips<DI>(&mut self, min_quality: BaseQuality) -> Result<(), String>
2✔
1657
    where
2✔
1658
        SD: SummaryData<DI>
2✔
1659
    {
1660
        // check we do indeed have quality and graph is stranded
1661
        if self.get_node(0).data().quality().is_none() { return Err(String::from("no quality scores available")); }
2✔
1662
        if !self.base.stranded { return Err(String::from("graph must be stranded to remove ladders")) };
2✔
1663

1664
        let min_path = 2 * K::k() - 1;
2✔
1665
        //let max_path = 4 * K::k() - 1;
1666

1667
        // iterate over nodes
1668
        for (node_id, out_dir) in (0..self.len()).flat_map(|id| [(id, Dir::Right), (id, Dir::Left)]) {
582✔
1669
            let in_dir = out_dir.flip();
582✔
1670

1671
            // check if node has multile outs to the right, at least one with bad quality and one with good quality
1672
            let node_out_edges = self.get_node(node_id).edges(out_dir);
582✔
1673

1674
            let good_neighbors = node_out_edges.iter()
582✔
1675
                .map(|(_, target_id, _, _)| *target_id)
582✔
1676
                .filter(|target_id| self.get_node(*target_id)
582✔
1677
                    .data()
481✔
1678
                    .quality()
481✔
1679
                    .unwrap() >= min_quality
481✔
1680
                ).collect::<Vec<_>>();
582✔
1681

1682
            let bad_neighbors = node_out_edges.iter()
582✔
1683
                .map(|(_, target_id, _, _)| *target_id)
582✔
1684
                .filter(|target_id| self.get_node(*target_id)
582✔
1685
                    .data()
481✔
1686
                    .quality()
481✔
1687
                    .unwrap() < min_quality
481✔
1688
                ).collect::<Vec<_>>();
582✔
1689

1690
            if good_neighbors.is_empty() | bad_neighbors.is_empty() { continue; }
582✔
1691

1692
            // follow the bad quality paths until we reach nodes with high quality again (or max search radius)
1693
            let mut possible_paths = Vec::new();
14✔
1694
            let mut tips = Vec::new();
14✔
1695

1696
            for bn in bad_neighbors {
14✔
1697
                let mut current_node_id = bn;
14✔
1698
                let mut path_groups = vec![vec![node_id]];
14✔
1699
                let mut path_length = K::k() - 1;
14✔
1700

1701
                let mut state = LadderState::Singular;
14✔
1702
                let mut q_state_high = false;
14✔
1703

1704
                loop {
1705
                    // check if path has reached max length -> interrupt
1706
                    /* if path_length > max_path {
1707
                        break;
1708
                    } */
1709

1710
                    let current_node = self.get_node(current_node_id);
175✔
1711
                    let out_edges = current_node.edges(out_dir);
175✔
1712

1713
                    // add current node length to path
1714
                    path_length += current_node.len() - K::k() + 1;
175✔
1715
                    
1716
                    // check state and add current node to path
1717
                    let path_index = path_groups.len() - 1;
175✔
1718

1719
                    // if in singular state now or before, add node to path
1720
                    if matches!(state, LadderState::Singular) { 
175✔
1721
                        path_groups[path_index].push(current_node_id);
149✔
1722
                    }
149✔
1723

1724
                    // check if we increase ladder state
1725
                    let q_increase = (current_node.data().quality().unwrap() >= min_quality) & !q_state_high;
175✔
1726
                    let mult_increase = current_node.edges(in_dir).len() > 1;
175✔
1727
                    
1728
                    if q_increase {
175✔
1729
                        q_state_high = true
15✔
1730
                    }
160✔
1731

1732
                    if q_increase | mult_increase {
175✔
1733
                        match state {
17✔
1734
                            LadderState::Singular => {
17✔
1735
                                state = LadderState::Double;
17✔
1736
                            }
17✔
NEW
1737
                            LadderState::Double => () // ignore
×
1738
                        };
1739
                    }
158✔
1740

1741
                    // check if we decrease ladder state
1742
                    let q_decrease = (current_node.data().quality().unwrap() < min_quality) & q_state_high;
175✔
1743
                    let mult_decrease = out_edges.len() > 1;
175✔
1744

1745
                    if q_decrease {
175✔
1746
                        q_state_high = false;
5✔
1747
                    }
170✔
1748

1749
                    if q_decrease | mult_decrease {
175✔
1750
                        match state {
13✔
1751
                            LadderState::Singular => (), // ignore
5✔
1752
                            LadderState::Double => {
8✔
1753
                                state = LadderState::Singular;
8✔
1754
                                path_groups.push(vec![current_node_id]); // start new path group
8✔
1755
                            }
8✔
1756
                        }
1757
                    }
162✔
1758

1759
                    // check if we have met end criterium -> save path
1760
                    let quality_req =  current_node.data().quality().unwrap() >= min_quality;
175✔
1761
                    let len_req = path_length >= min_path;
175✔
1762
                    // path is a simple tip -> save as tip
1763
                    let is_tip = out_edges.is_empty() & (path_groups.len() == 1); // TODO maybe remove req 2 in future
175✔
1764

1765
                    if quality_req & len_req {
175✔
1766
                        possible_paths.push(path_groups);
10✔
1767
                        break;
10✔
1768
                    } else if is_tip {
165✔
1769
                        tips.push(path_groups);
2✔
1770
                        break;
2✔
1771
                    }
163✔
1772

1773
                    // we have not met the conditions and keep moving
1774

1775
                    // find next node
1776
                    let next_node_id = if out_edges.len() == 1 {
163✔
1777
                        let (_, next_node_id, _, _) = out_edges[0];
154✔
1778
                        next_node_id
154✔
1779
                    } else if out_edges.is_empty() {
9✔
1780
                        // dead end which has not qualified as tip
1781
                        break;
2✔
1782
                    } else {
1783
                        // choose the lowest quality path
1784

1785
                        let worst_neighbor = out_edges.iter()
7✔
1786
                            .map(|(_, target_id, _, _)| (*target_id, self.get_node(*target_id)
14✔
1787
                                .data()
14✔
1788
                                .quality()
14✔
1789
                                .unwrap()))
14✔
1790
                            .min_by(|(_, q_a), (_, q_b)| q_a.cmp(q_b));
7✔
1791

1792
                        if let Some((worst_nb_id, _)) = worst_neighbor {
7✔
1793
                            worst_nb_id
7✔
1794
                        } else {
NEW
1795
                            break;
×
1796
                        }
1797
                    };
1798

1799
                    current_node_id = next_node_id;
161✔
1800
                }
1801
            }
1802

1803
            let possible_targets = possible_paths.iter().map(|p| p.last().unwrap().last().unwrap()).collect::<Vec<_>>();
14✔
1804
            let mut confirmed_targets = Vec::new();
14✔
1805
            // follow the good quality paths until we reach a possible target node (or max search radius)
1806
            // if we reached a target node, send bad path to be removed from graph
1807
            for gn in good_neighbors {
14✔
1808
                let mut path_length = K::k() - 1;
14✔
1809
                let mut current_node_id = gn;
14✔
1810

1811
                loop {
1812
                    // check if we have exceeded the search radius
1813
                    /* if path_length > max_path {
1814
                        break;
1815
                    } */
1816

1817
                    // add current node length to path length
1818
                    let current_node = self.get_node(current_node_id);
191✔
1819
                    path_length += current_node.len() - K::k() + 1;
191✔
1820

1821
                    // check if we have found a target
1822
                    if possible_targets.contains(&&current_node_id) {
191✔
1823
                        confirmed_targets.push(current_node_id);
8✔
1824
                        break;
8✔
1825
                    }
183✔
1826

1827
                    // look for next node
1828
                    let out_edges = current_node.edges(out_dir);
183✔
1829

1830
                    let next_node_id = if out_edges.len() == 1 {
183✔
1831
                        let (_, next_node_id, _, _) = out_edges[0];
177✔
1832
                        next_node_id
177✔
1833
                    } else if out_edges.is_empty() {
6✔
1834
                        // dead end, break
1835
                        break;
6✔
1836
                    } else {
1837
                        // use node with highest quality
1838
                        // TODO future: use read mapping?
NEW
1839
                        let good_neighbors = out_edges.iter()
×
NEW
1840
                            .map(|(_, target_id, _, _)| (*target_id, self.get_node(*target_id)
×
NEW
1841
                                .data()
×
NEW
1842
                                .quality()
×
NEW
1843
                                .unwrap()))
×
NEW
1844
                            .max_by(|(_, q_a), (_, q_b)| q_a.cmp(q_b));
×
NEW
1845
                        if let Some((best_nb_id, _)) = good_neighbors {
×
NEW
1846
                            best_nb_id
×
1847
                        } else {
NEW
1848
                            break;
×
1849
                        }
1850
                    };
1851

1852
                    current_node_id = next_node_id;
177✔
1853
                }
1854
            }
1855

1856
            // check if we have found end nodes of possible paths by following good quality paths
1857
            // if so, remove path
1858
            for path_group in possible_paths {
14✔
1859
                let target = path_group.last().unwrap().last().unwrap();
10✔
1860

1861
                if confirmed_targets.contains(target) {
10✔
1862
                    for path in path_group {
13✔
1863
                        if let Err(err) = self.remove_path(path.clone(), out_dir) {
13✔
NEW
1864
                            warn!("lq ladder partial path could not be removed, likely cause: loop, edges were already removed. parital path: {:?}\nerr: {err}", path)
×
1865
                        }
13✔
1866
                    }
1867
                }
2✔
1868
            }
1869

1870
            // remove tip paths
1871
            for path_group in tips {
14✔
1872
                let path = path_group.into_iter().next().expect("empty tip path found");
2✔
1873
                if let Err(err) = self.remove_path(path.clone(), out_dir) {
2✔
NEW
1874
                    warn!("lq tip partial path could not be removed, likely cause: loop, edges were already removed. parital path: {:?}\nerr: {err}", path)
×
1875
                }
2✔
1876
            }
1877

1878

1879
        }
1880

1881

1882
        Ok(())
2✔
1883
    }
2✔
1884

1885
    /// remove simple ladder structures (bubbles) caused by 1-base sequencing errors from the graph
1886
    /// 
1887
    /// graph must contain edge mults and be stranded
1888
    /// this function will likely leave tips on the graph, so it is recommended to run
1889
    /// [`DebruijnGraph::remove_tips`] afterwards
1890
    /// 
1891
    /// ladder structure refers to bubbles where one side has been compressed into one node
1892
    /// but the other has low compression, due to differences in coverage and thus data variance,
1893
    /// leading to a ladder-like appearance
1894
    pub fn remove_ladders<DI, P>(&mut self, min_diff_factor: u32, max_avg_low_cov: f32, out_path: Option<P>) -> Result<(), String> 
2✔
1895
    where 
2✔
1896
        SD: SummaryData<DI>,
2✔
1897
        P: AsRef<Path>
2✔
1898
    {
1899
        // interrupt if we dont have edge mults
1900
        if self.get_node(0).data().edge_mults().is_none() { return Err(String::from("no edge mults available")) };
2✔
1901
        if !self.base.stranded { return Err(String::from("must be stranded")) };
2✔
1902

1903
        let mut writer = out_path.map(|path| BufWriter::new(File::create(path).expect("error creating ladder stats file")));
2✔
1904
        if let Some(wtr) = writer.as_mut() {
2✔
1905
            writeln!(wtr, "avg high cov,avg low cov,high truth ratio,low truth ratio").unwrap();
×
1906
        }
2✔
1907

1908
        // iterate over nodes
1909
        for node_id in 0..self.len() {
142✔
1910
            // check if node has outgoing edge(s) with both high and low coverage
1911
            let outs = self.get_node(node_id).data().edge_mults().expect("should have em").right();
142✔
1912

1913
            let Some((out_max_base, &out_max_cov)) = outs.iter().rev().enumerate().filter(|&(_, &c)| c  > 0).max_by(|&(_b1, &c1), &(_b2, c2)| c1.cmp(c2)) else { continue };
568✔
1914
            let smaller_outs = outs.iter().copied().rev().enumerate().filter(|&(_b, c)| (c > 0) & (out_max_cov > c)).collect::<Vec<_>>();
544✔
1915

1916
            // TODO add factor back in, remove writer
1917

1918
            if smaller_outs.is_empty() { continue; }
136✔
1919

1920
            // follow path with highest coverage until target length is reached
1921
            let Some((target_node, avg_high_cov, high_cc)) = self.follow_ladder_path_high(node_id, out_max_base as u8, out_max_cov) else { continue; };
8✔
1922
            
1923
            // check all small outs
1924
            for (s_base, s_cov) in smaller_outs {
5✔
1925
                let Some((target_paths, avg_low_cov, low_cc)) = self.follow_ladder_path_low(node_id, s_base as u8, s_cov) else { continue; };
5✔
1926
                let other_target_node = *target_paths.last().expect("should have at least one element").last().expect("should have at least two elements");
5✔
1927

1928
                if target_node == other_target_node {
5✔
1929
                    // the two paths landed on the same node -> remove all edges in the low coverage path
1930
                    if (s_cov * min_diff_factor <= out_max_cov) & (avg_low_cov <= max_avg_low_cov) {
2✔
1931
                        for path in target_paths.iter() {
7✔
1932
                            if self.remove_path(path.clone(), Dir::Right).is_err() {
7✔
1933
                                warn!("removing ladders: partial path could not be removed, likely cause: loop, edges were already removed. parital path: {:?}", path)
×
1934
                            }
7✔
1935
                        } 
1936
                    }
×
1937
                    
1938
                    if let Some(wtr) = writer.as_mut() {
2✔
1939
                        writeln!(wtr, "{},{},{},{}", avg_high_cov, avg_low_cov, high_cc, low_cc).unwrap();
×
1940
                    }
2✔
1941
                }
3✔
1942
            }
1943
        }
1944

1945
        Ok(())
2✔
1946
    }
2✔
1947

1948
    /// follow the presumably erroneous ladder path with low coverage
1949
    fn follow_ladder_path_low<DI>(&self, start_node_id: usize, start_ext: u8, start_cov: u32) -> Option<(Vec<Vec<usize>>, f32, f32)> 
5✔
1950
    where SD: SummaryData<DI>
5✔
1951
    {
1952
        // state is switched if coverage rises by 20% + 2 (so it's at least 2 more)
1953
        // nodes with higher state are kept connected together
1954
        // tests have shown that increase in coverage does not necessarily indicate 
1955
        // a multiplicity change
1956
        // however, we are trying to avoid false positives - false negatives will 
1957
        // most likely be cut off from the component or would be removed by remove_tips
1958
        // as a next step
1959
        // increasing these values will increase the number of removed edges
1960
        const COV_STATE_FACTOR: f32 = 0.2; 
1961
        const COV_STATE_ADD: f32 = 2.;
1962

1963
        // path, including start and target node
1964
        let mut paths = Vec::new();
5✔
1965
        paths.push(Vec::new());
5✔
1966
        paths[0].push(start_node_id);
5✔
1967

1968
        let target_length = K::k();
5✔
1969
        let mut path_length = 0;
5✔
1970
        let mut n_correct_edges = 0;
5✔
1971

1972
        // get fist next node
1973
        let sequence = self.base.sequences.get(start_node_id);
5✔
1974
        let term_kmer: K = sequence.term_kmer(Dir::Right);
5✔
1975
        let next_kmer = term_kmer.extend(start_ext, Dir::Right);
5✔
1976
        let (mut current_node_id, _, _) = self.find_link(next_kmer, Dir::Right).expect("link should exist"); 
5✔
1977
        paths[0].push(current_node_id);
5✔
1978

1979
        if self.check_edge_truth(start_node_id, current_node_id) {
5✔
1980
            n_correct_edges += 1;
×
1981
        }
5✔
1982

1983
        let mut current_cov = start_cov as f32;
5✔
1984
        let mut sum_path_cov = start_cov as f32;
5✔
1985
        let mut coverage_counter = 1; 
5✔
1986

1987
        // state to track if nodes should be removed or we're suspecting there's another path here
1988
        // only track edges for removal while state is Singular
1989
        // only track one additional path, else too complicated, return None
1990
        // increase state if there is another incoming edge or if the coverage suddenly increases drastically
1991
        // decrease state if there is another outgoing edge ot if the coverage suddenly decreases drastically
1992
        // if state cannot be increased or decreased further, return None
1993
        let mut state = LadderState::Singular;
5✔
1994

1995
        loop {
1996
            // check if current node is "target node", i.e., the path has reached the desired length
1997
            match path_length {
59✔
1998
                len if len == target_length => return Some((paths, (sum_path_cov / coverage_counter as f32), (n_correct_edges as f32 / (path_length + 1) as f32))), // path has target length
59✔
1999
                len if len > target_length => return None, // path has surpassed desired length => invalid
54✔
2000
                _ => () // path has not yet reached desired length, continue
54✔
2001
            }
2002

2003
            // since current node is not target node, add length to path
2004
            let len = self.get_node(current_node_id).len() - (K::k() - 1);
54✔
2005
            path_length += len;
54✔
2006

2007
            // get current node
2008
            let current_node = self.get_node(current_node_id);
54✔
2009
            
2010
            // get edges
2011
            let in_edges = current_node.l_edges();
54✔
2012
            let out_edges = current_node.r_edges();
54✔
2013

2014
            // get coverage
2015
            // edge cov must be available
2016
            let edge_coverages = current_node.data().edge_mults().expect("must have edge mults");
54✔
2017

2018
            // choose next edge by choosing coverage closest to current coverage
2019
            let mut out_ext = None;
54✔
2020
            let mut out_node_id = None;
54✔
2021
            let mut cov_diff = i32::MAX;
54✔
2022
            for (e, id, _, _) in out_edges.iter() {
59✔
2023
                let cov = edge_coverages.edge_mult(*e, Dir::Right) as i32;
59✔
2024
                let new_cov_diff = (current_cov as i32 - cov).abs();
59✔
2025
                if new_cov_diff < cov_diff {
59✔
2026
                    (out_ext, out_node_id, cov_diff) = (Some(*e), Some(*id), new_cov_diff);
59✔
2027
                }
59✔
2028
            }
2029
            let (Some(out_ext), Some(out_node_id)) = (out_ext, out_node_id) else { return None; };
54✔
2030

2031
            // ideally, low path nodes should have one incoming and one outgoing edge
2032
            // if two incoming edges, increase state from singular to double if possible
2033
            match in_edges.len() {
54✔
2034
                1 => (),
49✔
2035
                2 => match state {
5✔
2036
                    LadderState::Singular => state = LadderState::Double,
5✔
2037
                    LadderState::Double => return None // getting too complicated
×
2038
                }
2039
                _ => return None
×
2040
            }
2041

2042
            // if two outgoing edges, decrease state from double to singular if possible
2043
            // if already singular, 
2044
            match out_edges.len() {
54✔
2045
                1 => (),
49✔
2046
                2 => match state {
5✔
2047
                    LadderState::Singular => (), // could return None, but would kill if overlap is only one node long
1✔
2048
                    LadderState::Double => {
4✔
2049
                        state = LadderState::Singular;
4✔
2050
                        paths.push(vec![current_node_id]); // start new path
4✔
2051
                    }
4✔
2052
                }
2053
                _ => return None
×
2054
            }
2055

2056
            // check coverage, in theoretical ladder, coverage should be uniform
2057
            let coverage = edge_coverages.edge_mult(out_ext, Dir::Right) as f32;
54✔
2058
            match state {
54✔
2059
                LadderState::Singular => {
2060
                    // single state: check if next coverage is similar enough to current coverage
2061
                    // if way bigger, increase state, else adapt current coverage
2062
                    if  coverage > current_cov + current_cov * COV_STATE_FACTOR + COV_STATE_ADD {
34✔
2063
                        // higher by too much, change state
2✔
2064
                        state = LadderState::Double;
2✔
2065
                    } else {
32✔
2066
                        // in acceptable frame
32✔
2067
                        current_cov = coverage;
32✔
2068
                    }
32✔
2069
                }
2070
                LadderState::Double => {
2071
                    // double state: if way smaller (back in acceptable frame), decrease state, else dont treat as current coverage
2072
                    if coverage < current_cov + current_cov * COV_STATE_FACTOR + COV_STATE_ADD {
20✔
2073
                        // back in acceptable range
2✔
2074
                        state = LadderState::Singular;
2✔
2075
                        paths.push(vec![current_node_id]); // start new path
2✔
2076
                        current_cov = coverage;
2✔
2077
                    } // else continue on 
18✔
2078
                    // TODO check if better to also interrupt if coverage increases further
2079
                }
2080
            }
2081

2082
            // if state singular try to check if edge is "correct", add extra length of current node to it to account for compression
2083
            match state {
54✔
2084
                LadderState::Singular => {
2085
                    if self.check_edge_truth(current_node_id, out_node_id) {
34✔
2086
                        n_correct_edges += len;
×
2087
                    }
34✔
2088
                }
2089
                LadderState::Double => ()
20✔
2090
            }
2091

2092
            // set current node id to next node to be visited
2093
            current_node_id = out_node_id;
54✔
2094

2095
            // add current node to path, unless state is double
2096
            match state {
54✔
2097
                LadderState::Double => (),
20✔
2098
                LadderState::Singular => {
34✔
2099
                    sum_path_cov += current_cov;
34✔
2100
                    coverage_counter += 1;
34✔
2101
                    let last_path = paths.len() - 1;
34✔
2102
                    paths[last_path].push(current_node_id); // add node to latest path
34✔
2103
                }
34✔
2104
            }
2105
        }
2106
    }
5✔
2107

2108
    /// follow a path of the length 2*k - 1 by choosing the edges with the hightest coverage
2109
    /// requres the graph to have edge mults
2110
    fn follow_ladder_path_high<DI>(&self, start_node_id: usize, start_ext: u8, start_cov: u32) -> Option<(usize, f32, f32)> 
8✔
2111
    where SD: SummaryData<DI>
8✔
2112
    {
2113

2114
        let target_length = K::k();
8✔
2115
        let mut path_length = 0;
8✔
2116
        let mut sum_path_cov = start_cov; 
8✔
2117
        let mut coverage_counter = 1;
8✔
2118
        let mut n_correct_edges = 0; 
8✔
2119

2120
        // get fist next node
2121
        let sequence = self.base.sequences.get(start_node_id);
8✔
2122
        let term_kmer: K = sequence.term_kmer(Dir::Right);
8✔
2123
        let next_kmer = term_kmer.extend(start_ext, Dir::Right);
8✔
2124
        let (mut current_node_id, _, _) = self.find_link(next_kmer, Dir::Right).expect("link should exist"); 
8✔
2125

2126
        if self.check_edge_truth(start_node_id, current_node_id) {
8✔
2127
            n_correct_edges += 1;
4✔
2128
        }
4✔
2129

2130
        loop {
2131
            // check if current node is "target node", i.e., the path has reached the desired length
2132
            match path_length {
68✔
2133
                pl if pl == target_length => return Some((current_node_id, (sum_path_cov as f32 / coverage_counter as f32), (n_correct_edges as f32 / (path_length + 1) as f32))), // path has target length
68✔
2134
                pl if pl > target_length => return None, // path has surpassed desired length => invalid
63✔
2135
                _ => () // path has not yet reached desired length, continue
63✔
2136
            }
2137

2138
            // since current node is not target node, add length to path
2139
            let len = self.get_node(current_node_id).len() - (K::k() - 1);
63✔
2140
            path_length += len;
63✔
2141

2142
            // get current node
2143
            let current_node = self.get_node(current_node_id);
63✔
2144
        
2145
            // find outgoing edge with highest coverage
2146
            let em = current_node.data().edge_mults().expect("must have edge mults").right();
63✔
2147
            let (max_cov_base, &max_cov) = em.iter().rev().enumerate().filter(|&(_, &c)| c > 0).max_by(|&(_b1, &c1), &(_b2, c2)| c1.cmp(c2))?;
252✔
2148

2149
            // get next node id
2150
            let sequence = self.base.sequences.get(current_node_id);
60✔
2151
            let term_kmer: K = sequence.term_kmer(Dir::Right);
60✔
2152
            let next_kmer = term_kmer.extend(max_cov_base as u8, Dir::Right);
60✔
2153
            let (out_node_id, _, _) = self.find_link(next_kmer, Dir::Right).expect("link should exist"); 
60✔
2154

2155
            // try to check if edge is "correct", add extra length of current node to it to account for compression
2156
            if self.check_edge_truth(current_node_id, out_node_id) {
60✔
2157
                n_correct_edges += len;
35✔
2158
            }
35✔
2159

2160
            // set current node id to next node to be visited
2161
            current_node_id = out_node_id;
60✔
2162
            sum_path_cov += max_cov;
60✔
2163
            coverage_counter += 1;
60✔
2164
        }
2165
    }
8✔
2166

2167
    /// remove tips from the graph, reqires the graoh to have edge mults and be stranded
2168
    /// it is recommended to use this function after [`DebruijnGraph::remove_ladders`], since 
2169
    /// the latter will likely leave tips in the graph
2170
    pub fn remove_tips<DI, P>(&mut self, min_diff_factor: u32, max_avg_tip_cov: f32, out_path: Option<P>) -> Result<(), String> 
2✔
2171
    where 
2✔
2172
        SD: SummaryData<DI>,
2✔
2173
        P: AsRef<Path>
2✔
2174
    {
2175
        // interrupt if we dont have edge mults
2176
        if self.get_node(0).data().edge_mults().is_none() { return Err(String::from("no edge mults available")) };
2✔
2177
        if !self.base.stranded { return Err(String::from("must be stranded")) };
2✔
2178

2179
        let mut writer = out_path.map(|path| BufWriter::new(File::create(path).expect("error creating ladder stats file")));
2✔
2180
        if let Some(wtr) = writer.as_mut() {
2✔
2181
            writeln!(wtr, "high cov,avg low cov,max true,tip truth ratio,tip len,dir").unwrap();
×
2182
        }
2✔
2183

2184
        let max_len = K::k();
2✔
2185

2186
        // iterate over nodes
2187
        for node_id in 0..self.len() {
66✔
2188
            // do for both left and right as "outgoing" direction 
2189
            // right for regular outgoing and tips from the read-end
2190
            // left for the reverse -> tips from the read-start
2191
            for dir in [Dir::Left, Dir::Right] {
132✔
2192
                let current_node = self.get_node(node_id);
132✔
2193
                // check if node has outgoing edge(s) with both high and low coverage
2194
                let outs = current_node.data().edge_mults().expect("should have em").single_dir(dir).edge_mults;
132✔
2195

2196
                let Some((out_max_base, &out_max_cov)) = outs.iter().rev().enumerate().filter(|&(_, &c)| c  > 0).max_by(|&(_b1, &c1), &(_b2, c2)| c1.cmp(c2)) else { continue };
528✔
2197
                let smaller_outs = outs.iter().copied().rev().enumerate().filter(|&(_b, c)| (c > 0) & (out_max_cov > c)).collect::<Vec<_>>();
444✔
2198
                
2199
                if smaller_outs.is_empty() { continue; }
111✔
2200

2201
                // try and check if max cov edge is correct
2202
                let max_connection_correct = {
4✔
2203
                    let sequence = self.base.sequences.get(node_id);
4✔
2204
                    let term_kmer: K = sequence.term_kmer(dir);
4✔
2205
                    let next_kmer = term_kmer.extend(out_max_base as u8, dir);
4✔
2206
                    let (next_node_id, _, _) = self.find_link(next_kmer, dir).expect("missing link");
4✔
2207

2208
                    self.check_edge_truth(node_id, next_node_id)
4✔
2209
                }; // if current node is incorrect, connection is incorrect in any case
2210
                
2211
                // check all small outs
2212
                for (s_base, s_cov) in smaller_outs {
4✔
2213
                    // path has to be short + unambiguous + consistently low coverage
2214
                    let Some((tip_path, avg_tip_coverage, truth_ratio, tip_len)) = self.follow_tip_path(node_id, s_base as u8, s_cov, dir) else { continue; };
4✔
2215

2216
                    // remove path if coverage below threshold
2217
                    if (s_cov * min_diff_factor <= out_max_cov) & (avg_tip_coverage <= max_avg_tip_cov) & (tip_len <= max_len) {
4✔
2218
                        self.remove_path(tip_path, dir)?;
4✔
2219
                    }
×
2220
                        
2221
                    if let Some(wtr) = writer.as_mut() {
4✔
2222
                        writeln!(wtr, "{},{},{},{},{},{:?}", out_max_cov, avg_tip_coverage, max_connection_correct, truth_ratio, tip_len, dir).unwrap();
×
2223
                    }
4✔
2224
                }
2225
            }
2226
        }
2227
        
2228
        Ok(())
2✔
2229
    }
2✔
2230

2231
    /// follow a tip path, returns path, average coverage and ration of correct to incorrect connections
2232
    fn follow_tip_path<DI>(&self, start_node_id: usize, start_ext: u8, start_cov: u32, dir: Dir) -> Option<(Vec<usize>, f32, f32, usize)> 
4✔
2233
    where 
4✔
2234
        SD: SummaryData<DI>
4✔
2235
    {
2236
        const COV_MARGIN: f32 = 0.3;
2237
        const COV_ADD_MARGIN: f32 = 4.;
2238

2239
        // path, including start and target node
2240
        let mut path = Vec::new();
4✔
2241
        path.push(start_node_id);
4✔
2242

2243
        let mut path_length = 0;
4✔
2244
        let mut n_correct_edges = 0;
4✔
2245

2246
        // get fist next node
2247
        let sequence = self.base.sequences.get(start_node_id);
4✔
2248
        let term_kmer: K = sequence.term_kmer(dir);
4✔
2249
        let next_kmer = term_kmer.extend(start_ext, dir);
4✔
2250
        let (mut current_node_id, _, _) = self.find_link(next_kmer, dir).expect("link should exist"); 
4✔
2251
        path.push(current_node_id);
4✔
2252

2253
        if self.check_edge_truth(start_node_id, current_node_id) {
4✔
2254
            n_correct_edges += 1;
×
2255
        }
4✔
2256

2257
        let mut current_cov = start_cov as f32;
4✔
2258
        let mut sum_path_cov = start_cov as f32;
4✔
2259
        let mut coverage_counter = 1;
4✔
2260

2261
        loop {     
2262
            // get current node
2263
            let current_node = self.get_node(current_node_id);
13✔
2264

2265
            // add length to path
2266
            let len = current_node.len() - (K::k() - 1);
13✔
2267
            path_length += len;
13✔
2268

2269
            // low path nodes should have only one incoming and one outgoing edge
2270
            let in_edges = current_node.edges(dir.flip());
13✔
2271
            if in_edges.len() != 1 { return None; }
13✔
2272

2273
            let out_edges = current_node.edges(dir);
13✔
2274
            match out_edges.len() {
13✔
2275
                oe if oe > 1 => return None,
13✔
2276
                0 => return Some((path, (sum_path_cov / coverage_counter as f32), (n_correct_edges as f32 / path_length as f32), path_length)),
4✔
2277
                _ => ()
9✔
2278
            }
2279

2280
            // if edge cov avalable, check for similarity
2281
            let (out_ext, out_node_id, _, _) = out_edges[0];
9✔
2282
            if let Some(em) = current_node.data().edge_mults() {
9✔
2283
                let coverage = em.edge_mult(out_ext, dir) as f32;
9✔
2284
                if coverage < current_cov - current_cov * COV_MARGIN - COV_ADD_MARGIN // extra two for small values
9✔
2285
                    || coverage > current_cov + current_cov * COV_MARGIN + COV_ADD_MARGIN {
9✔
2286
                    return None;
×
2287
                } else {
9✔
2288
                    current_cov = coverage;
9✔
2289
                }
9✔
2290
            }
×
2291

2292
            // try to check if edge is "correct", add extra length of current node to it to account for compression
2293
            if self.check_edge_truth(current_node_id, out_node_id) {
9✔
2294
                n_correct_edges += len;
×
2295
            }
9✔
2296

2297
            // set current node id to next node to be visited
2298
            current_node_id = out_node_id;
9✔
2299
            sum_path_cov += current_cov;
9✔
2300
            coverage_counter += 1;
9✔
2301
            // add current node to path
2302
            path.push(current_node_id);
9✔
2303

2304
        }
2305
    }
4✔
2306

2307
    /// remove the edges (not the nodes) of a path in the graph
2308
    fn remove_path<DI>(&mut self, path: Vec<usize>, base_dir: Dir) -> Result<(), String> 
48✔
2309
    where SD: SummaryData<DI>
48✔
2310
    {
2311
        let mut path_iter = path.clone().into_iter();
48✔
2312
        let Some(mut current_node_id) = path_iter.next() else { return Ok(()); };
48✔
2313

2314
        loop {
2315
            // get next node id
2316
            let Some(next_node_id) = path_iter.next() else { return Ok(()); }; // whole path has been covered
204✔
2317
            // remove ext to the right of current node
2318
            let Some((out_base, _, _, _)) = self.get_node(current_node_id).edges(base_dir).iter().find(|&(_, id, _, _)| *id == next_node_id).copied()
181✔
2319
                else { return Err(format!(
×
2320
"no edge to remove (base dir)
×
2321
path: {:?}
×
2322
dir: {:?}
×
2323
node1:
×
2324
    id: {current_node_id}
×
2325
    left e: {:?}
×
2326
    right e: {:?}
×
2327
    exts: {:?}
×
2328
    data: {:?}
×
2329
node2:
×
2330
    id: {next_node_id}
×
2331
    left e: {:?}
×
2332
    right e: {:?}
×
2333
    exts: {:?}
×
2334
    data: {:?}",
×
2335
                path,
×
2336
                base_dir,
×
2337
                self.get_node(current_node_id).l_edges(),
×
2338
                self.get_node(current_node_id).r_edges(),
×
2339
                self.get_node(current_node_id).exts(),
×
2340
                self.get_node(current_node_id).data(),
×
2341
                self.get_node(next_node_id).l_edges(),
×
2342
                self.get_node(next_node_id).r_edges(),
×
2343
                self.get_node(next_node_id).exts(),
×
2344
                self.get_node(next_node_id).data(),
×
2345
                ))
×
2346
            };
2347

2348
            // remove ext 
2349
            self.base.exts[current_node_id] = self.base.exts[current_node_id].remove(base_dir, out_base);
156✔
2350
        
2351

2352
            // remove ext to the left of the next node
2353
            let Some((in_base, _, _, _)) = self.get_node(next_node_id).edges(base_dir.flip()).iter().find(|&(_, id, _, _)| *id == current_node_id).copied() 
165✔
2354
                else { return Err( format!(
×
2355
"no edge to remove (other dir)
×
2356
path: {:?}
×
2357
dir: {:?}
×
2358
node1:
×
2359
    id: {current_node_id}
×
2360
    left e: {:?}
×
2361
    right e: {:?}
×
2362
    exts: {:?}
×
2363
    data: {:?}
×
2364
node2:
×
2365
    id: {next_node_id}
×
2366
    left e: {:?}
×
2367
    right e: {:?}
×
2368
    exts: {:?}
×
2369
    data: {:?}",
×
2370
                path,
×
2371
                base_dir,
×
2372
                self.get_node(current_node_id).l_edges(),
×
2373
                self.get_node(current_node_id).r_edges(),
×
2374
                self.get_node(current_node_id).exts(),
×
2375
                self.get_node(current_node_id).data(),
×
2376
                self.get_node(next_node_id).l_edges(),
×
2377
                self.get_node(next_node_id).r_edges(),
×
2378
                self.get_node(next_node_id).exts(),
×
2379
                self.get_node(next_node_id).data(),
×
2380
                ))
×
2381
            };
2382

2383
            // remove ext
2384
            self.base.exts[next_node_id] = self.base.exts[next_node_id].remove(base_dir.flip(), in_base); 
156✔
2385

2386
            // use new exts to fix edge mults
2387
            self.base.data[current_node_id].fix_edge_mults(self.base.exts[current_node_id]);
156✔
2388
            self.base.data[next_node_id].fix_edge_mults(self.base.exts[next_node_id]);
156✔
2389

2390

2391
            current_node_id = next_node_id;
156✔
2392
        }
2393
    }
48✔
2394

2395
    /// use mapped ids to check if the nodes of an edge were mapped to the same id
2396
    /// returns false if mapped ids are not available
2397
    pub fn check_edge_truth<DI>(&self, node_id_1: usize, node_id_2: usize) -> bool
124✔
2398
    where 
124✔
2399
        SD: SummaryData<DI>
124✔
2400
    {
2401
        let mapped_ids_1 = self.get_node(node_id_1).data().mapped_ids();
124✔
2402
        let mapped_ids_2 = self.get_node(node_id_2).data().mapped_ids();
124✔
2403

2404
        if let (Some(mids1), Some(mids2)) = (mapped_ids_1, mapped_ids_2) {
124✔
2405
            let hashed_mi2 = mids2.iter().copied().collect::<HashSet<_>>();
124✔
2406
            for mid in mids1 {
124✔
2407
                if hashed_mi2.contains(mid) {
51✔
2408
                    return true;
43✔
2409
                }
8✔
2410
            }
2411
        }
×
2412

2413
        false
81✔
2414
    }
124✔
2415
}
2416

2417

2418
#[derive(Debug, Eq, PartialEq)]
2419
enum Status {
2420
    Active,
2421
    End,
2422
    Cycle,
2423
}
2424

2425
#[derive(Debug)]
2426
struct State {
2427
    path: SmallVec8<(u32, Dir)>,
2428
    score: f32,
2429
    status: Status,
2430
}
2431

2432
impl State {}
2433

2434
#[derive(Debug, PartialEq, Eq)]
2435
pub enum LadderState {
2436
    Singular,
2437
    Double
2438
}
2439

2440
/// Iterator over nodes in a `DeBruijnGraph`
2441
pub struct NodeIter<'a, K: Kmer + 'a, D: Debug + 'a> {
2442
    graph: &'a DebruijnGraph<K, D>,
2443
    node_id: usize,
2444
}
2445

2446
impl<'a, K: Kmer + 'a, D: Debug + 'a> Iterator for NodeIter<'a, K, D> {
2447
    type Item = Node<'a, K, D>;
2448

2449
    fn next(&mut self) -> Option<Node<'a, K, D>> {
26,853✔
2450
        if self.node_id < self.graph.len() {
26,853✔
2451
            let node = self.graph.get_node(self.node_id);
26,723✔
2452
            self.node_id += 1;
26,723✔
2453
            Some(node)
26,723✔
2454
        } else {
2455
            None
130✔
2456
        }
2457
    }
26,853✔
2458
}
2459

2460
impl<'a, K: Kmer + 'a, D: Debug + 'a> IntoIterator for &'a DebruijnGraph<K, D> {
2461
    type Item = NodeKmer<'a, K, D>;
2462
    type IntoIter = NodeIntoIter<'a, K, D>;
2463

2464
    fn into_iter(self) -> Self::IntoIter {
2,226✔
2465
        NodeIntoIter {
2,226✔
2466
            graph: self,
2,226✔
2467
            node_id: 0,
2,226✔
2468
        }
2,226✔
2469
    }
2,226✔
2470
}
2471

2472
/// Iterator over nodes in a `DeBruijnGraph`
2473
pub struct NodeIntoIter<'a, K: Kmer + 'a, D: Debug + 'a> {
2474
    graph: &'a DebruijnGraph<K, D>,
2475
    node_id: usize,
2476
}
2477

2478
impl<'a, K: Kmer + 'a, D: Debug + 'a> Iterator for NodeIntoIter<'a, K, D> {
2479
    type Item = NodeKmer<'a, K, D>;
2480

2481
    fn next(&mut self) -> Option<Self::Item> {
238,256✔
2482
        if self.node_id < self.graph.len() {
238,256✔
2483
            let node_id = self.node_id;
238,256✔
2484
            let node = self.graph.get_node(node_id);
238,256✔
2485
            let node_seq = node.sequence();
238,256✔
2486

2487
            self.node_id += 1;
238,256✔
2488
            Some(NodeKmer {
238,256✔
2489
                node_id,
238,256✔
2490
                node_seq_slice: node_seq,
238,256✔
2491
                phantom_d: PhantomData,
238,256✔
2492
                phantom_k: PhantomData,
238,256✔
2493
            })
238,256✔
2494
        } else {
2495
            None
×
2496
        }
2497
    }
238,256✔
2498
}
2499

2500
/// A `DebruijnGraph` node with a reference to the sequence of the node.
2501
#[derive(Clone)]
2502
pub struct NodeKmer<'a, K: Kmer + 'a, D: Debug + 'a> {
2503
    pub node_id: usize,
2504
    node_seq_slice: DnaStringSlice<'a>,
2505
    phantom_k: PhantomData<K>,
2506
    phantom_d: PhantomData<D>,
2507
}
2508

2509
/// An iterator over the kmers in a `DeBruijn graph node`
2510
pub struct NodeKmerIter<'a, K: Kmer + 'a, D: Debug + 'a> {
2511
    kmer_id: usize,
2512
    kmer: K,
2513
    num_kmers: usize,
2514
    node_seq_slice: DnaStringSlice<'a>,
2515
    phantom_k: PhantomData<K>,
2516
    phantom_d: PhantomData<D>,
2517
}
2518

2519
impl<'a, K: Kmer + 'a, D: Debug + 'a> IntoIterator for NodeKmer<'a, K, D> {
2520
    type Item = K;
2521
    type IntoIter = NodeKmerIter<'a, K, D>;
2522

2523
    fn into_iter(self) -> Self::IntoIter {
476,512✔
2524
        let num_kmers = self.node_seq_slice.len() - K::k() + 1;
476,512✔
2525

2526
        let kmer = if num_kmers > 0 {
476,512✔
2527
            self.node_seq_slice.get_kmer::<K>(0)
476,512✔
2528
        } else {
2529
            K::empty()
×
2530
        };
2531

2532
        NodeKmerIter {
476,512✔
2533
            kmer_id: 0,
476,512✔
2534
            kmer,
476,512✔
2535
            num_kmers,
476,512✔
2536
            node_seq_slice: self.node_seq_slice,
476,512✔
2537
            phantom_k: PhantomData,
476,512✔
2538
            phantom_d: PhantomData,
476,512✔
2539
        }
476,512✔
2540
    }
476,512✔
2541
}
2542

2543
impl<'a, K: Kmer + 'a, D: Debug + 'a> Iterator for NodeKmerIter<'a, K, D> {
2544
    type Item = K;
2545

2546
    fn next(&mut self) -> Option<Self::Item> {
3,508,540✔
2547
        if self.num_kmers == self.kmer_id {
3,508,540✔
2548
            None
×
2549
        } else {
2550
            let current_kmer = self.kmer;
3,508,540✔
2551

2552
            self.kmer_id += 1;
3,508,540✔
2553
            if self.kmer_id < self.num_kmers {
3,508,540✔
2554
                let next_base = self.node_seq_slice.get(self.kmer_id + K::k() - 1);
3,447,130✔
2555
                let new_kmer = self.kmer.extend_right(next_base);
3,447,130✔
2556
                self.kmer = new_kmer;
3,447,130✔
2557
            }
3,447,130✔
2558

2559
            Some(current_kmer)
3,508,540✔
2560
        }
2561
    }
3,508,540✔
2562

2563
    fn size_hint(&self) -> (usize, Option<usize>) {
238,256✔
2564
        (self.num_kmers, Some(self.num_kmers))
238,256✔
2565
    }
238,256✔
2566

2567
    /// Provide a 'fast-forward' capability for this iterator
2568
    /// MPHF will use this to reduce the number of kmers that
2569
    /// need to be produced.
2570
    fn nth(&mut self, n: usize) -> Option<Self::Item> {
2,529,434✔
2571
        if n <= 4 {
2,529,434✔
2572
            // for small skips forward, shift one base at a time
2573
            for _ in 0..n {
2,273,342✔
2574
                self.next();
979,106✔
2575
            }
979,106✔
2576
        } else {
256,092✔
2577
            self.kmer_id += n;
256,092✔
2578
            self.kmer = self.node_seq_slice.get_kmer::<K>(self.kmer_id);
256,092✔
2579
        }
256,092✔
2580

2581
        self.next()
2,529,434✔
2582
    }
2,529,434✔
2583
}
2584

2585
/// Marker signifying that NodeKmerIter has a known size.
2586
impl<'a, K: Kmer + 'a, D: Debug + 'a> ExactSizeIterator for NodeKmerIter<'a, K, D> {}
2587

2588
/// Unbranched sequence in the DeBruijn graph
2589
pub struct Node<'a, K: Kmer + 'a, D: 'a> {
2590
    pub node_id: usize,
2591
    pub graph: &'a DebruijnGraph<K, D>,
2592
}
2593

2594
impl<'a, K: Kmer, D: Debug> Node<'a, K, D> {
2595
    /// Length of the sequence of this node
2596
    pub fn len(&self) -> usize {
1,448,418✔
2597
        self.graph.base.sequences.get(self.node_id).len()
1,448,418✔
2598
    }
1,448,418✔
2599

2600
    pub fn is_empty(&self) -> bool {
×
2601
        self.graph.base.sequences.get(self.node_id).is_empty()
×
2602
    }
×
2603

2604
    /// Sequence of the node
2605
    pub fn sequence(&self) -> DnaStringSlice<'a> {
5,290,709✔
2606
        self.graph.base.sequences.get(self.node_id)
5,290,709✔
2607
    }
5,290,709✔
2608

2609
    /// Reference to auxiliarly data associated with the node
2610
    pub fn data(&self) -> &'a D {
3,596,301✔
2611
        &self.graph.base.data[self.node_id]
3,596,301✔
2612
    }
3,596,301✔
2613

2614
    /// Extension bases from this node
2615
    pub fn exts(&self) -> Exts {
3,603,521✔
2616
        self.graph.base.exts[self.node_id]
3,603,521✔
2617
    }
3,603,521✔
2618

2619
    /// Edges leaving the left side of the node in the format
2620
    /// (base, target_node id, incoming side of target node, whether target node is flipped)
2621
    pub fn l_edges(&self) -> SmallVec4<(u8, usize, Dir, bool)> {
7,606✔
2622
        self.graph.find_edges(self.node_id, Dir::Left)
7,606✔
2623
    }
7,606✔
2624

2625
    /// Edges leaving the right side of the node in the format
2626
    /// (base, target_node id, incoming side of target node, whether target node is flipped)
2627
    pub fn r_edges(&self) -> SmallVec4<(u8, usize, Dir, bool)> {
7,606✔
2628
        self.graph.find_edges(self.node_id, Dir::Right)
7,606✔
2629
    }
7,606✔
2630

2631
    /// Edges leaving the 'dir' side of the node in the format
2632
    /// (base, target_node id, incoming side of target node, whether target node is flipped)
2633
    pub fn edges(&self, dir: Dir) -> SmallVec4<(u8, usize, Dir, bool)> {
423,902✔
2634
        self.graph.find_edges(self.node_id, dir)
423,902✔
2635
    }
423,902✔
2636

2637
    fn to_json<F: Fn(&D) -> Value>(&self, func: &F, f: &mut dyn Write) {
×
2638
        write!(
×
2639
            f,
×
2640
            "{{\"id\":\"{}\",\"L\":{},\"D\":{},\"Se\":\"{:?}\"}}",
2641
            self.node_id,
2642
            self.sequence().len(),
×
2643
            (func)(self.data()),
×
2644
            self.sequence(),
×
2645
        )
2646
        .unwrap();
×
2647
    }
×
2648

2649
    fn edges_to_json(&self, f: &mut dyn Write) -> bool {
×
2650
        let mut wrote = false;
×
2651
        let edges = self.r_edges();
×
2652
        for (idx, &(_, id, incoming_dir, _)) in edges.iter().enumerate() {
×
2653
            write!(
×
2654
                f,
×
2655
                "{{\"source\":\"{}\",\"target\":\"{}\",\"D\":\"{}\"}}",
2656
                self.node_id,
2657
                id,
2658
                match incoming_dir {
×
2659
                    Dir::Left => "L",
×
2660
                    Dir::Right => "R",
×
2661
                }
2662
            )
2663
            .unwrap();
×
2664

2665
            if idx < edges.len() - 1 {
×
2666
                write!(f, ",").unwrap();
×
2667
            }
×
2668

2669
            wrote = true;
×
2670
        }
2671
        wrote
×
2672
    }
×
2673
}
2674

2675
// TODO make generic instead of u8 (u8 is sufficient for dbg)
2676
impl<K: Kmer, SD: Debug> Node<'_, K, SD>  {
2677
    /// get default format for dot edges based on node data
2678
    pub fn edge_dot_default<DI>(&self, colors: &Colors<SD, DI>, base: u8, incoming_dir: Dir, flipped: bool) -> String 
2,306✔
2679
    where SD: SummaryData<DI>
2,306✔
2680
    {
2681
        // set color based on dir
2682
        let color = match incoming_dir {
2,306✔
2683
            Dir::Left => "blue",
1,213✔
2684
            Dir::Right => "red"
1,093✔
2685
        };
2686
        
2687
        if let Some(em) = self.data().edge_mults() {
2,306✔
2688
            
2689
            let dir = if flipped { 
2,306✔
2690
                incoming_dir 
1,106✔
2691
            } else {
2692
                incoming_dir.flip()
1,200✔
2693
            };
2694

2695
            // set penwidth based on count
2696
            let count = em.edge_mult(base, dir);
2,306✔
2697
            let penwidth = colors.edge_width(count);
2,306✔
2698

2699
            format!("[color={color}, penwidth={penwidth}, label=\"{}: {count}\", weight={count}]", bits_to_base(base))
2,306✔
2700
        } else {
2701
            format!("[color={color}, penwidth={}]", colors.edge_width(1)) // since there should be no edge mults, this will return default value
×
2702
        }
2703
    }
2,306✔
2704

2705
    /// get default format for dot nodes, based on node data
2706
    pub fn node_dot_default<DI>(&self, colors: &Colors<SD, DI>, config: &SummaryConfig, translator: &Translator, outline: bool, translate_id_groups: bool) -> String
1,774✔
2707
    where SD: SummaryData<DI>
1,774✔
2708
    {
2709
        // set color based on labels/fold change/p-value
2710
        let color = colors.node_color(self.data(), config, outline);
1,774✔
2711
        let translate_id_groups = if translate_id_groups { colors.id_group_ids() } else { None };
1,774✔
2712

2713
        let data_info = self.data().print(translator, config, translate_id_groups);
1,774✔
2714
        const MIN_TEXT_WIDTH: usize = 40;
2715
        let wrap = if self.len() > MIN_TEXT_WIDTH { self.len() } else { MIN_TEXT_WIDTH };
1,774✔
2716

2717
        let label = textwrap::fill(&format!("id: {}, len: {}, exts: {:?}, seq: {}\n{}", 
1,774✔
2718
            self.node_id,
1,774✔
2719
            self.len(),
1,774✔
2720
            self.exts(),
1,774✔
2721
            self.sequence(),
1,774✔
2722
            data_info
1,774✔
2723
        ), wrap);
1,774✔
2724

2725
        format!("[{color}, label=\"{label}\"]")
1,774✔
2726
    }
1,774✔
2727
}
2728

2729
impl<K: Kmer, D> fmt::Debug for Node<'_, K, D>
2730
where
2731
    D: Debug,
2732
{
2733
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
140✔
2734
        write!(
140✔
2735
            f,
140✔
2736
            "Node {{ id:{}, Exts: {:?}, L:{:?} R:{:?}, Seq: {:?}, Data: {:?} }}",
2737
            self.node_id,
2738
            self.exts(),
140✔
2739
            self.l_edges(),
140✔
2740
            self.r_edges(),
140✔
2741
            self.sequence().len(),
140✔
2742
            self.data()
140✔
2743
        )
2744
    }
140✔
2745
}
2746

2747
pub struct IterComponents<'a, K: Kmer, D> {
2748
    graph: &'a DebruijnGraph<K, D>,
2749
    visited: Vec<bool>,
2750
    pos: usize,
2751
}
2752

2753
impl<K: Kmer, D: Debug> Iterator for IterComponents<'_, K, D> {
2754
    type Item = Vec<usize>;
2755
    fn next(&mut self) -> Option<Self::Item> {
246✔
2756
        while self.pos < self.graph.len() {
1,133✔
2757
            if !self.visited[self.pos] {
1,128✔
2758
                let comp = self.graph.component_i(&mut self.visited, self.pos);
241✔
2759
                self.pos += 1;
241✔
2760
                return Some(comp)
241✔
2761
            } else {
887✔
2762
                self.pos += 1;
887✔
2763
            }
887✔
2764
        }
2765
        assert!(self.visited.iter().map(|x| *x as usize).sum::<usize>() == self.graph.len());
1,128✔
2766
        None
5✔
2767
    }
246✔
2768
    
2769
}
2770

2771
pub struct PathCompIter<'a, K: Kmer, D: Debug, F, F2> 
2772
where 
2773
F: Fn(&D) -> f32,
2774
F2: Fn(&D) -> bool
2775
{
2776
    graph: &'a DebruijnGraph<K, D>,
2777
    component_iterator: IterComponents<'a, K, D>,
2778
    graph_pos: usize,
2779
    score: F,
2780
    solid_path: F2,
2781
}
2782

2783
/// returns the component and the "best" path in the component
2784
impl<K: Kmer, D: Debug, F, F2> Iterator for PathCompIter<'_, K, D, F, F2> 
2785
where 
2786
F: Fn(&D) -> f32,
2787
F2: Fn(&D) -> bool
2788
{
2789
    type Item = (Vec<usize>, VecDeque<(usize, Dir)>,);
2790
    fn next(&mut self) -> Option<Self::Item> {
240✔
2791
        match self.component_iterator.next() {
240✔
2792
            Some(component) => {
237✔
2793
                let current_comp = component;
237✔
2794
                
2795
    
2796
                let mut best_node = current_comp[0];
237✔
2797
                let mut best_score = f32::MIN;
237✔
2798
                for c in current_comp.iter() {
856✔
2799
                    let node = self.graph.get_node(*c);
856✔
2800
                    let node_score = (self.score)(node.data());
856✔
2801
    
2802
                    if node_score > best_score {
856✔
2803
                        best_node = *c;
280✔
2804
                        best_score = node_score;
280✔
2805
                    }
576✔
2806
                }
2807
    
2808
                let oscore = |state| match state {
956✔
2809
                    None => 0.0,
396✔
2810
                    Some((id, _)) => (self.score)(self.graph.get_node(id).data()),
560✔
2811
                };
956✔
2812
    
2813
                let osolid_path = |state| match state {
478✔
2814
                    None => false,
×
2815
                    Some((id, _)) => (self.solid_path)(self.graph.get_node(id).data()),
478✔
2816
                };
478✔
2817
    
2818
                // Now expand in each direction, greedily taking the best path. Stop if we hit a node we've
2819
                // already put into the path
2820
                let mut used_nodes = HashSet::new();
237✔
2821
                let mut path = VecDeque::new();
237✔
2822
    
2823
                // Start w/ initial state
2824
                used_nodes.insert(best_node);
237✔
2825
                path.push_front((best_node, Dir::Left));
237✔
2826
    
2827
                for init in [(best_node, Dir::Left, false), (best_node, Dir::Right, true)].iter() {
474✔
2828
                    let &(start_node, dir, do_flip) = init;
474✔
2829
                    let mut current = (start_node, dir);
474✔
2830
    
2831
                    loop {
2832
                        let mut next = None;
867✔
2833
                        let (cur_id, incoming_dir) = current;
867✔
2834
                        let node = self.graph.get_node(cur_id);
867✔
2835
                        let edges = node.edges(incoming_dir.flip());
867✔
2836
    
2837
                        let mut solid_paths = 0;
867✔
2838
                        for (_, id, dir, _) in edges {
867✔
2839
                            let cand = Some((id, dir));
478✔
2840
                            if osolid_path(cand) {
478✔
2841
                                solid_paths += 1;
478✔
2842

2843
                                // second if clause is outside of first in original code (see max_path) 
2844
                                // but would basically ignore path validity.
2845
                                if oscore(cand) > oscore(next) {
478✔
2846
                                    next = cand;
431✔
2847
                                }
431✔
2848
                            }
×
2849
                        }
2850
                        
2851
                        // break if multiple solid paths are available
2852
                        /* if solid_paths > 1 {
2853
                            break;
2854
                        } */
2855
    
2856
                        match next {
396✔
2857
                            Some((next_id, next_incoming)) if !used_nodes.contains(&next_id) => {
396✔
2858
                                if do_flip {
393✔
2859
                                    path.push_front((next_id, next_incoming.flip()));
200✔
2860
                                } else {
200✔
2861
                                    path.push_back((next_id, next_incoming));
193✔
2862
                                }
193✔
2863
    
2864
                                used_nodes.insert(next_id);
393✔
2865
                                current = (next_id, next_incoming);
393✔
2866
                            }
2867
                            _ => break,
474✔
2868
                        }
2869
                    }
2870
                }
2871
                
2872
                
2873
                Some((current_comp, path))
237✔
2874
            }, 
2875
            None => {
2876
                // should technically not need graph_pos after this 
2877
                self.graph_pos += 1;
3✔
2878
                None
3✔
2879
            }
2880
        }
2881
    }
240✔
2882
}
2883

2884

2885
/// iterator over the edges of the de bruijn graph
2886
pub struct EdgeIter<'a, K: Kmer, D: Debug> {
2887
    graph: &'a DebruijnGraph<K, D>,
2888
    visited_edges: HashSet<(usize, usize)>,
2889
    current_node: usize,
2890
    current_dir: Dir,
2891
    node_edge_iter: smallvec::IntoIter<[(u8, usize, Dir, bool); 4]>
2892
}
2893

2894
impl<K: Kmer, D: Debug> EdgeIter<'_, K, D> {
2895
    pub fn new(graph: &DebruijnGraph<K, D>) -> EdgeIter<'_, K, D>{
17✔
2896
        let node_edge_iter = graph.get_node(0).l_edges().into_iter();
17✔
2897

2898
        EdgeIter { 
17✔
2899
            graph, 
17✔
2900
            visited_edges: HashSet::new(), 
17✔
2901
            current_node: 0, 
17✔
2902
            current_dir: Dir::Left, 
17✔
2903
            node_edge_iter
17✔
2904
        }
17✔
2905
    }
17✔
2906
}
2907

2908
impl<K: Kmer, D: Debug> Iterator for EdgeIter<'_, K, D> {
2909
    type Item = (usize, Dir, u8, usize); // node, direction leaving node, base, target node
2910

2911
    fn next(&mut self) -> Option<Self::Item> {
1,499✔
2912
        loop {
2913
            if let Some((base, nb_node_id, _, _)) = self.node_edge_iter.next() {
6,180✔
2914
                let edge = if self.current_node > nb_node_id { (nb_node_id, self.current_node) } else { (self.current_node, nb_node_id) };
2,964✔
2915

2916
                if self.visited_edges.insert(edge) { return Some((self.current_node, self.current_dir, base, nb_node_id)); } // else simply skip and move on
2,964✔
2917

2918
            } else {
2919
                match self.current_dir {
3,216✔
2920
                    Dir::Left => {
1,608✔
2921
                        // no left edges, switch to right edges
1,608✔
2922
                        self.current_dir = Dir::Right;
1,608✔
2923
                        self.node_edge_iter = self.graph.get_node(self.current_node).r_edges().into_iter();
1,608✔
2924
                        
1,608✔
2925
                    }
1,608✔
2926
                    Dir::Right => {
2927
                        // no right edges, switch to next node left edges
2928
                        self.current_node += 1;
1,608✔
2929

2930
                        // quit if end of graph is reached
2931
                        if self.current_node == self.graph.len() { return None }
1,608✔
2932

2933
                        self.current_dir = Dir::Left;
1,591✔
2934
                        self.node_edge_iter = self.graph.get_node(self.current_node).l_edges().into_iter();
1,591✔
2935
                    }
2936
                }
2937
            }
2938
            
2939
        }
2940
    }
1,499✔
2941
}
2942

2943
#[cfg(test)]
2944
mod test {
2945
    use std::fs::remove_file;
2946

2947
    use crate::{BaseQuality, Exts, colors::Colors, compression::{CheckCompress, ScmapCompress, compress_kmers_with_hash, uncompressed_graph}, dna_string::DnaString, filter::filter_kmers, kmer::{Kmer6, Kmer16, Kmer22}, reads::{Reads, ReadsPaired}, serde::SerKmers, summarizer::{IDMapEMData, IDMapEMQualityData, IDTag, SampleInfo, SummaryConfig, TagsCountsData, TagsCountsSumData, Translator}, test::random_dna};
2948

2949
    use crate::{summarizer::SummaryData, Dir};
2950

2951

2952
    #[test]
2953
    #[cfg(not(feature = "sample128"))]
2954
    fn test_components() {
2955

2956
        // cargo run -- -i data/test_400.fastq.gz -o ../rust-debruijn/test_data/400 -s tags-counts-sum -k 18 -t t
2957

2958
        use crate::{kmer::Kmer16, serde::SerGraph};
2959
        let path = "test_data/400.graph.dbg";
2960

2961
        let ser_graph: SerGraph<Kmer16, TagsCountsSumData> = SerGraph::deserialize_from(path);
2962
        let graph = ser_graph.graph();
2963

2964
        let components = graph.iter_components();
2965

2966
        let check_components = [
2967
            vec![3, 67, 130, 133, 59, 119, 97, 110, 68, 137, 29, 84, 131, 43, 30, 91, 14, 70, 79, 142, 136, 105, 103, 62, 
2968
                141, 104, 134, 88, 38, 81, 108, 92, 135, 96, 116, 121, 63, 124, 106, 129, 132, 126, 93, 109, 83, 112, 118, 
2969
                123, 125, 78, 122, 115, 75, 128, 140, 111, 26, 143, 113],
2970
            vec![41, 138, 100, 139, 86],
2971
            vec![53, 117, 127],
2972
            vec![69, 144, 77, 120, 114, 107, 101],
2973
        ];
2974

2975
        let mut counter = 0;
2976

2977
        for component in components {
2978
            if component.len() > 1 { 
2979
                println!("component: {:?}", component);
2980
                assert_eq!(component, check_components[counter]);
2981
                counter += 1;
2982
            }
2983
        }
2984
        assert_eq!(vec![(139, Dir::Left)], graph.max_path(|data| data.sum().unwrap_or(1) as f32, |_| true));
2985
    }
2986

2987
    #[test]
2988
    fn test_iter_edges() {
1✔
2989
        use crate::{compression::uncompressed_graph, filter::filter_kmers, reads::{Reads, ReadsPaired}, summarizer::{SampleInfo, SummaryConfig, TagsData}};
2990

2991
        let read1 = "CAGCATCGATGCGACGAGCGCTCGCATCGA".as_bytes();
1✔
2992
        let read2 = "ACGATCGTACGTAGCTAGCTGACTGAGC".as_bytes();
1✔
2993

2994
        let mut reads = Reads::new(crate::reads::Strandedness::Forward);
1✔
2995
        reads.add_from_bytes(read1, None, 0u8);
1✔
2996
        reads.add_from_bytes(read2, None, 1);
1✔
2997

2998
        let reads_paired = ReadsPaired::Unpaired { reads };
1✔
2999

3000
        let sample_info = SampleInfo::new(0b1, 0b10, vec![12, 12]);
1✔
3001
        let summary_config = SummaryConfig::new(sample_info);
1✔
3002
        let (kmers, _) = filter_kmers::<TagsData, Kmer16, _>(&reads_paired, &summary_config, false, 1., false);
1✔
3003

3004
        let graph = uncompressed_graph(&kmers, true).finish();
1✔
3005

3006
        let check_edges: Vec<(usize, Dir, u8, usize)> = vec![(0, Dir::Left, 2, 16), (0, Dir::Right, 1, 2), (1, Dir::Left, 0, 11), 
1✔
3007
        (1, Dir::Right, 0, 16), (3, Dir::Left, 0, 13), (3, Dir::Right, 3, 10), (4, Dir::Left, 2, 21), (4, Dir::Right, 1, 17), (5, Dir::Left, 1, 27), 
1✔
3008
        (5, Dir::Right, 2, 19), (6, Dir::Left, 1, 24), (6, Dir::Right, 2, 12), (7, Dir::Left, 2, 23), (7, Dir::Right, 3, 11), (8, Dir::Left, 0, 12), 
1✔
3009
        (9, Dir::Left, 3, 17), (9, Dir::Right, 3, 24), (10, Dir::Right, 1, 21), (13, Dir::Left, 1, 18), (14, Dir::Right, 0, 26), 
1✔
3010
        (15, Dir::Left, 2, 20), (15, Dir::Right, 3, 22), (18, Dir::Left, 2, 19), (20, Dir::Left, 1, 26), (22, Dir::Right, 2, 25), (23, Dir::Left, 1, 25)];
1✔
3011

3012
        let edges = graph.iter_edges().collect::<Vec<_>>();
1✔
3013

3014
        assert_eq!(check_edges, edges);
1✔
3015
    }
1✔
3016

3017
    // dbg -c ../marbel_datasets/sim_reads_100.csv -s sum --stranded -o ../rust-debruijn/test_data/marbel_100_sum --checkpoint -k 22
3018
    #[cfg(not(feature = "sample128"))]
3019
    const TEST_GRAPH: &str = "test_data/marbel_100_sum.kmers.dbg";
3020

3021
    // cargo run --features sample128 -- -c ../marbel_datasets/sim_reads_100.csv -s sum --stranded -o ../rust-debruijn/test_data/marbel_100_sum_128 --checkpoint -k 22
3022
    #[cfg(feature = "sample128")]
3023
    const TEST_GRAPH: &str = "test_data/marbel_100_sum_128.kmers.dbg";
3024

3025
    #[test]
3026
    fn test_map_transcripts() {
1✔
3027
        // dbg -c ../marbel_datasets/sim_reads_100.csv -s sum --stranded -o ../rust-debruijn/test_data/marbel_100_sum --checkpoint -k 22
3028
        let graph_path = TEST_GRAPH;
1✔
3029
        let t_ref_path = "test_data/marbel_100_tr_ref.fasta";
1✔
3030
        let (kmers, mut translator, _) = SerKmers::<Kmer22, u32>::deserialize_from(graph_path).dissolve();
1✔
3031

3032
        let unc_graph = uncompressed_graph(&kmers, true).finish();
1✔
3033

3034
        let t_map = unc_graph.map_transcripts(t_ref_path, &mut translator).unwrap();
1✔
3035
        assert_eq!(t_map.len(), unc_graph.len());
1✔
3036
        assert_eq!(t_map.iter().filter(|&ids| !ids.is_empty()).collect::<Vec<_>>().len(), 10720);
21,993✔
3037
    }
1✔
3038

3039
    fn build_reads_quality_test() -> ReadsPaired<IDTag> {
2✔
3040
        let correct1 = "CGATGCTGCTGATGCTGAGTCTGACGTATGCGATCGATCGACGATCGTACTAGCTGACTGTGCAGCTAGCTGACTGATCGTAGCTAGCTACGTGCTAGCTACTAGCACTGATGC";
2✔
3041
        let qu_corr1 = "CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC";
2✔
3042
        let incorrect1 =                                     "CGACGATCGTACTAGCTGACTGTGCAGCTAGCTGACTGATCGTGGCTAGCTACGTGCTAGCTA";
2✔
3043
        let qu_incorr1 =                                     "CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC-CCCCCCCCCCCCCCCCCCC";
2✔
3044
        let correct2 =         "GCATCGATCGACGACGTTACGTACGATCTACGTAGCTAGCTAGCTGACATGCTAGCTAGCTCTGACTGATCGTGGCTAGCTGACTGACTGTAGCT";
2✔
3045
        let qu_corr2 =         "CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC";
2✔
3046
        let incorrect2 = "ATGCTGCTGATGCTGAGTCTGACGTAGGCGATCGATCGACGATCGTACTAGCTGACT";
2✔
3047
        let qu_incorr2 = "CCCCCCCCCCCCCCCCCCCCCCCCCC-CCCCCCCCCCCCCCCCCCCCCCCCCCCCCC";
2✔
3048
        let incorrect3 = "ATGCTGCTGATGCTGACTCTGACGTAGGCGATCGATCGATGATCGTACTAGCTGACT";
2✔
3049
        let qu_incorr3 = "CCCCCCCCCCCCCCCC-CCCCCCCCC-CCCCCCCCCCCC-CCCCCCCCCCCCCCCCC";
2✔
3050

3051
        let incorrect4_ins =                 "GACGTATGCGATCGATCGACGATCGTACTAGCTGACTTGTGCAGCTAGCTGACTGAT";
2✔
3052
        let qu_incorr4_ins =                 "CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC-CCCCCCCCCCCCCCCCCCC";
2✔
3053

3054
        let incorrect5_tip =                                                             "AGCTGACTGATCGTAGCTAGCTACGTGCTAGCTACTATCACTGATGC";
2✔
3055
        let qu_incorr5_tip =                                                             "CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC-CCCCCCCCC";
2✔
3056

3057

3058
        let mut reads = Reads::new_with_quality(crate::reads::Strandedness::Forward);
2✔
3059

3060
        for _i in 0..20 {
40✔
3061
            reads.add_read(DnaString::from_acgt_bytes(correct1.as_bytes()), None, IDTag::new(1, 0), Some(qu_corr1.as_bytes()));
40✔
3062
        }
40✔
3063

3064
        for _i in 0..40 {
80✔
3065
            reads.add_read(DnaString::from_acgt_bytes(correct2.as_bytes()), None, IDTag::new(2, 0), Some(qu_corr2.as_bytes()));
80✔
3066
        }
80✔
3067

3068
        reads.add_read(DnaString::from_acgt_bytes(incorrect1.as_bytes()), None, IDTag::new(3, 1), Some(qu_incorr1.as_bytes()));
2✔
3069
        reads.add_read(DnaString::from_acgt_bytes(incorrect2.as_bytes()), None, IDTag::new(4, 1), Some(qu_incorr2.as_bytes()));
2✔
3070
        reads.add_read(DnaString::from_acgt_bytes(incorrect3.as_bytes()), None, IDTag::new(5, 1), Some(qu_incorr3.as_bytes()));
2✔
3071
        reads.add_read(DnaString::from_acgt_bytes(incorrect4_ins.as_bytes()), None, IDTag::new(6, 1), Some(qu_incorr4_ins.as_bytes()));
2✔
3072
        reads.add_read(DnaString::from_acgt_bytes(incorrect5_tip.as_bytes()), None, IDTag::new(7, 1), Some(qu_incorr5_tip.as_bytes()));
2✔
3073

3074

3075
        ReadsPaired::Unpaired { reads }
2✔
3076
    }
2✔
3077

3078
    #[test]
3079
    fn test_remove_lq_splits() {
1✔
3080
        let print = false;
1✔
3081

3082
        type K = Kmer16;
3083

3084
        let seqs = build_reads_quality_test();
1✔
3085
        let sample_info = SampleInfo::new(1, 0b111110, vec![1000, 10, 20, 20, 20, 20]);
1✔
3086
        let summary_config = SummaryConfig::new(sample_info);
1✔
3087
        let (kmers, _) = filter_kmers::<IDMapEMQualityData, K, IDTag>(&seqs, &summary_config, false, 1., false);
1✔
3088

3089

3090
        // make uncompressed graph
3091
        let mut unc_graph = uncompressed_graph(&kmers, true).finish();
1✔
3092

3093
        // add "mapped" ids to graph
3094
        for i in 0..unc_graph.len() {
263✔
3095
            let data = unc_graph.mut_data(i);
263✔
3096
            if let Some(ids) = data.ids() {
263✔
3097
                if ids.contains(&1) {
263✔
3098
                    data.set_mapped_ids(vec![1].into());
99✔
3099
                }
99✔
3100
                else if ids.contains(&2) {
164✔
3101
                    data.set_mapped_ids(vec![2].into());
80✔
3102
                }
84✔
NEW
3103
            }
×
3104
        }
3105

3106
        let colors = Colors::new(&unc_graph, &summary_config, crate::colors::ColorMode::IDS { n_ids: 7 });
1✔
3107

3108
        if print { unc_graph.to_dot("uncompressed_bf.dot", &|node| node.node_dot_default(&colors, &summary_config, &Translator::empty(), false, false), &|node, base, dir, flip| node.edge_dot_default(&colors, base, dir, flip)); }
1✔
3109
        let n_edges = unc_graph.iter_edges().count();
1✔
3110
        unc_graph.remove_lq_splits(BaseQuality::Medium).unwrap();
1✔
3111
        if print { unc_graph.to_dot("uncompressed_af.dot", &|node| node.node_dot_default(&colors, &summary_config, &Translator::empty(), false, false), &|node, base, dir, flip| node.edge_dot_default(&colors, base, dir, flip)); }
1✔
3112
    
3113
        let n_edges_af = unc_graph.iter_edges().count();
1✔
3114
        assert_eq!(n_edges, n_edges_af + 11);
1✔
3115

3116
        // make compressed graph
3117
        let spec = CheckCompress::new(|d: IDMapEMQualityData, _| d, |d, d1| d.join_test(d1));
260✔
3118
        let mut c_graph = compress_kmers_with_hash(true, &spec, &kmers, false, false).finish();
1✔
3119
    
3120
         // add "mapped" ids to graph
3121
        for i in 0..c_graph.len() {
28✔
3122
            let data = c_graph.mut_data(i);
28✔
3123
            if let Some(ids) = data.ids() {
28✔
3124
                if ids.contains(&1) {
28✔
3125
                    data.set_mapped_ids(vec![1].into());
16✔
3126
                }
16✔
3127
                else if ids.contains(&2) {
12✔
3128
                    data.set_mapped_ids(vec![2].into());
3✔
3129
                }
9✔
NEW
3130
            }
×
3131
        }
3132

3133
        let colors = Colors::new(&c_graph, &summary_config, crate::colors::ColorMode::IDS { n_ids: 7 });
1✔
3134

3135
        if print { c_graph.to_dot("compressed_bf.dot", &|node| node.node_dot_default(&colors, &summary_config, &Translator::empty(), false, false), &|node, base, dir, flip| node.edge_dot_default(&colors, base, dir, flip)); }
1✔
3136
        let n_edges = c_graph.iter_edges().count();
1✔
3137
        c_graph.remove_lq_splits(BaseQuality::Medium).unwrap();
1✔
3138
        if print { c_graph.to_dot("compressed_af.dot", &|node| node.node_dot_default(&colors, &summary_config, &Translator::empty(), false, false), &|node, base, dir, flip| node.edge_dot_default(&colors, base, dir, flip)); }
1✔
3139
    
3140
        let n_edges_af = c_graph.iter_edges().count();
1✔
3141
        assert_eq!(n_edges, n_edges_af + 11);
1✔
3142
    }
1✔
3143

3144
    #[test]
3145
    fn test_remove_lq_ladders_tips() {
1✔
3146
        let print = false;
1✔
3147

3148
        type K = Kmer16;
3149

3150
        let seqs = build_reads_quality_test();
1✔
3151
        let sample_info = SampleInfo::new(1, 0b111110, vec![1000, 10, 20, 20, 20, 20]);
1✔
3152
        let summary_config = SummaryConfig::new(sample_info);
1✔
3153
        let (kmers, _) = filter_kmers::<IDMapEMQualityData, K, IDTag>(&seqs, &summary_config, false, 1., false);
1✔
3154

3155

3156
        // make uncompressed graph
3157
        let mut unc_graph = uncompressed_graph(&kmers, true).finish();
1✔
3158

3159
        // add "mapped" ids to graph
3160
        for i in 0..unc_graph.len() {
263✔
3161
            let data = unc_graph.mut_data(i);
263✔
3162
            if let Some(ids) = data.ids() {
263✔
3163
                if ids.contains(&1) {
263✔
3164
                    data.set_mapped_ids(vec![1].into());
99✔
3165
                }
99✔
3166
                else if ids.contains(&2) {
164✔
3167
                    data.set_mapped_ids(vec![2].into());
80✔
3168
                }
84✔
NEW
3169
            }
×
3170
        }
3171

3172
        let colors = Colors::new(&unc_graph, &summary_config, crate::colors::ColorMode::IDS { n_ids: 7 });
1✔
3173

3174
        if print { unc_graph.to_dot("uncompressed_bf.dot", &|node| node.node_dot_default(&colors, &summary_config, &Translator::empty(), false, false), &|node, base, dir, flip| node.edge_dot_default(&colors, base, dir, flip)); }
1✔
3175
        let n_edges = unc_graph.iter_edges().count();
1✔
3176
        unc_graph.remove_lq_ladders_tips(BaseQuality::Medium).unwrap();
1✔
3177
        if print { unc_graph.to_dot("uncompressed_af.dot", &|node| node.node_dot_default(&colors, &summary_config, &Translator::empty(), false, false), &|node, base, dir, flip| node.edge_dot_default(&colors, base, dir, flip)); }
1✔
3178
    
3179
        let n_edges_af = unc_graph.iter_edges().count();
1✔
3180
        assert_eq!(n_edges, n_edges_af + 90);
1✔
3181

3182
        // make compressed graph
3183
        let spec = CheckCompress::new(|d: IDMapEMQualityData, _| d, |d, d1| d.join_test(d1));
260✔
3184
        let mut c_graph = compress_kmers_with_hash(true, &spec, &kmers, false, false).finish();
1✔
3185
    
3186
         // add "mapped" ids to graph
3187
        for i in 0..c_graph.len() {
28✔
3188
            let data = c_graph.mut_data(i);
28✔
3189
            if let Some(ids) = data.ids() {
28✔
3190
                if ids.contains(&1) {
28✔
3191
                    data.set_mapped_ids(vec![1].into());
16✔
3192
                }
16✔
3193
                else if ids.contains(&2) {
12✔
3194
                    data.set_mapped_ids(vec![2].into());
3✔
3195
                }
9✔
NEW
3196
            }
×
3197
        }
3198

3199
        let colors = Colors::new(&c_graph, &summary_config, crate::colors::ColorMode::IDS { n_ids: 7 });
1✔
3200

3201
        if print { c_graph.to_dot("compressed_bf.dot", &|node| node.node_dot_default(&colors, &summary_config, &Translator::empty(), false, false), &|node, base, dir, flip| node.edge_dot_default(&colors, base, dir, flip)); }
1✔
3202
        let n_edges = c_graph.iter_edges().count();
1✔
3203
        c_graph.remove_lq_ladders_tips(BaseQuality::Medium).unwrap();
1✔
3204
        if print { c_graph.to_dot("compressed_af.dot", &|node| node.node_dot_default(&colors, &summary_config, &Translator::empty(), false, false), &|node, base, dir, flip| node.edge_dot_default(&colors, base, dir, flip)); }
1✔
3205
    
3206
        let n_edges_af = c_graph.iter_edges().count();
1✔
3207
        assert_eq!(n_edges, n_edges_af + 15);
1✔
3208
    }
1✔
3209

3210
    #[test]
3211
    fn test_remove_ladders() {
1✔
3212
        let print = false; 
1✔
3213
        let c_csv = if print { Some("c_ladders.csv") } else { None };
1✔
3214
        let uc_csv = if print { Some("uc_ladders.csv") } else { None };
1✔
3215

3216
        let   correct = "ACGATCGATCGCGATCGTAGCTGACTGCTGACGTCTGACTACTGACTGATGCTAGCTATCGTGAC".as_bytes();
1✔
3217
        let incorrect = "ACGATCGATCGCGATCGTAGCTGACTGCTGACGGCTGACTACTGACTGATGCTAGCTATCGTGAC".as_bytes();
1✔
3218
        let incorrec2 = "TGACAGCTGACGGCTGACTACTACGTCACTGACGATGCTGACAC".as_bytes();
1✔
3219
        let incorrec3 = "AAAAAAAAAGCTGACTGCTGACGGCTG".as_bytes();
1✔
3220
        let incorrec4 = "ACGGCTGACTACTGACTGAAAAAAAAAAA".as_bytes();
1✔
3221

3222
        let insertion = "ACGATCGATCGCGATCGATAGCTGACTGCTGACGTCTGACTACTGACTGATGCTAGCTATCGTGAC".as_bytes();
1✔
3223

3224
        let mut reads = Reads::new(crate::reads::Strandedness::Forward);
1✔
3225
        for _i in 0..1000 {
1,000✔
3226
            reads.add_from_bytes(correct, None, IDTag::new(0, 0));
1,000✔
3227
        }
1,000✔
3228

3229
        for _i in 0..2 {
2✔
3230
            reads.add_from_bytes(incorrect, None, IDTag::new(1, 1)); // should be removed
2✔
3231
        }
2✔
3232

3233
        for _i in 0..15 {
15✔
3234
            reads.add_from_bytes(incorrec2, None, IDTag::new(2, 2)); // should be removed
15✔
3235
        }
15✔
3236

3237
        for _i in 0..25 {
25✔
3238
            reads.add_from_bytes(incorrec3, None, IDTag::new(3, 3)); // should be removed
25✔
3239
        }
25✔
3240

3241
        for _i in 0..30 {
30✔
3242
            reads.add_from_bytes(incorrec4, None, IDTag::new(4, 4)); // should be removed
30✔
3243
        }
30✔
3244

3245

3246
        for _i in 0..1 {
1✔
3247
            reads.add_from_bytes(insertion, None, IDTag::new(1, 3)); // should not be removed
1✔
3248
        }
1✔
3249

3250
        let seqs = ReadsPaired::Unpaired { reads };
1✔
3251
        let sample_info = SampleInfo::new(1, 0b111110, vec![1000, 10, 20, 20, 20, 20]);
1✔
3252
        let summary_config = SummaryConfig::new(sample_info);
1✔
3253
        let (kmers, _) = filter_kmers::<IDMapEMData, Kmer16, IDTag>(&seqs, &summary_config, false, 1., false);
1✔
3254

3255

3256
        // test with uncompressed graph
3257
        let mut unc_graph = uncompressed_graph(&kmers, true).finish();
1✔
3258
        // add ids to graph
3259
        for i in 0..unc_graph.len() {
127✔
3260
            let data = unc_graph.mut_data(i);
127✔
3261
            if let Some(ids) = data.ids() {
127✔
3262
                if ids.contains(&0) {
127✔
3263
                    data.set_mapped_ids(vec![0].into());
50✔
3264
                }
77✔
3265
            }
×
3266
        }
3267
        let colors = Colors::new(&unc_graph, &summary_config, crate::colors::ColorMode::IDS { n_ids: 5 });
1✔
3268
        if print { unc_graph.to_dot("uncompressed_bf.dot", &|node| node.node_dot_default(&colors, &summary_config, &Translator::empty(), false, false), &|node, base, dir, flip| node.edge_dot_default(&colors, base, dir, flip)); }
1✔
3269
        let n_edges = unc_graph.iter_edges().count();
1✔
3270
        unc_graph.remove_ladders(10, 10., uc_csv).unwrap();
1✔
3271
        if print { unc_graph.to_dot("uncompressed_af.dot", &|node| node.node_dot_default(&colors, &summary_config, &Translator::empty(), false, false), &|node, base, dir, flip| node.edge_dot_default(&colors, base, dir, flip)); }
1✔
3272
        assert_eq!(n_edges - 10, unc_graph.iter_edges().count());
1✔
3273

3274
        // test with compressed graph
3275
        let spec = CheckCompress::new(|d: IDMapEMData, _| d, |d, d1| d.join_test(d1));
124✔
3276
        let mut c_graph = compress_kmers_with_hash(true, &spec, &kmers, false, false).finish();
1✔
3277
        // add ids to graph
3278
        for i in 0..c_graph.len() {
15✔
3279
            let data = c_graph.mut_data(i);
15✔
3280
            if let Some(ids) = data.ids() {
15✔
3281
                if ids.contains(&0) {
15✔
3282
                    data.set_mapped_ids(vec![0].into());
5✔
3283
                }
10✔
3284
            }
×
3285
        }
3286
        let colors = Colors::new(&c_graph, &summary_config, crate::colors::ColorMode::IDS { n_ids: 5 });
1✔
3287
        if print { c_graph.to_dot("compressed_bf.dot", &|node| node.node_dot_default(&colors, &summary_config, &Translator::empty(), false, false), &|node, base, dir, flip| node.edge_dot_default(&colors, base, dir, flip)); }
1✔
3288
        let n_edges = c_graph.iter_edges().count();
1✔
3289
        c_graph.remove_ladders(10, 10., c_csv).unwrap();
1✔
3290
        if print { c_graph.to_dot("compressed_af.dot", &|node| node.node_dot_default(&colors, &summary_config, &Translator::empty(), false, false), &|node, base, dir, flip| node.edge_dot_default(&colors, base, dir, flip)); }
1✔
3291
        assert_eq!(n_edges - 6, c_graph.iter_edges().count());
1✔
3292
    }
1✔
3293

3294
    #[test]
3295
    fn test_remove_tips() {
1✔
3296

3297
        let print = false;
1✔
3298
        let c_csv = if print { Some("c_tips.csv") } else { None };
1✔
3299
        let uc_csv = if print { Some("uc_tips.csv") } else { None };
1✔
3300

3301
        let     correct = "ACGATCGATCGCGATCGTAGCTGACTGCTGACGTCTGACTACTGACTGATGCTAGCTATCGTGAC".as_bytes();
1✔
3302
        let incorrect_r = "ACGATCGATCGCGATCGTAGCTGACTGCTGACGTCTGACTACTGACTGATGCTAGCTAACGTGAC".as_bytes();
1✔
3303
        let incorrect_l = "ACGTTCGATCGCGATCGTAGCTGACTGCTGACGTCTGACTACTGACTGATGCTAGCTATCGTGAC".as_bytes();
1✔
3304

3305

3306

3307
        let mut reads = Reads::new(crate::reads::Strandedness::Forward);
1✔
3308
        for _i in 0..1000 {
1,000✔
3309
            reads.add_from_bytes(correct, None, IDTag::new(0, 0));
1,000✔
3310
        }
1,000✔
3311

3312
        for _i in 0..10 {
10✔
3313
            reads.add_from_bytes(incorrect_r, None, IDTag::new(1, 1)); // should be removed
10✔
3314
        }
10✔
3315

3316
        for _i in 0..10 {
10✔
3317
            reads.add_from_bytes(incorrect_l, None, IDTag::new(2, 2)); // should be removed
10✔
3318
        }
10✔
3319

3320
        let seqs = ReadsPaired::Unpaired { reads };
1✔
3321
        let sample_info = SampleInfo::new(1, 6, vec![1000, 10, 20]);
1✔
3322
        let summary_config = SummaryConfig::new(sample_info);
1✔
3323
        let (kmers, _) = filter_kmers::<IDMapEMData, Kmer16, IDTag>(&seqs, &summary_config, false, 1., false);
1✔
3324

3325

3326
        // test with uncompressed graph
3327
        let mut unc_graph = uncompressed_graph(&kmers, true).finish();
1✔
3328
        // add ids to graph
3329
        for i in 0..unc_graph.len() {
61✔
3330
            let data = unc_graph.mut_data(i);
61✔
3331
            if let Some(ids) = data.ids() {
61✔
3332
                if ids.contains(&0) {
61✔
3333
                    data.set_mapped_ids(vec![0].into());
50✔
3334
                }
50✔
3335
            }
×
3336
        }
3337

3338
        let colors = Colors::new(&unc_graph, &summary_config, crate::colors::ColorMode::IDS { n_ids: 3 });
1✔
3339
        if print { unc_graph.to_dot("uncompressed_bf.dot", &|node| node.node_dot_default(&colors, &summary_config, &Translator::empty(), false, false), &|node, base, dir, flip| node.edge_dot_default(&colors, base, dir, flip)); }
1✔
3340
        let n_edges = unc_graph.iter_edges().count();
1✔
3341
        
3342
        unc_graph.remove_tips(10, 10., uc_csv).unwrap();
1✔
3343
        if print { unc_graph.to_dot("uncompressed_af.dot", &|node| node.node_dot_default(&colors, &summary_config, &Translator::empty(), false, false), &|node, base, dir, flip| node.edge_dot_default(&colors, base, dir, flip)); }
1✔
3344
        assert_eq!(n_edges - 11, unc_graph.iter_edges().count());
1✔
3345

3346
        // test with compressed graph
3347
        let spec = CheckCompress::new(|d: IDMapEMData, _| d, |d, d1| d.join_test(d1));
56✔
3348
        let mut c_graph = compress_kmers_with_hash(true, &spec, &kmers, false, false).finish();
1✔
3349
        // add ids to graph
3350
        for i in 0..c_graph.len() {
5✔
3351
            let data = c_graph.mut_data(i);
5✔
3352
            if let Some(ids) = data.ids() {
5✔
3353
                if ids.contains(&0) {
5✔
3354
                    data.set_mapped_ids(vec![0].into());
3✔
3355
                }
3✔
3356
            }
×
3357
        }
3358

3359
        let colors = Colors::new(&c_graph, &summary_config, crate::colors::ColorMode::SampleGroups);
1✔
3360
        if print { c_graph.to_dot("compressed_bf.dot", &|node| node.node_dot_default(&colors, &summary_config, &Translator::empty(), false, false), &|node, base, dir, flip| node.edge_dot_default(&colors, base, dir, flip)); }
1✔
3361
        let n_edges = c_graph.iter_edges().count();
1✔
3362
        
3363
        c_graph.remove_tips(10, 10., c_csv).unwrap();
1✔
3364
        if print { c_graph.to_dot("compressed_af.dot", &|node| node.node_dot_default(&colors, &summary_config, &Translator::empty(), false, false), &|node, base, dir, flip| node.edge_dot_default(&colors, base, dir, flip)); }
1✔
3365
        assert_eq!(n_edges - 2, c_graph.iter_edges().count());
1✔
3366

3367
    }
1✔
3368

3369
    #[test]
3370
    fn test_to_tsv() {
1✔
3371
        let reads_us = Reads::from_vmer_vec(
1✔
3372
            (0..10).map(|i| (DnaString::from_bytes(&random_dna(100)), Exts::empty(), i as u8)).collect::<Vec<_>>(), 
10✔
3373
            crate::reads::Strandedness::Unstranded
1✔
3374
        );
3375

3376
        let reads_paired = ReadsPaired::Unpaired { reads: reads_us };
1✔
3377

3378
        let sample_info = SampleInfo::new(0b1111100000, 0b0000011111, vec![1, 2, 3, 4, 5, 6, 7, 8, 9, 10]);
1✔
3379
        let summary_config = SummaryConfig::new(sample_info);
1✔
3380
        let (kmers, _) = filter_kmers::<TagsCountsData, Kmer6, _>(&reads_paired, &summary_config, false, 1., false);
1✔
3381

3382
        let graph = compress_kmers_with_hash(false, &ScmapCompress::new(), &kmers, false, false).finish();
1✔
3383

3384
        graph.to_tsv("test_graph_unstranded.tsv", |node| node.data().print_ol(&Translator::empty(), &summary_config, None)).unwrap();
327✔
3385

3386

3387
        let reads_us = Reads::from_vmer_vec(
1✔
3388
            (0..10).map(|i| (DnaString::from_bytes(&random_dna(100)), Exts::empty(), i as u8)).collect::<Vec<_>>(), 
10✔
3389
            crate::reads::Strandedness::Forward
1✔
3390
        );
3391

3392
        let reads_paired = ReadsPaired::Unpaired { reads: reads_us };
1✔
3393

3394
        let sample_info = SampleInfo::new(0b1111100000, 0b0000011111, vec![1, 2, 3, 4, 5, 6, 7, 8, 9, 10]);
1✔
3395
        let summary_config = SummaryConfig::new(sample_info);
1✔
3396
        let (kmers, _) = filter_kmers::<TagsCountsData, Kmer6, _>(&reads_paired, &summary_config, false, 1., false);
1✔
3397

3398
        let graph = compress_kmers_with_hash(true, &ScmapCompress::new(), &kmers, false, false).finish();
1✔
3399

3400
        graph.to_tsv("test_graph_stranded.tsv", |node| node.data().print_ol(&Translator::empty(), &summary_config, None)).unwrap();
209✔
3401
    
3402
        remove_file("test_graph_unstranded.tsv").unwrap();
1✔
3403
        remove_file("test_graph_stranded.tsv").unwrap();
1✔
3404
    }
1✔
3405
}
3406

STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2026 Coveralls, Inc