• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

jlab / rust-debruijn / 25375300776

05 May 2026 12:05PM UTC coverage: 88.403% (+0.2%) from 88.239%
25375300776

Pull #43

github

web-flow
Merge cea6d41d5 into 32f2ffd65
Pull Request #43: Memory fixes, extended error removal methods, add `EdgeMap`

774 of 855 new or added lines in 6 files covered. (90.53%)

8 existing lines in 2 files now uncovered.

9308 of 10529 relevant lines covered (88.4%)

3804882.38 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

82.9
/src/graph.rs
1
// Copyright 2017 10x Genomics
2

3
//! Containers for path-compressed De Bruijn graphs
4

5
use bimap::BiHashMap;
6
use bio::io::fasta;
7
use bit_set::BitSet;
8
use indicatif::ProgressBar;
9
use indicatif::ProgressIterator;
10
use indicatif::ProgressStyle;
11
use itertools::enumerate;
12
use log::warn;
13
use log::{debug, trace};
14
use rayon::prelude::*;
15
use rayon::current_num_threads;
16
use serde_derive::{Deserialize, Serialize};
17
use smallvec::SmallVec;
18
use std::borrow::Borrow;
19

20
use std::collections::HashSet;
21
use std::collections::VecDeque;
22
use std::f32;
23
use std::fmt::{self, Debug, Display};
24
use std::fs::{remove_file, File};
25
use std::hash::Hash;
26
use std::io::BufWriter;
27
use std::io::{BufReader, Error, Read};
28
use std::io::Write;
29
use std::iter::FromIterator;
30
use std::marker::PhantomData;
31
use std::path::Path;
32

33
use boomphf::hashmap::BoomHashMap;
34

35
use serde_json;
36
use serde_json::Value;
37

38
type SmallVec4<T> = SmallVec<[T; 4]>;
39
type SmallVec8<T> = SmallVec<[T; 8]>;
40

41
use crate::BaseQuality;
42
use crate::EdgeMap;
43
use crate::bits_to_base;
44
use crate::colors::ColorMode;
45
use crate::colors::Colors;
46
use crate::compression::CompressionSpec;
47
use crate::dna_string::{DnaString, DnaStringSlice, PackedDnaStringSet};
48
use crate::summarizer::SummaryConfig;
49
use crate::summarizer::SummaryData;
50
use crate::summarizer::Translator;
51
use crate::summarizer::ID;
52
use crate::BUF;
53
use crate::PROGRESS_STYLE;
54
use crate::{Dir, Exts, Kmer, Mer, Vmer};
55

56
/// A compressed DeBruijn graph carrying auxiliary data on each node of type `D`.
57
/// This type does not carry the sorted index arrays the allow the graph
58
/// to be walked efficiently. The `DeBruijnGraph` type wraps this type and add those
59
/// vectors.
60
#[derive(Serialize, Deserialize, Clone, Debug)]
61
pub struct BaseGraph<K, D> {
62
    pub sequences: PackedDnaStringSet,
63
    pub exts: Vec<Exts>,
64
    pub data: Vec<D>,
65
    pub stranded: bool,
66
    phantom: PhantomData<K>,
67
}
68

69
impl<K, D> BaseGraph<K, D> {
70
    pub fn new(stranded: bool) -> Self {
2,391✔
71
        BaseGraph {
2,391✔
72
            sequences: PackedDnaStringSet::new(),
2,391✔
73
            exts: Vec::new(),
2,391✔
74
            data: Vec::new(),
2,391✔
75
            phantom: PhantomData,
2,391✔
76
            stranded,
2,391✔
77
        }
2,391✔
78
    }
2,391✔
79

80
    pub fn len(&self) -> usize {
279,307✔
81
        self.sequences.len()
279,307✔
82
    }
279,307✔
83

84
    pub fn is_empty(&self) -> bool {
1✔
85
        self.sequences.is_empty()
1✔
86
    }
1✔
87

88
    pub fn combine<I: Iterator<Item = BaseGraph<K, D>>>(graphs: I) -> Self {
12✔
89
        let mut sequences = PackedDnaStringSet::new();
12✔
90
        let mut exts = Vec::new();
12✔
91
        let mut data = Vec::new();
12✔
92
        let mut stranded = Vec::new();
12✔
93

94
        for g in graphs {
1,554✔
95
            for s in 0..g.sequences.len() {
5,073✔
96
                sequences.add(&g.sequences.get(s));
5,073✔
97
            }
5,073✔
98

99
            exts.extend(g.exts);
1,554✔
100
            data.extend(g.data);
1,554✔
101
            stranded.push(g.stranded);
1,554✔
102
        }
103

104
        let out_stranded = stranded.iter().all(|x| *x);
12✔
105

106
        if !out_stranded && !stranded.iter().all(|x| !*x) {
1,554✔
107
            panic!("attempted to combine stranded and unstranded graphs");
×
108
        }
12✔
109

110
        BaseGraph {
12✔
111
            sequences,
12✔
112
            stranded: out_stranded,
12✔
113
            exts,
12✔
114
            data,
12✔
115
            phantom: PhantomData,
12✔
116
        }
12✔
117
    }
12✔
118

119
    /// shrink the storage of the `BaseGraph` to fit its contents
120
    pub fn shrink_to_fit(&mut self) {
2,046✔
121
        self.sequences.shrink_to_fit();
2,046✔
122
        self.exts.shrink_to_fit();
2,046✔
123
        self.data.shrink_to_fit();
2,046✔
124
    }
2,046✔
125
}
126

127
impl<K: Kmer, D> BaseGraph<K, D> {
128
    pub fn add<R: Borrow<u8>, S: IntoIterator<Item = R>>(
2,133,583✔
129
        &mut self,
2,133,583✔
130
        sequence: S,
2,133,583✔
131
        exts: Exts,
2,133,583✔
132
        data: D,
2,133,583✔
133
    ) {
2,133,583✔
134
        self.sequences.add(sequence);
2,133,583✔
135
        self.exts.push(exts);
2,133,583✔
136
        self.data.push(data);
2,133,583✔
137
    }
2,133,583✔
138
}
139

140
impl<K: Kmer + Send + Sync, D> BaseGraph<K, D> {
141
    pub fn finish(self) -> DebruijnGraph<K, D> {
726✔
142
        let indices: Vec<u32> = (0..self.len() as u32).collect();
726✔
143

144
        let left_order = {
726✔
145
            let mut kmers: Vec<K> = Vec::with_capacity(self.len());
726✔
146
            for idx in &indices {
1,449,244✔
147
                kmers.push(self.sequences.get(*idx as usize).first_kmer());
1,449,244✔
148
                
1,449,244✔
149
            }
1,449,244✔
150
            
151
            BoomHashMap::new_parallel(kmers, indices.clone())
726✔
152
        };
153

154
        let right_order = {
726✔
155
            let mut kmers: Vec<K> = Vec::with_capacity(self.len());
726✔
156
            for idx in &indices {
1,449,244✔
157
                kmers.push(self.sequences.get(*idx as usize).last_kmer());
1,449,244✔
158
            }
1,449,244✔
159
            
160
            BoomHashMap::new_parallel(kmers, indices)
726✔
161
        };
162
        debug!("finish graph loops: 2x {}", self.len());
726✔
163

164
        DebruijnGraph {
726✔
165
            base: self,
726✔
166
            left_order,
726✔
167
            right_order,
726✔
168
        }
726✔
169
    }
726✔
170
}
171

172
impl<K: Kmer, D> BaseGraph<K, D> {
173
    pub fn finish_serial(self) -> DebruijnGraph<K, D> {
111✔
174
        let indices: Vec<u32> = (0..self.len() as u32).collect();
111✔
175

176
        let left_order = {
111✔
177
            let mut kmers: Vec<K> = Vec::with_capacity(self.len());
111✔
178
            let mut sequences: Vec<String> = Vec::new();
111✔
179
            for idx in &indices {
683,251✔
180
                kmers.push(self.sequences.get(*idx as usize).first_kmer());
683,251✔
181
                sequences.push(self.sequences.get(*idx as usize).to_dna_string());
683,251✔
182
            }
683,251✔
183

184
            BoomHashMap::new(kmers, indices.clone())
111✔
185
        };
186

187
        let right_order = {
111✔
188
            let mut kmers: Vec<K> = Vec::with_capacity(self.len());
111✔
189
            let mut sequences: Vec<String> = Vec::new();
111✔
190
            for idx in &indices {
683,251✔
191
                kmers.push(self.sequences.get(*idx as usize).last_kmer());
683,251✔
192
                sequences.push(self.sequences.get(*idx as usize).to_dna_string());
683,251✔
193
            }
683,251✔
194

195
            BoomHashMap::new(kmers, indices)
111✔
196
        };
197

198
        DebruijnGraph {
111✔
199
            base: self,
111✔
200
            left_order,
111✔
201
            right_order,
111✔
202
        }
111✔
203
    }
111✔
204
}
205

206
/// A compressed DeBruijn graph carrying auxiliary data on each node of type `D`.
207
/// The struct carries sorted index arrays the allow the graph
208
/// to be walked efficiently.
209
#[derive(Serialize, Deserialize, Debug)]
210
pub struct DebruijnGraph<K: Hash, D> {
211
    pub base: BaseGraph<K, D>,
212
    left_order: BoomHashMap<K, u32>,
213
    right_order: BoomHashMap<K, u32>,
214
}
215

216
impl<K: Kmer, D: Debug> DebruijnGraph<K, D> {
217
    /// Total number of nodes in the DeBruijn graph
218
    pub fn len(&self) -> usize {
276,772✔
219
        self.base.len()
276,772✔
220
    }
276,772✔
221

222
    pub fn is_empty(&self) -> bool {
1✔
223
        self.base.is_empty()
1✔
224
    }
1✔
225

226
    /// shrink the storage of the `DebruijnGraph` to fit its contents
NEW
227
    pub fn shrink_to_fit(&mut self)  {
×
NEW
228
        self.base.shrink_to_fit();
×
NEW
229
    }
×
230

231
    /// Get a node given it's `node_id`
232
    pub fn get_node(&'_ self, node_id: usize) -> Node<'_, K, D> {
5,455,411✔
233
        Node {
5,455,411✔
234
            node_id,
5,455,411✔
235
            graph: self,
5,455,411✔
236
        }
5,455,411✔
237
    }
5,455,411✔
238

239
    /// Get a node given it's `node_id`
240
    pub fn get_node_kmer(&'_ self, node_id: usize) -> NodeKmer<'_, K, D> {
×
241
        let node = self.get_node(node_id);
×
242
        let node_seq = node.sequence();
×
243

244
        NodeKmer {
×
245
            node_id,
×
246
            node_seq_slice: node_seq,
×
247
            phantom_d: PhantomData,
×
248
            phantom_k: PhantomData,
×
249
        }
×
250
    }
×
251

252
    /// Return an iterator over all nodes in the graph
253
    pub fn iter_nodes(&'_ self) -> NodeIter<'_, K, D> {
130✔
254
        NodeIter {
130✔
255
            graph: self,
130✔
256
            node_id: 0,
130✔
257
        }
130✔
258
    }
130✔
259

260
    /// Find the edges leaving node `node_id` in direction `Dir`. Should generally be
261
    /// accessed via a Node wrapper object
262
    fn find_edges(&self, node_id: usize, dir: Dir) -> SmallVec4<(u8, usize, Dir, bool)> {
459,950✔
263
        let exts = self.base.exts[node_id];
459,950✔
264
        let sequence = self.base.sequences.get(node_id);
459,950✔
265
        let kmer: K = sequence.term_kmer(dir);
459,950✔
266
        let mut edges = SmallVec4::new();
459,950✔
267

268
        for i in 0..4 {
1,839,800✔
269
            if exts.has_ext(dir, i) {
1,839,800✔
270
                let link = self.find_link(kmer.extend(i, dir), dir).expect("missing link");
869,612✔
271
                edges.push((i, link.0, link.1, link.2));
869,612✔
272
            }
970,188✔
273
        }
274

275
        edges
459,950✔
276
    }
459,950✔
277

278
    /// Find the edges leaving node `node_id` in direction `Dir`. Should generally be
279
    /// accessed via a Node wrapper object
280
    /// 
281
    /// allows missing links
282
    fn _find_edges_sharded(&self, node_id: usize, dir: Dir) -> SmallVec4<(u8, usize, Dir, bool)> {
×
283
        let exts = self.base.exts[node_id];
×
284
        let sequence = self.base.sequences.get(node_id);
×
285
        let kmer: K = sequence.term_kmer(dir);
×
286
        let mut edges = SmallVec4::new();
×
287

288
        for i in 0..4 {
×
289
            if exts.has_ext(dir, i) {
×
290
                let link = self.find_link(kmer.extend(i, dir), dir); //.expect("missing link");
×
291
                if let Some(l) = link {
×
292
                    edges.push((i, l.0, l.1, l.2));
×
293
                }
×
294
                // Otherwise, this edge doesn't exist within this shard, so ignore it.
295
                // NOTE: this should be allowed in a 'complete' DBG
296
            }
×
297
        }
298

299
        edges
×
300
    }
×
301

302
    /// Seach for the kmer `kmer`, appearing at the given `side` of a node sequence.
303
    fn search_kmer(&self, kmer: K, side: Dir) -> Option<usize> {
6,292,383✔
304
        match side {
6,292,383✔
305
            Dir::Left => self.left_order.get(&kmer).map(|pos| *pos as usize),
3,144,971✔
306
            Dir::Right => self.right_order.get(&kmer).map(|pos| *pos as usize),
3,147,412✔
307
        }
308
    }
6,292,383✔
309

310
    /// Find a link in the graph, possibly handling a RC switch.
311
    pub fn find_link(&self, kmer: K, dir: Dir) -> Option<(usize, Dir, bool)> {
4,395,850✔
312
        // Only test self-consistent paths through
313
        // the edges
314
        // Avoids problems due to single kmer edges
315
        // (dir, next_side_incoming, rc)
316
        // (Dir::Left, Dir::Right, false) => true,
317
        // (Dir::Left, Dir::Left,  true) => true,
318
        // (Dir::Right, Dir::Left, false) => true,
319
        // (Dir::Right, Dir::Right, true) => true,
320

321
        let rc = kmer.rc();
4,395,850✔
322

323
        match dir {
4,395,850✔
324
            Dir::Left => {
325
                if let Some(idx) = self.search_kmer(kmer, Dir::Right) {
2,198,156✔
326
                    return Some((idx, Dir::Right, false));
1,250,877✔
327
                }
947,279✔
328

329
                if !self.base.stranded {
947,279✔
330
                    if let Some(idx) = self.search_kmer(rc, Dir::Left) {
947,277✔
331
                        return Some((idx, Dir::Left, true));
947,047✔
332
                    }
230✔
333
                }
2✔
334
            }
335

336
            Dir::Right => {
337
                if let Some(idx) = self.search_kmer(kmer, Dir::Left) {
2,197,694✔
338
                    return Some((idx, Dir::Left, false));
1,250,368✔
339
                }
947,326✔
340

341
                if !self.base.stranded {
947,326✔
342
                    if let Some(idx) = self.search_kmer(rc, Dir::Right) {
947,324✔
343
                        return Some((idx, Dir::Right, true));
947,102✔
344
                    }
222✔
345
                }
2✔
346
            }
347
        }
348

349
        None
456✔
350
    }
4,395,850✔
351

352
    /// Check whether the graph is fully compressed. Return `None` if it's compressed,
353
    /// otherwise return `Some(node1, node2)` representing a pair of node that could
354
    /// be collapsed. Probably only useful for testing.
355
    pub fn is_compressed<S: CompressionSpec<D>>(&self, spec: &S) -> Option<(usize, usize)> {
789✔
356
        for i in 0..self.len() {
128,092✔
357
            let n = self.get_node(i);
128,092✔
358

359
            for dir in &[Dir::Left, Dir::Right] {
256,184✔
360
                let dir_edges = n.edges(*dir);
256,184✔
361
                if dir_edges.len() == 1 {
256,184✔
362
                    let (_, next_id, return_dir, _) = dir_edges[0];
181,479✔
363
                    let next = self.get_node(next_id);
181,479✔
364

365
                    let ret_edges = next.edges(return_dir);
181,479✔
366
                    if ret_edges.len() == 1 {
181,479✔
367
                        // Test for us being a palindrome: this makes it OK
368
                        if n.len() == K::k() && n.sequence().first_kmer::<K>().is_palindrome() {
24✔
369
                            continue;
16✔
370
                        }
8✔
371

372
                        // Test for a neighbor being a palindrome: this makes it OK
373
                        if next.len() == K::k() && next.sequence().first_kmer::<K>().is_palindrome()
8✔
374
                        {
375
                            continue;
8✔
376
                        }
×
377

378
                        // Test for this edge representing a smooth circle (biting it's own tail):
379
                        if n.node_id == next_id {
×
380
                            continue;
×
381
                        }
×
382

383
                        if spec.join_test(n.data(), next.data()) {
×
384
                            // Found a unbranched edge that should have been eliminated
385
                            return Some((i, next_id));
×
386
                        }
×
387
                    }
181,455✔
388
                }
74,705✔
389
            }
390
        }
391

392
        None
789✔
393
    }
789✔
394

395
    /// Remove non-existent extensions that may be created due to filtered kmers
396
    /// 
397
    /// if `valid_nodes` if `None`, all nodes are valid
398
    pub fn fix_exts(&mut self, valid_nodes: Option<&BitSet>) {
360✔
399
        for i in 0..self.len() {
1,390,886✔
400
            let valid_exts = self.get_valid_exts(i, valid_nodes);
1,390,886✔
401
            self.base.exts[i] = valid_exts;
1,390,886✔
402
        }
1,390,886✔
403
    }
360✔
404

405
    pub fn get_valid_exts(&self, node_id: usize, valid_nodes: Option<&BitSet>) -> Exts {
1,390,886✔
406
        let mut new_exts = Exts::empty();
1,390,886✔
407
        let node = self.get_node(node_id);
1,390,886✔
408
        let exts = node.exts();
1,390,886✔
409
        let l_kmer: K = node.sequence().first_kmer();
1,390,886✔
410
        let r_kmer: K = node.sequence().last_kmer();
1,390,886✔
411

412
        let check_node = |id| match valid_nodes {
2,829,095✔
413
            Some(bs) => bs.contains(id),
1,392,922✔
414
            None => true,
1,436,173✔
415
        };
2,829,095✔
416

417
        for i in 0..4 {
5,563,544✔
418
            if exts.has_ext(Dir::Left, i) {
5,563,544✔
419
                match self.find_link(l_kmer.extend_left(i), Dir::Left) {
1,414,649✔
420
                    Some((target, _, _)) if check_node(target) => {
1,414,417✔
421
                        new_exts = new_exts.set(Dir::Left, i)
1,414,417✔
422
                    }
423
                    _ => (),
232✔
424
                }
425
            }
4,148,895✔
426

427
            if exts.has_ext(Dir::Right, i) {
5,563,544✔
428
                match self.find_link(r_kmer.extend_right(i), Dir::Right) {
1,414,902✔
429
                    Some((target, _, _)) if check_node(target) => {
1,414,678✔
430
                        new_exts = new_exts.set(Dir::Right, i)
1,414,678✔
431
                    }
432
                    _ => (),
224✔
433
                }
434
            }
4,148,642✔
435
        }
436

437
        new_exts
1,390,886✔
438
    }
1,390,886✔
439

440
    /// mutable reference to the auxiliary data of the node node_id
441
    pub fn mut_data(&mut self, node_id: usize) -> &mut D {
1,869✔
442
        &mut self.base.data[node_id]
1,869✔
443
    }
1,869✔
444

445
    /// Find the highest-scoring, unambiguous path in the graph. Each node get a score
446
    /// given by `score`. Any node where `solid_path(node) == True` are valid paths -
447
    /// paths will be terminated if there are multiple valid paths emanating from a node.
448
    pub fn max_path<F, F2>(&self, score: F, solid_path: F2) -> Vec<(usize, Dir)>
×
449
    where
×
450
        F: Fn(&D) -> f32,
×
451
        F2: Fn(&D) -> bool,
×
452
    {
453
        if self.is_empty() {
×
454
            return Vec::default();
×
455
        }
×
456

457
        let mut best_node = 0;
×
458
        let mut best_score = f32::MIN;
×
459
        for i in 0..self.len() {
×
460
            let node = self.get_node(i);
×
461
            let node_score = score(node.data());
×
462

463
            if node_score > best_score {
×
464
                best_node = i;
×
465
                best_score = node_score;
×
466
            }
×
467
        }
468

469
        let oscore = |state| match state {
×
470
            None => 0.0,
×
471
            Some((id, _)) => score(self.get_node(id).data()),
×
472
        };
×
473

474
        let osolid_path = |state| match state {
×
475
            None => false,
×
476
            Some((id, _)) => solid_path(self.get_node(id).data()),
×
477
        };
×
478

479
        // Now expand in each direction, greedily taking the best path. Stop if we hit a node we've
480
        // already put into the path
481
        let mut used_nodes = HashSet::new();
×
482
        let mut path = VecDeque::new();
×
483

484
        // Start w/ initial state
485
        used_nodes.insert(best_node);
×
486
        path.push_front((best_node, Dir::Left));
×
487

488
        for init in [(best_node, Dir::Left, false), (best_node, Dir::Right, true)].iter() {
×
489
            let &(start_node, dir, do_flip) = init;
×
490
            let mut current = (start_node, dir);
×
491

492
            loop {
493
                let mut next = None;
×
494
                let (cur_id, incoming_dir) = current;
×
495
                let node = self.get_node(cur_id);
×
496
                let edges = node.edges(incoming_dir.flip());
×
497

498
                let mut solid_paths = 0;
×
499
                for (_, id, dir, _) in edges {
×
500
                    let cand = Some((id, dir));
×
501
                    if osolid_path(cand) {
×
502
                        solid_paths += 1;
×
503
                    }
×
504

505
                    if oscore(cand) > oscore(next) {
×
506
                        next = cand;
×
507
                    }
×
508
                }
509

510
                if solid_paths > 1 {
×
511
                    break;
×
512
                }
×
513

514
                match next {
×
515
                    Some((next_id, next_incoming)) if !used_nodes.contains(&next_id) => {
×
516
                        if do_flip {
×
517
                            path.push_front((next_id, next_incoming.flip()));
×
518
                        } else {
×
519
                            path.push_back((next_id, next_incoming));
×
520
                        }
×
521

522
                        used_nodes.insert(next_id);
×
523
                        current = (next_id, next_incoming);
×
524
                    }
525
                    _ => break,
×
526
                }
527
            }
528
        }
529

530
        Vec::from_iter(path)
×
531
    }
×
532

533

534
    /// Find the highest-scoring, unambiguous path in the graph. Each node get a score
535
    /// given by `score`. Any node where `solid_path(node) == True` are valid paths -
536
    /// paths will be terminated if there are multiple valid paths emanating from a node.
537
    /// Returns vec with path for each component
538
    pub fn max_path_comp<F, F2>(&self, score: F, solid_path: F2) -> Vec<VecDeque<(usize, Dir)>>
1✔
539
    where
1✔
540
        F: Fn(&D) -> f32,
1✔
541
        F2: Fn(&D) -> bool,
1✔
542
    {
543
        if self.is_empty() {
1✔
544
            let vec: Vec<VecDeque<(usize, Dir)>> = Vec::new();
×
545
            return vec;
×
546
        }
1✔
547

548
        let components = self.iter_components();
1✔
549
        let mut paths: Vec<VecDeque<(usize, Dir)>> = Vec::new();
1✔
550

551
        for component in components {
2✔
552

553
            let current_comp = &component;
2✔
554
            
555

556
            let mut best_node = current_comp[0];
2✔
557
            let mut best_score = f32::MIN;
2✔
558
            for c in current_comp.iter() {
134✔
559
                let node = self.get_node(*c);
134✔
560
                let node_score = score(node.data());
134✔
561

562
                if node_score > best_score {
134✔
563
                    best_node = *c;
2✔
564
                    best_score = node_score;
2✔
565
                }
132✔
566
            }
567

568
            let oscore = |state| match state {
244✔
569
                None => 0.0,
122✔
570
                Some((id, _)) => score(self.get_node(id).data()),
122✔
571
            };
244✔
572

573
            let osolid_path = |state| match state {
122✔
574
                None => false,
×
575
                Some((id, _)) => solid_path(self.get_node(id).data()),
122✔
576
            };
122✔
577

578
            // Now expand in each direction, greedily taking the best path. Stop if we hit a node we've
579
            // already put into the path
580
            let mut used_nodes = HashSet::new();
2✔
581
            let mut path = VecDeque::new();
2✔
582

583
            // Start w/ initial state
584
            used_nodes.insert(best_node);
2✔
585
            path.push_front((best_node, Dir::Left));
2✔
586

587
            for init in [(best_node, Dir::Left, false), (best_node, Dir::Right, true)].iter() {
4✔
588
                let &(start_node, dir, do_flip) = init;
4✔
589
                let mut current = (start_node, dir);
4✔
590

591
                loop {
592
                    let mut next = None;
126✔
593
                    let (cur_id, incoming_dir) = current;
126✔
594
                    let node = self.get_node(cur_id);
126✔
595
                    let edges = node.edges(incoming_dir.flip());
126✔
596

597
                    let mut solid_paths = 0;
126✔
598
                    for (_, id, dir, _) in edges {
126✔
599
                        let cand = Some((id, dir));
122✔
600
                        if osolid_path(cand) {
122✔
601
                            solid_paths += 1;
122✔
602
                        }
122✔
603

604
                        if oscore(cand) > oscore(next) {
122✔
605
                            next = cand;
122✔
606
                        }
122✔
607
                    }
608

609
                    if solid_paths > 1 {
126✔
610
                        break;
×
611
                    }
126✔
612

613
                    match next {
122✔
614
                        Some((next_id, next_incoming)) if !used_nodes.contains(&next_id) => {
122✔
615
                            if do_flip {
122✔
616
                                path.push_front((next_id, next_incoming.flip()));
45✔
617
                            } else {
77✔
618
                                path.push_back((next_id, next_incoming));
77✔
619
                            }
77✔
620

621
                            used_nodes.insert(next_id);
122✔
622
                            current = (next_id, next_incoming);
122✔
623
                        }
624
                        _ => break,
4✔
625
                    }
626
                }
627
            }
628
            
629
            paths.push(path);
2✔
630
            //paths.push(Vec::from_iter(path));
631
        }
632

633
        paths
1✔
634
    
635
    }
1✔
636

637
    pub fn iter_max_path_comp<F, F2>(&'_ self, score: F, solid_path: F2) -> PathCompIter<'_, K, D, F, F2> 
3✔
638
    where 
3✔
639
    F: Fn(&D) -> f32,
3✔
640
    F2: Fn(&D) -> bool
3✔
641
    {
642
        let component_iterator = self.iter_components();
3✔
643
        PathCompIter { graph: self, component_iterator, graph_pos: 0, score, solid_path }
3✔
644
    }
3✔
645

646
    /// write the paths from `iter_max_path_comp` to a fasta file
647
    pub fn path_to_fasta<F, F2>(&self, f: &mut dyn std::io::Write, path_iter: PathCompIter<K, D, F, F2>, return_lens: bool) -> (Vec<usize>, Vec<usize>)
1✔
648
    where 
1✔
649
    F: Fn(&D) -> f32,
1✔
650
    F2: Fn(&D) -> bool
1✔
651
    {
652
        // width of fasta lines
653
        let columns = 80;
1✔
654

655
        // sizes of components and of paths
656
        let mut comp_sizes = Vec::new();
1✔
657
        let mut path_lens = Vec::new();
1✔
658

659
        for (seq_counter, (component, path)) in path_iter.enumerate() {
2✔
660
            // get dna sequence from path
661
            let seq = self.sequence_of_path(path.iter());
2✔
662

663
            //write header with length & start node
664
            writeln!(f, ">path{} len={} start_node={}", seq_counter, seq.len(), path[0].0).unwrap();
2✔
665

666
            // calculate how sequence has to be split up
667
            let slices = (seq.len() / columns) + 1;
2✔
668
            let mut ranges = Vec::with_capacity(slices);
2✔
669

670
            let mut start = 0;
2✔
671
            while start < seq.len() {
6✔
672
                ranges.push(start..start + columns);
4✔
673
                start += columns;
4✔
674
            }
4✔
675

676
            let last_start = ranges.pop().expect("no kmers in parallel ranges").start;
2✔
677
            ranges.push(last_start..seq.len());
2✔
678

679
            // split up sequence and write to file accordingly
680
            for range in ranges {
4✔
681
                writeln!(f, "{:?}", seq.slice(range.start, range.end)).unwrap();
4✔
682
            }
4✔
683

684
            if return_lens {
2✔
685
                comp_sizes.push(component.len());
×
686
                path_lens.push(path.len());
×
687
            }
2✔
688
        }    
689

690
        (comp_sizes, path_lens)
1✔
691
        
692
    }
1✔
693

694

695
    /// Get the sequence of a path through the graph. The path is given as a sequence of node_id integers
696
    pub fn sequence_of_path<'a, I: 'a + Iterator<Item = &'a (usize, Dir)>>(
19,399✔
697
        &self,
19,399✔
698
        path: I,
19,399✔
699
    ) -> DnaString {
19,399✔
700
        let mut seq = DnaString::new();
19,399✔
701

702
        for (idx, &(node_id, dir)) in path.enumerate() {
688,696✔
703
            let start = if idx == 0 { 0 } else { K::k() - 1 };
688,696✔
704

705
            let node_seq = match dir {
688,696✔
706
                Dir::Left => self.get_node(node_id).sequence(),
357,671✔
707
                Dir::Right => self.get_node(node_id).sequence().rc(),
331,025✔
708
            };
709

710
            for p in start..node_seq.len() {
1,321,538✔
711
                seq.push(node_seq.get(p))
1,321,538✔
712
            }
713
        }
714

715
        seq
19,399✔
716
    }
19,399✔
717

718
    /// map sequences from a fasta reference to the nodes of a **completely uncompressed** debruijn graph
719
    /// 
720
    /// the IDs are stored with the node if the k-mer occured in the reference
721
    pub fn map_transcripts<P>(&self, path: P, translator: &mut Translator) -> Result<Vec<Box<[ID]>>, String> 
3✔
722
    where 
3✔
723
        P: AsRef<Path>
3✔
724
    {
725
        let reader = fasta::Reader::new(BufReader::new(File::open(path).unwrap()));
3✔
726
        let mut node_transcript_ids = vec![Vec::new(); self.len()];
3✔
727

728
        // if the translator has a id translator, use it, else make a new one to use and put it into the translator
729
        let id_tr = if let Some(id_tr) = translator.mut_id_translator() {
3✔
730
            id_tr
1✔
731
        } else {
732
            let new_id_tr = BiHashMap::new();
2✔
733
            translator.mut_id_translator().replace(new_id_tr);
2✔
734
            
735
            let Some(id_tr) = translator.mut_id_translator() else { panic!("should not happen") };
2✔
736

737
            id_tr
2✔
738
        };
739

740
        // go through each transcript and map to graph
741
        for result in reader.records() {
12✔
742
            let record = result.expect("error parsing transcripts fasta");
12✔
743

744
            // get gene id or make new id
745
            let gene_id = match id_tr.get_by_left(&record.id().to_string()) {
12✔
746
                Some(id) => *id,
4✔
747
                None => {
748
                    let new_id = id_tr.len() as ID;
8✔
749
                    id_tr.insert(record.id().to_string(), new_id);
8✔
750
                    new_id
8✔
751
                }
752
            };
753

754
            // iterate over k-mers in transcript and find each one in the graph
755
            let sequence = DnaString::from_acgt_bytes(record.seq());
12✔
756
            for kmer in sequence.iter_kmers::<K>() {
1,449✔
757
                if self.base.stranded {
1,449✔
758
                    if let Some(node) = self.search_kmer(kmer, Dir::Right) {
1,449✔
759
                        node_transcript_ids[node].push(gene_id);
1,377✔
760
                    }
1,377✔
761
                } else {
762
                    // graph is not stranded, look for both the k-mer ansd its reverse complement
NEW
763
                    if let Some(node) = self.search_kmer(kmer, Dir::Right) {
×
NEW
764
                        node_transcript_ids[node].push(gene_id);
×
NEW
765
                    } else if let Some(node) = self.search_kmer(kmer.rc(), Dir::Right) {
×
NEW
766
                        node_transcript_ids[node].push(gene_id);
×
NEW
767
                    }
×
768
                }
769
                
770
            }
771
        }
772

773
        // change to boxed slices to reduce memory
774
        let boxed_transcripts = node_transcript_ids.into_iter().map(|vec| vec.into()).collect();
1,461✔
775

776
        Ok(boxed_transcripts)
3✔
777
    }
3✔
778

779
    /// map sequences from a fasta reference to the edges of a **completely uncompressed** debruijn graph
780
    /// 
781
    /// the IDs are stored with the edges if the two k-mers occured together in the reference
782
    pub fn map_transcripts_to_edges<P>(&self, path: P, translator: &mut Translator) -> Result<Vec<EdgeMap>, String> 
1✔
783
    where 
1✔
784
        P: AsRef<Path>
1✔
785
    {
786
        let reader = fasta::Reader::new(BufReader::new(File::open(path).unwrap()));
1✔
787
        let mut node_edge_transcript_ids = vec![EdgeMap::default(); self.len()];
1✔
788

789
        // if the translator has a id translator, use it, else make a new one to use and put it into the translator
790
        let id_tr = if let Some(id_tr) = translator.mut_id_translator() {
1✔
791
            id_tr
1✔
792
        } else {
NEW
793
            let new_id_tr = BiHashMap::new();
×
NEW
794
            translator.mut_id_translator().replace(new_id_tr);
×
795
            
NEW
796
            let Some(id_tr) = translator.mut_id_translator() else { panic!("should not happen") };
×
797

NEW
798
            id_tr
×
799
        };
800

801
        // go through each transcript and map to graph
802
        for result in reader.records() {
4✔
803
            let record = result.expect("error parsing transcripts fasta");
4✔
804

805
            // get gene id or make new id
806
            let gene_id = match id_tr.get_by_left(&record.id().to_string()) {
4✔
807
                Some(id) => *id,
4✔
808
                None => {
NEW
809
                    let new_id = id_tr.len() as ID;
×
NEW
810
                    id_tr.insert(record.id().to_string(), new_id);
×
NEW
811
                    new_id
×
812
                }
813
            };
814

815
            // iterate over k-mers in transcript and find each one in the graph
816
            let sequence = DnaString::from_acgt_bytes(record.seq());
4✔
817

818
            for (kmer, exts) in sequence.iter_kmer_exts::<K>(Exts::empty()) {
483✔
819
                if self.base.stranded {
483✔
820
                    if let Some(node) = self.search_kmer(kmer, Dir::Right) {
483✔
821
                        node_edge_transcript_ids[node].add_id(exts, gene_id);
459✔
822
                    }
459✔
823
                } else {
824
                    // graph is not stranded, look for both the k-mer ansd its reverse complement
NEW
825
                    if let Some(node) = self.search_kmer(kmer, Dir::Right) {
×
NEW
826
                        node_edge_transcript_ids[node].add_id(exts, gene_id);
×
NEW
827
                    } else if let Some(node) = self.search_kmer(kmer.rc(), Dir::Right) {
×
NEW
828
                        node_edge_transcript_ids[node].add_id(exts.rc(), gene_id);
×
NEW
829
                    }
×
830
                }
831
                
832
            }
833
        }
834

835
        // remove unncecessary memory from vector
836
        node_edge_transcript_ids.shrink_to_fit();
1✔
837

838
        Ok(node_edge_transcript_ids)
1✔
839
    }
1✔
840

841
    /// write a node to a dot file
842
    /// 
843
    /// ### Arguments: 
844
    /// 
845
    /// * `node`: [`Node<K, D>`] which will be written to a dot file
846
    /// * `node_label`: closure taking [`Node<K, D>`] and returning a string containing commands for dot nodes 
847
    /// * `edge_label`: closure taking [`Node<K, D>`], the base as a [`u8`], the incoming [`Dir`] of the edge 
848
    ///   and if the neighbor is flipped - returns a string containing commands for dot edges, 
849
    /// * `f`: writer
850
    fn node_to_dot<FN: Fn(&Node<K, D>) -> String, FE: Fn(&Node<K, D>, u8, Dir, bool) -> String>(
1,457✔
851
        &self,
1,457✔
852
        node: &Node<'_, K, D>,
1,457✔
853
        node_label: &FN,
1,457✔
854
        edge_label: &FE,
1,457✔
855
        f: &mut dyn Write,
1,457✔
856
    ) {
1,457✔
857
        writeln!(f, "n{} {}", node.node_id, node_label(node)).unwrap();
1,457✔
858
        assert_eq!(node.exts().val.count_ones() as usize, node.l_edges().len() + node.r_edges().len());
1,457✔
859

860
        for (base, id, incoming_dir, flipped) in node.l_edges() {
1,457✔
861
            writeln!(f, "n{} -> n{} {}", id, node.node_id, edge_label(node, base, incoming_dir, flipped)).unwrap();
1,363✔
862
        }
1,363✔
863

864
        for (base, id, incoming_dir, flipped) in node.r_edges() {
1,457✔
865
            writeln!(f, "n{} -> n{} {}", node.node_id, id, edge_label(node, base, incoming_dir, flipped)).unwrap();
1,363✔
866
        }
1,363✔
867
    }
1,457✔
868

869
    /// Write the graph to a dot file
870
    /// 
871
    /// ### Arguments: 
872
    /// 
873
    /// * `path`: path to the output file
874
    /// * `node_label`: closure taking [`Node<K, D>`] and returning a string containing commands for dot nodes, e.g. [`Node::node_dot_default`]
875
    /// * `edge_label`: closure taking [`Node<K, D>`], the base as a [`u8`], the incoming [`Dir`] of the edge, e.g. [`Node::edge_dot_default`]
876
    ///   and if the neighbor is flipped - returns a string containing commands for dot edges, 
877
    pub fn to_dot<P, FN, FE>(&self, path: P, node_label: &FN, edge_label: &FE) 
17✔
878
    where 
17✔
879
    P: AsRef<Path>,
17✔
880
    FN: Fn(&Node<K, D>) -> String,
17✔
881
    FE: Fn(&Node<K, D>, u8, Dir, bool) -> String,
17✔
882
    {
883
        let mut f = BufWriter::with_capacity(BUF, File::create(path).expect("error creating dot file"));
17✔
884

885
        let pb = ProgressBar::new(self.len() as u64);
17✔
886
        pb.set_style(ProgressStyle::with_template(PROGRESS_STYLE).unwrap().progress_chars("#/-"));
17✔
887
        pb.set_message(format!("{:<32}", "writing graph to DOT file"));
17✔
888

889
        writeln!(&mut f, "digraph {{\nrankdir=\"LR\"\nmodel=subset").unwrap();
17✔
890
        for i in (0..self.len()).progress_with(pb) {
1,247✔
891
            self.node_to_dot(&self.get_node(i), node_label, edge_label, &mut f);
1,247✔
892
        }
1,247✔
893
        writeln!(&mut f, "}}").unwrap();
17✔
894
        
895
        f.flush().unwrap();
17✔
896
        debug!("large to dot loop: {}", self.len());
17✔
897
    }
17✔
898

899
    /// Write the graph to a dot file, highlight the nodes which form the 
900
    /// "best" path, according to [`PathCompIter`], with the number of occurences 
901
    /// as the score and `solid_path` always `true`.
902
    /// The nodes are formatted according to [`Node::node_dot_default`].
903
    /// 
904
    /// ### Arguments: 
905
    /// 
906
    /// * `path`: path to the output file
907
    /// * `edge_label`: closure taking [`Node<K, D>`], the base as a [`u8`], the incoming [`Dir`] of the edge, e.g. [`Node::edge_dot_default`]
908
    ///   and if the neighbor is flipped - returns a string containing commands for dot edges, 
909
    /// * `colors`: a [`Colors`] with the color settings for the graph
910
    /// * `translator`: a [`Translator`] which translates tags or IDs to strings
911
    /// * `config`: a [`SummaryConfig`] which contains settings for the graph
912
    pub fn to_dot_with_path<P, FE, DI>(&self, path: P, edge_label: &FE, colors: &Colors<'_, D, DI>, translator: &Translator, config: &SummaryConfig, translate_id_groups: bool)
1✔
913
    where 
1✔
914
    P: AsRef<Path>,
1✔
915
    D: SummaryData<DI>,
1✔
916
    FE: Fn(&Node<K, D>, u8, Dir, bool) -> String,
1✔
917
    {
918
        let mut f = BufWriter::with_capacity(BUF, File::create(path).expect("error creating dot file"));
1✔
919

920
        writeln!(&mut f, "digraph {{\nrankdir=\"LR\"\nmodel=subset").unwrap();
1✔
921

922
        // iterate over components
923
        for (component, path) in self.iter_max_path_comp(|d| d.sum().unwrap_or(1) as f32, |_| true) {
194✔
924
            let hashed_path = path.into_iter().map(|(id, _)| id).collect::<HashSet<usize>>();
3✔
925
            for node_id in component {
103✔
926
                self.node_to_dot(
103✔
927
                    &self.get_node(node_id),
103✔
928
                    &|node| node.node_dot_default(colors, config, translator, hashed_path.contains(&node_id), translate_id_groups), 
103✔
929
                    edge_label, 
103✔
930
                    &mut f
103✔
931
                );
932
            }
933
        }
934

935
        writeln!(&mut f, "}}").unwrap();
1✔
936
        
937
        f.flush().unwrap();
1✔
938
        debug!("large to dot loop: {}", self.len());
1✔
939
    }
1✔
940

941
    /// Write the graph to a dot file in parallel
942
    /// Will write in to n_threads files simultaniously,
943
    /// then go though the files and add the contents to a larger file, 
944
    /// and delete the small files.
945
    /// 
946
    /// ### Arguments: 
947
    /// 
948
    /// * `path`: path to the output file
949
    /// * `node_label`: closure taking [`Node<K, D>`] and returning a string containing commands for dot nodes 
950
    /// * `edge_label`: closure taking [`Node<K, D>`], the base as a [`u8`], the incoming [`Dir`] of the edge 
951
    ///   and if the neighbor is flipped - returns a string containing commands for dot edges, 
952
    pub fn to_dot_parallel<P, FN, FE>(&self, path: P, node_label: &FN, edge_label: &FE) 
1✔
953
    where 
1✔
954
        D: Sync,
1✔
955
        K: Sync,
1✔
956
        P: AsRef<Path> + Display + Sync,
1✔
957
        FN: Fn(&Node<K, D>) -> String + Sync,
1✔
958
        FE: Fn(&Node<K, D>, u8, Dir, bool) -> String + Sync,
1✔
959
    {        
960
        let slices = current_num_threads();
1✔
961
        let n_nodes = self.len();
1✔
962
        let sz = n_nodes / slices + 1;
1✔
963

964
        debug!("n_nodes: {}", n_nodes);
1✔
965
        debug!("sz: {}", sz);
1✔
966

967
        let mut parallel_ranges = Vec::with_capacity(slices);
1✔
968
        let mut start = 0;
1✔
969
        while start < n_nodes {
5✔
970
            parallel_ranges.push(start..start + sz);
4✔
971
            start += sz;
4✔
972
        }
4✔
973

974
        let last_start = parallel_ranges.pop().expect("no kmers in parallel ranges").start;
1✔
975
        parallel_ranges.push(last_start..n_nodes);
1✔
976
        debug!("parallel ranges: {:?}", parallel_ranges);
1✔
977

978
        let mut files = Vec::with_capacity(current_num_threads());
1✔
979

980
        for i in 0..parallel_ranges.len() {
4✔
981
            files.push(format!("{}-{}.dot", path, i));
4✔
982
        } 
4✔
983

984
        let pb = ProgressBar::new(self.len() as u64);
1✔
985
        pb.set_style(ProgressStyle::with_template(PROGRESS_STYLE).unwrap().progress_chars("#/-"));
1✔
986
        pb.set_message(format!("{:<32}", "writing graph to DOT files"));
1✔
987
    
988
        parallel_ranges.into_par_iter().enumerate().for_each(|(i, range)| {
4✔
989
            let mut f = BufWriter::with_capacity(BUF, File::create(&files[i]).expect("error creating parallel dot file"));
4✔
990

991
            for i in range {
103✔
992
                self.node_to_dot(&self.get_node(i), node_label, edge_label, &mut f);
103✔
993
                pb.inc(1);
103✔
994
            }
103✔
995

996
            f.flush().unwrap();
4✔
997
        });
4✔
998
        pb.finish_and_clear();
1✔
999

1000
        let mut out_file = BufWriter::with_capacity(BUF, File::create(path).expect("error creating combined dot file"));
1✔
1001

1002
        writeln!(&mut out_file, "digraph {{\nrankdir=\"LR\"\nmodel=subset").unwrap();
1✔
1003

1004
        let pb = ProgressBar::new(files.len() as u64);
1✔
1005
        pb.set_style(ProgressStyle::with_template(PROGRESS_STYLE).unwrap().progress_chars("#/-"));
1✔
1006
        pb.set_message(format!("{:<32}", "combining files"));
1✔
1007

1008
        for file in files.iter().progress_with(pb) {
4✔
1009
            let open_file = File::open(file).expect("error opening parallel dot file");
4✔
1010
            let mut reader = BufReader::new(open_file);
4✔
1011
            let mut buffer = [0; BUF];
4✔
1012

1013
            loop {
1014
                let linecount = reader.read(&mut buffer).unwrap();
8✔
1015
                if linecount == 0 { break }
8✔
1016
                out_file.write_all(&buffer[..linecount]).unwrap();
4✔
1017
            }
1018

1019
            remove_file(file).unwrap();
4✔
1020
        }
1021

1022
        writeln!(&mut out_file, "}}").unwrap();
1✔
1023

1024
        out_file.flush().unwrap();
1✔
1025

1026

1027
    }
1✔
1028

1029

1030
    /// Write part of the graph to a dot file
1031
    /// 
1032
    /// ### Arguments: 
1033
    /// 
1034
    /// * `path`: path to the output file
1035
    /// * `node_label`: closure taking [`Node<K, D>`] and returning a string containing commands for dot nodes 
1036
    /// * `edge_label`: closure taking [`Node<K, D>`], the base as a [`u8`], the incoming [`Dir`] of the edge 
1037
    ///   and if the neighbor is flipped - returns a string containing commands for dot edges, 
1038
    /// * `nodes`: [`Vec<usize>`] listing all IDs of nodes which should be included
1039
    pub fn to_dot_partial<P, FN, FE>(&self, path: P, node_label: &FN, edge_label: &FE, nodes: &[usize]) 
1✔
1040
    where 
1✔
1041
        P: AsRef<Path>,
1✔
1042
        FN: Fn(&Node<K, D>) -> String,
1✔
1043
        FE: Fn(&Node<K, D>, u8, Dir, bool) -> String,
1✔
1044
    {
1045
        let mut f = BufWriter::with_capacity(BUF, File::create(path).expect("error creating dot file"));
1✔
1046

1047
        let pb = ProgressBar::new(nodes.len() as u64);
1✔
1048
        pb.set_style(ProgressStyle::with_template(PROGRESS_STYLE).unwrap().progress_chars("#/-"));
1✔
1049
        pb.set_message(format!("{:<32}", "writing graph to DOT file"));
1✔
1050

1051
        writeln!(&mut f, "digraph {{\nrankdir=\"LR\"\nmodel=subset").unwrap();
1✔
1052
        for i in nodes.iter().progress_with(pb) {
4✔
1053
            self.node_to_dot(&self.get_node(*i), node_label, edge_label, &mut f);
4✔
1054
        }
4✔
1055
        writeln!(&mut f, "}}").unwrap();
1✔
1056

1057
        f.flush().unwrap();
1✔
1058

1059
        debug!("large to dot loop: {}", self.len());
1✔
1060
    }
1✔
1061

1062
    fn node_to_gfa<F: Fn(&Node<'_, K, D>) -> String>(
313✔
1063
        &self,
313✔
1064
        node: &Node<'_, K, D>,
313✔
1065
        w: &mut dyn Write,
313✔
1066
        tag_func: Option<&F>,
313✔
1067
    ) -> Result<(), Error> {
313✔
1068
        match tag_func {
313✔
1069
            Some(f) => {
210✔
1070
                let tags = (f)(node);
210✔
1071
                writeln!(
210✔
1072
                    w,
210✔
1073
                    "S\t{}\t{}\t{}",
1074
                    node.node_id,
1075
                    node.sequence().to_dna_string(),
210✔
1076
                    tags
1077
                )?;
×
1078
            }
1079
            _ => writeln!(
103✔
1080
                w,
103✔
1081
                "S\t{}\t{}",
1082
                node.node_id,
1083
                node.sequence().to_dna_string()
103✔
1084
            )?,
×
1085
        }
1086

1087
        for (_, target, dir, _) in node.l_edges() {
313✔
1088
            if target >= node.node_id {
307✔
1089
                let to_dir = match dir {
151✔
1090
                    Dir::Left => "+",
×
1091
                    Dir::Right => "-",
151✔
1092
                };
1093
                writeln!(
151✔
1094
                    w,
151✔
1095
                    "L\t{}\t-\t{}\t{}\t{}M",
1096
                    node.node_id,
1097
                    target,
1098
                    to_dir,
1099
                    K::k() - 1
151✔
1100
                )?;
×
1101
            }
156✔
1102
        }
1103

1104
        for (_, target, dir, _) in node.r_edges() {
313✔
1105
            if target > node.node_id {
307✔
1106
                let to_dir = match dir {
160✔
1107
                    Dir::Left => "+",
160✔
1108
                    Dir::Right => "-",
×
1109
                };
1110
                writeln!(
160✔
1111
                    w,
160✔
1112
                    "L\t{}\t+\t{}\t{}\t{}M",
1113
                    node.node_id,
1114
                    target,
1115
                    to_dir,
1116
                    K::k() - 1
160✔
1117
                )?;
×
1118
            }
147✔
1119
        }
1120

1121
        Ok(())
313✔
1122
    }
313✔
1123

1124
    /// Write the graph to GFA format
1125
    pub fn to_gfa<P: AsRef<Path>>(&self, gfa_out: P) -> Result<(), Error> {
1✔
1126
        let wtr = BufWriter::with_capacity(BUF, File::create(gfa_out).expect("error creating gfa file"));
1✔
1127
        self.write_gfa(&mut std::io::BufWriter::new(wtr))
1✔
1128
    }
1✔
1129

1130
    pub fn write_gfa(&self, wtr: &mut impl Write) -> Result<(), Error> {
1✔
1131
        writeln!(wtr, "H\tVN:Z:debruijn-rs")?;
1✔
1132

1133
        type DummyFn<K, D> = fn(&Node<'_, K, D>) -> String;
1134
        let dummy_opt: Option<&DummyFn<K, D>> = None;
1✔
1135

1136
        let pb = ProgressBar::new(self.len() as u64);
1✔
1137
        pb.set_style(ProgressStyle::with_template(PROGRESS_STYLE).unwrap().progress_chars("#/-"));
1✔
1138
        pb.set_message(format!("{:<32}", "writing graph to GFA file"));
1✔
1139

1140
        for i in (0..self.len()).progress_with(pb) {
103✔
1141
            let n = self.get_node(i);
103✔
1142
            self.node_to_gfa(&n, wtr, dummy_opt)?;
103✔
1143
        }
1144

1145
        wtr.flush().unwrap();
1✔
1146

1147
        Ok(())
1✔
1148
    }
1✔
1149

1150
    /// Write the graph to GFA format
1151
    pub fn to_gfa_with_tags<P: AsRef<Path>, F: Fn(&Node<'_, K, D>) -> String>(
1✔
1152
        &self,
1✔
1153
        gfa_out: P,
1✔
1154
        tag_func: F,
1✔
1155
    ) -> Result<(), Error> {
1✔
1156
        let mut wtr = BufWriter::with_capacity(BUF, File::create(gfa_out).expect("error creatinf gfa file"));
1✔
1157
        writeln!(wtr, "H\tVN:Z:debruijn-rs")?;
1✔
1158

1159
        let pb = ProgressBar::new(self.len() as u64);
1✔
1160
        pb.set_style(ProgressStyle::with_template(PROGRESS_STYLE).unwrap().progress_chars("#/-"));
1✔
1161
        pb.set_message(format!("{:<32}", "writing graph to GFA file"));
1✔
1162

1163
        for i in (0..self.len()).progress_with(pb) {
103✔
1164
            let n = self.get_node(i);
103✔
1165
            self.node_to_gfa(&n, &mut wtr, Some(&tag_func))?;
103✔
1166
        }
1167

1168
        wtr.flush().unwrap();
1✔
1169

1170
        Ok(())
1✔
1171
    }
1✔
1172

1173
    /// Write the graph to GFA format, with multithreading, 
1174
    /// pass `tag_func=None` to write without tags
1175
    pub fn to_gfa_otags_parallel<P: AsRef<Path> + Display + Sync, F: Fn(&Node<'_, K, D>) -> String + Sync>(
1✔
1176
        &self,
1✔
1177
        gfa_out: P,
1✔
1178
        tag_func: Option<&F>,
1✔
1179
    ) -> Result<(), Error> 
1✔
1180
    where 
1✔
1181
    K: Sync,
1✔
1182
    D: Sync,
1✔
1183
    {
1184
        // split into ranges according to thread count
1185
        let slices = current_num_threads();
1✔
1186
        let n_nodes = self.len();
1✔
1187
        let sz = n_nodes / slices + 1;
1✔
1188

1189
        debug!("n_nodes: {}", n_nodes);
1✔
1190
        debug!("sz: {}", sz);
1✔
1191

1192
        let mut parallel_ranges = Vec::with_capacity(slices);
1✔
1193
        let mut start = 0;
1✔
1194
        while start < n_nodes {
5✔
1195
            parallel_ranges.push(start..start + sz);
4✔
1196
            start += sz;
4✔
1197
        }
4✔
1198

1199
        let last_start = parallel_ranges.pop().expect("no kmers in parallel ranges").start;
1✔
1200
        parallel_ranges.push(last_start..n_nodes);
1✔
1201
        debug!("parallel ranges: {:?}", parallel_ranges);
1✔
1202

1203
        let mut files = Vec::with_capacity(current_num_threads());
1✔
1204

1205
        for i in 0..parallel_ranges.len() {
4✔
1206
            files.push(format!("{}-{}.gfa", gfa_out, i));
4✔
1207
        } 
4✔
1208

1209
        let pb = ProgressBar::new(self.len() as u64);
1✔
1210
        pb.set_style(ProgressStyle::with_template(PROGRESS_STYLE).unwrap().progress_chars("#/-"));
1✔
1211
        pb.set_message(format!("{:<32}", "writing graph to GFA file"));
1✔
1212
        
1213
        
1214
        parallel_ranges.into_par_iter().enumerate().for_each(|(i, range)| {
4✔
1215
            let mut wtr = BufWriter::with_capacity(BUF, File::create(&files[i]).expect("error creating parallel gfa file"));
4✔
1216

1217
            for i in range {
103✔
1218
                let n = self.get_node(i);
103✔
1219
                self.node_to_gfa(&n, &mut wtr, tag_func).unwrap();
103✔
1220
                pb.inc(1);
103✔
1221
            }
103✔
1222

1223
            wtr.flush().unwrap();
4✔
1224
        });
4✔
1225

1226
        pb.finish_and_clear();
1✔
1227

1228
        // combine files
1229
        let mut out_file = BufWriter::with_capacity(BUF, File::create(format!("{}.gfa", gfa_out)).expect("error creating combined gfa file"));
1✔
1230
        writeln!(out_file, "H\tVN:Z:debruijn-rs")?;
1✔
1231

1232
        let pb = ProgressBar::new(files.len() as u64);
1✔
1233
        pb.set_style(ProgressStyle::with_template(PROGRESS_STYLE).unwrap().progress_chars("#/-"));
1✔
1234
        pb.set_message(format!("{:<32}", "combining files"));
1✔
1235

1236
        for file in files.iter() {
4✔
1237
            let open_file = File::open(file).expect("error opening parallel gfa file");
4✔
1238
            let mut reader = BufReader::new(open_file);
4✔
1239
            let mut buffer = [0; BUF];
4✔
1240

1241
            loop {
1242
                let linecount = reader.read(&mut buffer).unwrap();
8✔
1243
                if linecount == 0 { break }
8✔
1244
                out_file.write_all(&buffer[..linecount]).unwrap();
4✔
1245
            }
1246

1247
            remove_file(file).unwrap();
4✔
1248
        }
1249

1250
        out_file.flush().unwrap();
1✔
1251

1252
        Ok(())
1✔
1253
    }
1✔
1254

1255
    /// Write the graph to GFA format
1256
    pub fn to_gfa_partial<P: AsRef<Path>, F: Fn(&Node<'_, K, D>) -> String>(&self, gfa_out: P, tag_func: Option<&F>, nodes: Vec<usize>) -> Result<(), Error> {
1✔
1257
        let mut wtr = BufWriter::with_capacity(BUF, File::create(gfa_out).expect("error creating gfa file"));
1✔
1258
        writeln!(wtr, "H\tVN:Z:debruijn-rs")?;
1✔
1259

1260
        let pb = ProgressBar::new(self.len() as u64);
1✔
1261
        pb.set_style(ProgressStyle::with_template(PROGRESS_STYLE).unwrap().progress_chars("#/-"));
1✔
1262
        pb.set_message(format!("{:<32}", "writing graph to GFA file"));
1✔
1263

1264
        for i in nodes.into_iter().progress_with(pb) {
4✔
1265
            let n = self.get_node(i);
4✔
1266
            self.node_to_gfa(&n, &mut wtr, tag_func)?;
4✔
1267
        }
1268

1269
        wtr.flush().unwrap();
1✔
1270

1271
        Ok(())    
1✔
1272
    }
1✔
1273

1274
    fn node_to_tsv<W: Write, F>(&self, writer: &mut W, node_id: usize, data_format: F) -> Result<(), Box<dyn std::error::Error>> 
527✔
1275
    where 
527✔
1276
        F: Fn(&Node<'_, K, D>) -> String,
527✔
1277
    {
1278

1279
        let node = self.get_node(node_id);
527✔
1280
        let l_e = node.l_edges();
527✔
1281
        let r_e = node.r_edges();
527✔
1282

1283
        
1284
        if self.base.stranded {
527✔
1285
            // format: node id    l nb    r nb    seq    data
1286
            let l_nb = l_e.iter().map(|(_b, nb, _d, _f)| *nb).collect::<Vec<_>>();
208✔
1287
            let r_nb = r_e.iter().map(|(_b, nb, _d, _f)| *nb).collect::<Vec<_>>();
208✔
1288
            writeln!(writer, "{node_id}\t{:?}\t{:?}\t{}\t{}", l_nb, r_nb, node.sequence(), data_format(&node))?
208✔
1289
        } else {
1290
            // format: node id    l nb    l inc dir    r nb    r inc dir    seq    data
1291
            let l_nb = l_e.iter().map(|(_b, nb, _d, _f)| *nb).collect::<Vec<_>>();
319✔
1292
            let r_nb = r_e.iter().map(|(_b, nb, _d, _f)| *nb).collect::<Vec<_>>();
319✔
1293
            let l_nb_dirs = l_e.iter().map(|(_b, _nb, dir, _f)| *dir).collect::<Vec<_>>();
319✔
1294
            let r_nb_dirs = r_e.iter().map(|(_b, _nb, dir, _f)| *dir).collect::<Vec<_>>();
319✔
1295
            writeln!(writer, "{node_id}\t{:?}\t{:?}\t{:?}\t{:?}\t{}\t{}", l_nb, l_nb_dirs, r_nb, r_nb_dirs, node.sequence(), data_format(&node))?
319✔
1296
        }
1297

1298
        Ok(())
527✔
1299
    }
527✔
1300

1301
    /// save the graph as a tsv file with custom formatting for the node data
1302
    pub fn to_tsv<P, F>(&self, path: P, data_format: F) -> Result<(), Box<dyn std::error::Error>> 
2✔
1303
    where 
2✔
1304
        F: Fn(&Node<'_, K, D>) -> String,
2✔
1305
        P: AsRef<Path> + Display,
2✔
1306
    { 
1307
        let mut writer = BufWriter::new(File::create(path)?);
2✔
1308

1309
        // different format if stranded vs unstranded
1310
        if self.base.stranded {
2✔
1311
            writeln!(writer, "node id\tleft neighbors\tright neighbors\tsequence\tdata")?;
1✔
1312
        } else {
1313
            writeln!(writer, "node id\tleft neighbors\tleft nb incoming dirs\tright neighbors\tright nb incoming dirs\tsequence\tdata")?;
1✔
1314
        }
1315
            
1316
        for i in 0..self.len() {
527✔
1317
            self.node_to_tsv(&mut writer, i, &data_format)?
527✔
1318
        }
1319
        
1320
        Ok(())
2✔
1321
    }
2✔
1322

1323
    pub fn to_json_rest<W: Write, F: Fn(&D) -> Value>(
×
1324
        &self,
×
1325
        fmt_func: F,
×
1326
        mut writer: &mut W,
×
1327
        rest: Option<Value>,
×
1328
    ) {
×
1329
        writeln!(writer, "{{\n\"nodes\": [").unwrap();
×
1330
        for i in 0..self.len() {
×
1331
            let node = self.get_node(i);
×
1332
            node.to_json(&fmt_func, writer);
×
1333
            if i == self.len() - 1 {
×
1334
                writeln!(writer).unwrap();
×
1335
            } else {
×
1336
                writeln!(writer, ",").unwrap();
×
1337
            }
×
1338
        }
1339
        writeln!(writer, "],").unwrap();
×
1340

1341
        writeln!(writer, "\"links\": [").unwrap();
×
1342
        for i in 0..self.len() {
×
1343
            let node = self.get_node(i);
×
1344
            match node.edges_to_json(writer) {
×
1345
                true => {
1346
                    if i == self.len() - 1 {
×
1347
                        writeln!(writer).unwrap();
×
1348
                    } else {
×
1349
                        writeln!(writer, ",").unwrap();
×
1350
                    }
×
1351
                }
1352
                _ => continue,
×
1353
            }
1354
        }
1355
        writeln!(writer, "]").unwrap();
×
1356

1357
        match rest {
×
1358
            Some(Value::Object(v)) => {
×
1359
                for (k, v) in v.iter() {
×
1360
                    writeln!(writer, ",").expect("io error");
×
1361
                    write!(writer, "\"{}\": ", k).expect("io error");
×
1362
                    serde_json::to_writer(&mut writer, v).expect("io error");
×
1363
                    writeln!(writer).expect("io error");
×
1364
                }
×
1365
            }
1366
            _ => {
×
1367
                writeln!(writer).expect("io error");
×
1368
            }
×
1369
        }
1370

1371
        writeln!(writer, "}}").expect("io error");
×
1372
    }
×
1373

1374
    /// Write the graph to JSON
1375
    pub fn to_json<W: Write, F: Fn(&D) -> Value, RF: Fn(&mut W)>(
×
1376
        &self,
×
1377
        fmt_func: F,
×
1378
        writer: &mut W,
×
1379
    ) {
×
1380
        self.to_json_rest(fmt_func, writer, None);
×
1381
    }
×
1382

1383
    // iterate over graph or parial node IDs while leaving out the last node
1384
    fn iter_optional_partial<'a>(&self, partial_nodes: Option<&'a Vec<usize>>) -> Box<dyn Iterator<Item = usize> + 'a> {
4✔
1385
        if let Some(partial) = partial_nodes {
4✔
1386
            Box::new(partial[..(partial.len()-1)].iter().copied())
2✔
1387
        } else {
1388
            Box::new(0..(self.len()-1))
2✔
1389
        }
1390
    }
4✔
1391

1392
    /// write the graph or parts of the graph to a json file to view in 3d
1393
    pub fn to_json_3d<P, FN, FE>(&self, 
2✔
1394
        path: P, 
2✔
1395
        node_properties: &FN, 
2✔
1396
        edge_properties: &FE, 
2✔
1397
        partial_nodes: Option<&'_ Vec<usize>>
2✔
1398
    ) -> Result<(), Box<dyn std::error::Error>> 
2✔
1399
        where 
2✔
1400
        P: AsRef<Path>,
2✔
1401
        FN: Fn(&Node<K, D>) -> String,
2✔
1402
        FE: Fn(&Node<K, D>, usize, u8, Dir, bool) -> String,
2✔
1403
    {
1404
        let mut writer = BufWriter::new(File::create(path)?);
2✔
1405

1406
        writeln!(writer, "{{")?;
2✔
1407
        writeln!(writer, "\t\"nodes\": [")?;
2✔
1408

1409
        // write nodes to json
1410

1411
        for node_id in self.iter_optional_partial(partial_nodes) {
105✔
1412
            let node = self.get_node(node_id);
105✔
1413
            let node_fmt = node_properties(&node);
105✔
1414

1415
            writeln!(writer, "\t\t{{ {node_fmt} }},")?;
105✔
1416
        }
1417

1418
        // do last node separately because of comma
1419
        let last_node_id = match partial_nodes {
2✔
1420
            Some(partial) => *partial.last().expect("empty parial nodes vector"),
1✔
1421
            None => self.len() - 1
1✔
1422
        };
1423

1424
        let last_node = self.get_node(last_node_id);
2✔
1425
        let last_node_fmt = node_properties(&last_node);
2✔
1426

1427
        writeln!(writer, "\t\t{{ {last_node_fmt} }}")?;
2✔
1428

1429
        writeln!(writer, "\t],")?;
2✔
1430
        writeln!(writer, "\t\"links\": [")?;
2✔
1431

1432
        // write links to json
1433

1434
        for node_id in self.iter_optional_partial(partial_nodes) {
105✔
1435
            let node = self.get_node(node_id);
105✔
1436
            // write edges to the right
1437
            for (base, target_id, dir, flipped) in node.r_edges() {
105✔
1438
                let edge_fmt = edge_properties(&node, target_id, base, dir, flipped);
103✔
1439
                writeln!(writer, "\t\t{{ {edge_fmt} }},")?;
103✔
1440
            }
1441

1442
            // if stranded, continue, else also look at left edges
1443
            if self.base.stranded { continue; }
105✔
1444

1445
            // write edges to the right
1446
            for (base, target_id, dir, flipped) in node.l_edges() {
×
1447
                let edge_fmt = edge_properties(&node, target_id, base, dir, flipped);
×
1448
                writeln!(writer, "\t\t{{ {edge_fmt} }},")?;
×
1449
            }
1450
        }
1451

1452
        // eges for last node without comma
1453
        // FIXME only last edge should be without comma, not all edges from last node
1454
        // write edges to the right
1455
        for (base, target_id, dir, flipped) in last_node.r_edges() {
2✔
1456
            let edge_fmt = edge_properties(&last_node, target_id, base, dir, flipped);
2✔
1457
            writeln!(writer, "\t\t{{ {edge_fmt} }}")?;
2✔
1458
        }
1459

1460
        // if not stranded also look at left edges
1461
        if !self.base.stranded { 
2✔
1462
            // write edges to the right
1463
            for (base, target_id, dir, flipped) in last_node.l_edges() {
×
1464
                let edge_fmt = edge_properties(&last_node, target_id, base, dir, flipped);
×
1465
                writeln!(writer, "\t\t{{ {edge_fmt} }}")?;
×
1466
            }
1467
        }
2✔
1468

1469
        writeln!(writer, "\t]")?;
2✔
1470
        writeln!(writer, "}}")?;
2✔
1471

1472
        Ok(())
2✔
1473
    }
2✔
1474

1475
    /// Print a text representation of the graph.
1476
    pub fn print(&self) {
3✔
1477
        println!("DebruijnGraph {{ len: {}, K: {} }} :", self.len(), K::k());
3✔
1478
        for node in self.iter_nodes() {
138✔
1479
            println!("{:?}", node);
138✔
1480
        }
138✔
1481
    }
3✔
1482

1483
    pub fn print_with_data(&self) {
×
1484
        println!("DebruijnGraph {{ len: {}, K: {} }} :", self.len(), K::k());
×
1485
        for node in self.iter_nodes() {
×
1486
            println!("{:?} ({:?})", node, node.data());
×
1487
        }
×
1488
    }
×
1489

1490
    pub fn max_path_beam<F, F2>(&self, beam: usize, score: F, _solid_path: F2) -> Vec<(usize, Dir)>
×
1491
    where
×
1492
        F: Fn(&D) -> f32,
×
1493
        F2: Fn(&D) -> bool,
×
1494
    {
1495
        if self.is_empty() {
×
1496
            return Vec::default();
×
1497
        }
×
1498

1499
        let mut states = Vec::new();
×
1500

1501
        for i in 0..self.len() {
×
1502
            let node = self.get_node(i);
×
1503

1504
            // Initialize beam search on terminal nodes
1505
            if node.exts().num_exts_l() == 0 || node.exts().num_exts_r() == 0 {
×
1506
                let dir = if node.exts().num_exts_l() > 0 {
×
1507
                    Dir::Right
×
1508
                } else {
1509
                    Dir::Left
×
1510
                };
1511

1512
                let status = if node.exts().num_exts_l() == 0 && node.exts().num_exts_r() == 0 {
×
1513
                    Status::End
×
1514
                } else {
1515
                    Status::Active
×
1516
                };
1517

1518
                let mut path = SmallVec8::new();
×
1519
                path.push((i as u32, dir));
×
1520

1521
                let s = State {
×
1522
                    path,
×
1523
                    status,
×
1524
                    score: score(node.data()),
×
1525
                };
×
1526
                states.push(s);
×
1527
            }
×
1528
        }
1529

1530
        // No end nodes -- just start on the first node
1531
        if states.is_empty() {
×
1532
            // Make a start
×
1533
            let node = self.get_node(0);
×
1534
            let mut path = SmallVec8::new();
×
1535
            path.push((0, Dir::Left));
×
1536
            states.push(State {
×
1537
                path,
×
1538
                status: Status::Active,
×
1539
                score: score(node.data()),
×
1540
            });
×
1541
        }
×
1542

1543
        // Beam search until we can't find any more expansions
1544
        let mut active = true;
×
1545
        while active {
×
1546
            let mut new_states = Vec::with_capacity(states.len());
×
1547
            active = false;
×
1548

1549
            for s in states {
×
1550
                if s.status == Status::Active {
×
1551
                    active = true;
×
1552
                    let expanded = self.expand_state(&s, &score);
×
1553
                    new_states.extend(expanded);
×
1554
                } else {
×
1555
                    new_states.push(s)
×
1556
                }
1557
            }
1558

1559
            // workaround to sort by descending score - will panic if there are NaN scores
1560
            new_states.sort_by(|a, b| (-(a.score)).partial_cmp(&-(b.score)).unwrap());
×
1561
            new_states.truncate(beam);
×
1562
            states = new_states;
×
1563
        }
1564

1565
        for (i, state) in states.iter().take(5).enumerate() {
×
1566
            trace!("i:{}  -- {:?}", i, state);
×
1567
        }
1568

1569
        // convert back to using usize for node_id
1570
        states[0]
×
1571
            .path
×
1572
            .iter()
×
1573
            .map(|&(node, dir)| (node as usize, dir))
×
1574
            .collect()
×
1575
    }
×
1576

1577
    fn expand_state<F>(&self, state: &State, score: &F) -> SmallVec4<State>
×
1578
    where
×
1579
        F: Fn(&D) -> f32,
×
1580
    {
1581
        if state.status != Status::Active {
×
1582
            panic!("only attempt to expand active states")
×
1583
        }
×
1584

1585
        let (node_id, dir) = state.path[state.path.len() - 1];
×
1586
        let node = self.get_node(node_id as usize);
×
1587
        let mut new_states = SmallVec4::new();
×
1588

1589
        for (_, next_node_id, incoming_dir, _) in node.edges(dir.flip()) {
×
1590
            let next_node = self.get_node(next_node_id);
×
1591
            let new_score = state.score + score(next_node.data());
×
1592

1593
            let cycle = state
×
1594
                .path
×
1595
                .iter()
×
1596
                .any(|&(prev_node, _)| prev_node == (next_node_id as u32));
×
1597

1598
            let status = if cycle {
×
1599
                Status::Cycle
×
1600
            } else if next_node.edges(incoming_dir.flip()).is_empty() {
×
1601
                Status::End
×
1602
            } else {
1603
                Status::Active
×
1604
            };
1605

1606
            let mut new_path = state.path.clone();
×
1607
            new_path.push((next_node_id as u32, incoming_dir));
×
1608

1609
            let next_state = State {
×
1610
                path: new_path,
×
1611
                score: new_score,
×
1612
                status,
×
1613
            };
×
1614

1615
            new_states.push(next_state);
×
1616
        }
1617

1618
        new_states
×
1619
    }
×
1620

1621

1622
    pub fn iter_components(&'_ self) -> IterComponents<'_, K, D> {
7✔
1623
        let mut visited: Vec<bool> = Vec::with_capacity(self.len());
7✔
1624
        let pos = 0;
7✔
1625

1626
        for _i in 0..self.len() {
645✔
1627
            visited.push(false);
645✔
1628
        }
645✔
1629

1630
        IterComponents { 
7✔
1631
            graph: self, 
7✔
1632
            visited, 
7✔
1633
            pos }
7✔
1634
    }
7✔
1635

1636

1637
    /// iteratively returns 2D Vec with node_ids grouped according to the connected components they form
1638
    pub fn components_i(&self) -> Vec<Vec<usize>> {
1✔
1639
        let mut components: Vec<Vec<usize>> = Vec::with_capacity(self.len());
1✔
1640
        let mut visited: Vec<bool> = Vec::with_capacity(self.len());
1✔
1641

1642
        for _i in 0..self.len() {
134✔
1643
            visited.push(false);
134✔
1644
        }
134✔
1645

1646
        for i in 0..self.len() {
134✔
1647
            if !visited[i] {
134✔
1648
                let comp = self.component_i(&mut visited, i);
2✔
1649
                components.push(comp);
2✔
1650
            }
132✔
1651
        }
1652

1653
        components
1✔
1654
    }
1✔
1655

1656
    /// recursively detects which nodes form separate graph components
1657
    /// returns 2D vector with node ids per component
1658
    /// (may lead to stack overflow)
1659
    pub fn components_r(&self) -> Vec<Vec<usize>> {
2✔
1660
        let mut components: Vec<Vec<usize>> = Vec::with_capacity(self.len());
2✔
1661
        let mut visited: Vec<bool> = Vec::with_capacity(self.len());
2✔
1662

1663
        for _i in 0..self.len() {
136✔
1664
            visited.push(false);
136✔
1665
        }
136✔
1666

1667
        for i in 0..self.len() {
136✔
1668
            if !visited[i] {
136✔
1669
                let mut comp: Vec<usize> = Vec::new();
4✔
1670
                self.component_r(&mut visited, i, &mut comp);
4✔
1671
                components.push(comp);
4✔
1672
            }
132✔
1673
        }
1674

1675
        components
2✔
1676

1677
    }
2✔
1678

1679
    fn component_r<'a>(&'a self, visited: &'a mut Vec<bool>, i: usize, comp: &'a mut Vec<usize>) {
136✔
1680
        
1681
        visited[i] = true;
136✔
1682
        comp.push(i);
136✔
1683
        let mut edges = self.find_edges(i, Dir::Left);
136✔
1684
        let mut r_edges = self.find_edges(i, Dir::Right);
136✔
1685

1686
        edges.append(&mut r_edges);
136✔
1687

1688
        for (_, edge, _, _) in edges.iter() {
264✔
1689
            if !visited[*edge] {
264✔
1690
                self.component_r(visited, *edge, comp);
132✔
1691
            }
132✔
1692
        }
1693
    }
136✔
1694

1695
    fn component_i<'a>(&'a self, visited: &'a mut [bool], i: usize) -> Vec<usize> {
19✔
1696
        let mut edges: Vec<usize> = Vec::new();
19✔
1697
        let mut comp: Vec<usize> = Vec::new();
19✔
1698

1699
        edges.push(i);
19✔
1700

1701
        while let Some(current_edge) = edges.pop() {
799✔
1702
            if !visited[current_edge] { 
780✔
1703
                comp.push(current_edge);
779✔
1704
                visited[current_edge] = true;
779✔
1705

1706
                let mut l_edges = self.find_edges(current_edge, Dir::Left);
779✔
1707
                let mut r_edges = self.find_edges(current_edge, Dir::Right);
779✔
1708

1709
                l_edges.append(&mut r_edges);
779✔
1710

1711
                for (_, new_edge, _, _) in l_edges.into_iter() {
1,522✔
1712
                    if !visited[new_edge] {
1,522✔
1713
                        edges.push(new_edge);
761✔
1714
                    }
761✔
1715
                }
1716
            }
1✔
1717
        }
1718
        comp
19✔
1719
    }
19✔
1720

1721
    /// iterate over all edges of the graph, item: (node, ext base, ext dir, target node)
1722
    pub fn iter_edges(&self) -> EdgeIter<'_, K, D> {
41✔
1723
        EdgeIter::new(self)
41✔
1724
    }
41✔
1725

1726
    pub fn find_bad_nodes<F: Fn(&Node<'_, K, D>) -> bool>(&self, valid: F) -> Vec<usize> {
×
1727
        let mut bad_nodes = Vec::new();
×
1728

1729
        for (i, node) in enumerate(self.iter_nodes()) {
×
1730
            if !valid(&node) { bad_nodes.push(i); }
×
1731
        }
1732
        
1733
        bad_nodes
×
1734
    }
×
1735
}
1736

1737
impl<K: Kmer, SD: Debug> DebruijnGraph<K, SD> {
1738
    pub fn create_colors<'a, 'b: 'a, DI>(&'a self, config: &SummaryConfig, color_mode: ColorMode<'b>) -> Colors<'b, SD, DI> 
×
1739
    where 
×
1740
    SD: SummaryData<DI>,
×
1741
    {
1742
        Colors::new(self, config, color_mode)
×
1743
    }
×
1744
    
1745
    /// [`crate::EdgeMult`] and [] will contain hanging edges if the nodes were filtered
1746
    pub fn fix_edge_data<DI>(&mut self) 
1✔
1747
    where 
1✔
1748
        SD: SummaryData<DI>
1✔
1749
    {
1750
        if self.get_node(0).data().edge_mults().is_some() {
1✔
1751
            for i in 0..self.len() {
134✔
1752
                self.base.data[i].fix_edge_data(self.base.exts[i]);
134✔
1753
            }
134✔
1754
        }
×
1755
    }
1✔
1756

1757
    /// if there are [`crate::EdgeMult`]s in the data, prune the graph by removing edges that have a low coverage
1758
    pub fn filter_edges<DI>(&mut self, min: u32) -> Result<(), String>
1✔
1759
    where 
1✔
1760
        SD: SummaryData<DI>
1✔
1761
    {
1762
        // return if there is no edge coverage available
1763
        if self.get_node(0).data().edge_mults().is_none() { return Err(String::from("no edge mults available")) };
1✔
1764

1765
        for i in 0..self.len() {
134✔
1766
            let em = self.get_node(i).data().edge_mults().expect("shold have em").clone();
134✔
1767
            let edges = [(Dir::Left, 0), (Dir::Left, 1), (Dir::Left, 2), (Dir::Left, 3), 
134✔
1768
                (Dir::Right, 0), (Dir::Right, 1), (Dir::Right, 2), (Dir::Right, 3)];
134✔
1769
            
1770
            for (dir, base) in edges {
1,072✔
1771
                if min > em.edge_mult(base, dir) {
1,072✔
1772
                    // remove invalid ext from node
808✔
1773
                    let ext = self.base.exts[i].remove(dir, base);
808✔
1774
                    self.base.exts[i] = ext;
808✔
1775
                }
808✔
1776
            }
1777
        }
1778

1779
        Ok(())
1✔
1780
    }
1✔
1781

1782
    /// if a node has a connection to a high quality node and low quality nodes 
1783
    /// in the same direction, remove the connections to the low quality ndoes
1784
    pub fn remove_lq_splits<DI>(&mut self, min_quality: BaseQuality) -> Result<(), String>
4✔
1785
    where 
4✔
1786
        SD: SummaryData<DI>
4✔
1787
    {
1788
        // check we do indeed have quality and graph is stranded
1789
        if self.get_node(0).data().quality().is_none() { return Err(String::from("no quality scores available")); }
4✔
1790

1791
        // iterate over all nodes and look in both directions if there is a low quality node splitting off
1792
        for (node_id, out_dir) in (0..self.len()).flat_map(|id| [(id, Dir::Right), (id, Dir::Left)]) {
1,164✔
1793
            let out_edges = self.get_node(node_id).edges(out_dir);
1,164✔
1794

1795
            // check for split
1796
            if out_edges.len() < 2 {
1,164✔
1797
                // no split, move on
1798
                continue;
1,112✔
1799
            }
52✔
1800

1801
            // get qualities 
1802
            let nb_qualities = out_edges
52✔
1803
                .iter()
52✔
1804
                .map(|(_, nb_id, nb_in_dir, _)| (*nb_id, *nb_in_dir, self
104✔
1805
                    .get_node(*nb_id)
104✔
1806
                    .data()
104✔
1807
                    .quality()
104✔
1808
                    .unwrap()
104✔
1809
                )).collect::<Vec<_>>();
52✔
1810

1811
            // check for good neighbor
1812
            let has_good_nb = nb_qualities.iter()
52✔
1813
                .any(|(_, _, quality)| *quality >= min_quality);
83✔
1814

1815
            if !has_good_nb {
52✔
1816
                // split but no good path, move on
1817
                continue;
8✔
1818
            }
44✔
1819

1820
            // remove split with low quality
1821
            for (nb_id, nb_in_dir, _) in nb_qualities.iter().filter(|(_, _, quality)| *quality < min_quality ) {
88✔
1822
                let path = vec![(node_id, out_dir.flip()), (*nb_id, *nb_in_dir)];
44✔
1823
                if self.remove_path(path).is_err() {
44✔
UNCOV
1824
                    warn!("lq tip path could not be removed")
×
1825
                }
44✔
1826
            }
1827
        }
1828

1829
        Ok(())
4✔
1830
    }
4✔
1831

1832
    /// remove bubbles/ladders and tips in which one path has a quality lower than the given `min_quality`. 
1833
    /// The method continues searching on a path for a maximum of (`max_path_fac`` * k - 1).
1834
    pub fn remove_lq_paths<DI>(&mut self, min_quality: BaseQuality, max_path_fac: usize) -> Result<(), String>
4✔
1835
    where
4✔
1836
        SD: SummaryData<DI>
4✔
1837
    {
1838
        // check we do indeed have quality and graph is stranded
1839
        if self.get_node(0).data().quality().is_none() { return Err(String::from("no quality scores available")); }
4✔
1840

1841
        let min_path = 2 * K::k() - 1;
4✔
1842
        let max_path = max_path_fac * K::k() - 1;
4✔
1843

1844
        // iterate over nodes
1845
        for (node_id, out_dir) in (0..self.len()).flat_map(|id| [(id, Dir::Right), (id, Dir::Left)]) {
1,164✔
1846
            // check if node has multile outs, at least one with bad quality and one with good quality
1847
            let node_out_edges = self.get_node(node_id).edges(out_dir);
1,164✔
1848

1849
            let good_neighbors = node_out_edges.iter()
1,164✔
1850
                .map(|(_, target_id, target_in_dir, _)| (*target_id, target_in_dir))
1,164✔
1851
                .filter(|(target_id, _)| self.get_node(*target_id)
1,164✔
1852
                    .data()
953✔
1853
                    .quality()
953✔
1854
                    .unwrap() >= min_quality
953✔
1855
                ).collect::<Vec<_>>();
1,164✔
1856

1857
            let bad_neighbors = node_out_edges.iter()
1,164✔
1858
                .map(|(_, target_id, target_in_dir, _)| (*target_id, target_in_dir))
1,164✔
1859
                .filter(|(target_id, _)| self.get_node(*target_id)
1,164✔
1860
                    .data()
953✔
1861
                    .quality()
953✔
1862
                    .unwrap() < min_quality
953✔
1863
                ).collect::<Vec<_>>();
1,164✔
1864

1865
            if good_neighbors.is_empty() | bad_neighbors.is_empty() { continue; }
1,164✔
1866

1867
            // follow the bad quality paths until we reach nodes with high quality again (or max search radius)
1868
            let mut possible_paths = Vec::new();
27✔
1869
            let mut tips = Vec::new();
27✔
1870

1871
            for (bn, bn_in_dir) in bad_neighbors {
27✔
1872
                let mut current_in_dir = *bn_in_dir; // in an unstranded graph, the out dir can change
27✔
1873
                let mut current_node_id = bn;
27✔
1874
                let mut path_groups = vec![vec![(node_id, out_dir.flip())]];
27✔
1875
                let mut path_length = K::k() - 1;
27✔
1876

1877
                let mut state = LadderState::Singular;
27✔
1878
                let mut q_state_high = false;
27✔
1879

1880
                loop {
1881
                    let current_node = self.get_node(current_node_id);
323✔
1882
                    let out_edges = current_node.edges(current_in_dir.flip());
323✔
1883

1884
                    // add current node length to path
1885
                    path_length += current_node.len() - K::k() + 1;
323✔
1886

1887
                    // check if path has reached max length -> interrupt
1888
                    if path_length > max_path {
323✔
1889
                        break;
×
1890
                    }
323✔
1891
                    
1892
                    // check state and add current node to path
1893
                    let path_index = path_groups.len() - 1;
323✔
1894

1895
                    // if in singular state now or before, add node to path
1896
                    if matches!(state, LadderState::Singular) { 
323✔
1897
                        path_groups[path_index].push((current_node_id, current_in_dir));
278✔
1898
                    }
278✔
1899

1900
                    // check if we increase ladder state
1901
                    let q_increase = (current_node.data().quality().unwrap() >= min_quality) & !q_state_high;
323✔
1902
                    let mult_increase = current_node.edges(current_in_dir).len() > 1;
323✔
1903
                    
1904
                    if q_increase {
323✔
1905
                        q_state_high = true
28✔
1906
                    }
295✔
1907

1908
                    if q_increase | mult_increase {
323✔
1909
                        match state {
32✔
1910
                            LadderState::Singular => {
32✔
1911
                                state = LadderState::Double;
32✔
1912
                            }
32✔
1913
                            LadderState::Double => () // ignore
×
1914
                        };
1915
                    }
291✔
1916

1917
                    // check if we decrease ladder state
1918
                    let q_decrease = (current_node.data().quality().unwrap() < min_quality) & q_state_high;
323✔
1919
                    let mult_decrease = out_edges.len() > 1;
323✔
1920

1921
                    if q_decrease {
323✔
1922
                        q_state_high = false;
9✔
1923
                    }
314✔
1924

1925
                    if q_decrease | mult_decrease {
323✔
1926
                        match state {
24✔
1927
                            LadderState::Singular => (), // ignore
9✔
1928
                            LadderState::Double => {
15✔
1929
                                state = LadderState::Singular;
15✔
1930
                                path_groups.push(vec![(current_node_id, current_in_dir)]); // start new path group
15✔
1931
                            }
15✔
1932
                        }
1933
                    }
299✔
1934

1935
                    // check if we have met end criterium -> save path
1936
                    let quality_req =  current_node.data().quality().unwrap() >= min_quality;
323✔
1937
                    let len_req = path_length >= min_path;
323✔
1938
                    // path is a simple tip -> save as tip (does not have to meet length requirement)
1939
                    let is_tip = out_edges.is_empty() & (path_groups.len() == 1); // TODO maybe remove req 2 in future
323✔
1940

1941
                    if quality_req & len_req {
323✔
1942
                        possible_paths.push(path_groups);
19✔
1943
                        break;
19✔
1944
                    } else if is_tip {
304✔
1945
                        tips.push(path_groups);
4✔
1946
                        break;
4✔
1947
                    }
300✔
1948

1949
                    // we have not met the conditions and keep moving
1950

1951
                    // find next node
1952
                    let (next_node_id, next_in_dir)  = if out_edges.len() == 1 {
300✔
1953
                        let (_, next_node_id, next_in_dir, _) = out_edges[0];
283✔
1954
                        (next_node_id, next_in_dir)
283✔
1955
                    } else if out_edges.is_empty() {
17✔
1956
                        // dead end which has not qualified as tip
1957
                        break;
4✔
1958
                    } else {
1959
                        // choose the lowest quality path
1960

1961
                        let worst_neighbor = out_edges.iter()
13✔
1962
                            .map(|(_, target_id, target_in_dir, _)| (*target_id, *target_in_dir, self.get_node(*target_id)
26✔
1963
                                .data()
26✔
1964
                                .quality()
26✔
1965
                                .unwrap())
26✔
1966
                            )
1967
                            .min_by(|(_, _, q_a), (_, _, q_b)| q_a.cmp(q_b));
13✔
1968

1969
                        if let Some((worst_nb_id, worst_nb_inc_dir, _)) = worst_neighbor {
13✔
1970
                            (worst_nb_id, worst_nb_inc_dir)
13✔
1971
                        } else {
1972
                            break;
×
1973
                        }
1974
                    };
1975

1976
                    current_node_id = next_node_id;
296✔
1977
                    current_in_dir = next_in_dir;
296✔
1978
                }
1979
            }
1980

1981
            let possible_targets = possible_paths.iter().map(|p| p.last().unwrap().last().unwrap().0).collect::<Vec<_>>();
27✔
1982
            let mut confirmed_targets = Vec::new();
27✔
1983
            // follow the good quality paths until we reach a possible target node (or max search radius)
1984
            // if we reached a target node, send bad path to be removed from graph
1985
            for (gn, gn_in_dir) in good_neighbors {
27✔
1986
                let mut path_length = K::k() - 1;
27✔
1987
                let mut current_node_id = gn;
27✔
1988
                let mut current_in_dir = *gn_in_dir;
27✔
1989

1990
                loop {
1991
                    // check if we have exceeded the search radius
1992
                    if path_length > max_path {
310✔
1993
                        break;
1✔
1994
                    }
309✔
1995

1996
                    // add current node length to path length
1997
                    let current_node = self.get_node(current_node_id);
309✔
1998
                    path_length += current_node.len() - K::k() + 1;
309✔
1999

2000
                    // check if we have found a target
2001
                    if possible_targets.contains(&current_node_id) {
309✔
2002
                        confirmed_targets.push(current_node_id);
16✔
2003
                        break;
16✔
2004
                    }
293✔
2005

2006
                    // look for next node
2007
                    let out_edges = current_node.edges(current_in_dir.flip());
293✔
2008

2009
                    let (next_node_id, next_in_dir) = if out_edges.len() == 1 {
293✔
2010
                        let (_, next_node_id, next_in_dir, _) = out_edges[0];
283✔
2011
                        (next_node_id, next_in_dir)
283✔
2012
                    } else if out_edges.is_empty() {
10✔
2013
                        // dead end, break
2014
                        break;
10✔
2015
                    } else {
2016
                        // use node with highest quality
2017
                        // TODO future: use read mapping?
NEW
2018
                        let good_neighbor = out_edges.iter()
×
NEW
2019
                            .map(|(_, target_id, target_in_dir, _)| (*target_id, target_in_dir, self.get_node(*target_id)
×
2020
                                .data()
×
2021
                                .quality()
×
2022
                                .unwrap()))
×
NEW
2023
                            .max_by(|(_, _, q_a), (_, _, q_b)| q_a.cmp(q_b));
×
NEW
2024
                        if let Some((best_nb_id, best_nb_in_dir, _)) = good_neighbor {
×
NEW
2025
                            (best_nb_id, *best_nb_in_dir)
×
2026
                        } else {
2027
                            break;
×
2028
                        }
2029
                    };
2030

2031
                    current_node_id = next_node_id;
283✔
2032
                    current_in_dir = next_in_dir;
283✔
2033
                }
2034
            }
2035

2036
            // check if we have found end nodes of possible paths by following good quality paths
2037
            // if so, remove path
2038
            for path_group in possible_paths {
27✔
2039
                let target = path_group.last().unwrap().last().unwrap().0;
19✔
2040

2041
                if confirmed_targets.contains(&target) {
19✔
2042
                    for path in path_group {
26✔
2043
                        if let Err(_err) = self.remove_path(path.clone()) {
26✔
2044
                            warn!("lq ladder partial path could not be removed, likely cause: loop, edges were already removed. parital path: {:?}", path)
×
2045
                        }
26✔
2046
                    }
2047
                }
3✔
2048
            }
2049

2050
            // remove tip paths
2051
            for path_group in tips {
27✔
2052
                let path = path_group.into_iter().next().expect("empty tip path found");
4✔
2053
                if let Err(_err) = self.remove_path(path.clone()) {
4✔
2054
                    warn!("lq tip partial path could not be removed, likely cause: loop, edges were already removed. parital path: {:?}", path)
×
2055
                }
4✔
2056
            }
2057

2058
        }
2059

2060
        Ok(())
4✔
2061
    }
4✔
2062

2063
    /// remove bubbles/ladders and tips in which one path has a lower coverage than the alternative path. 
2064
    /// The method continues searching on a path for a maximum of (`max_path_fac`` * k - 1).
2065
    /// The path is only removed if the average coverage of the lower path is below `max_avg_low_cov` and the 
2066
    /// average coverage is at least `min_diff_factor` times higher.
2067
    pub fn remove_lc_paths<DI>(&mut self, max_path_fac: usize, min_diff_factor: u32, max_avg_low_cov: f32) -> Result<(), String>
4✔
2068
    where
4✔
2069
        SD: SummaryData<DI>
4✔
2070
    {
2071
        // state is switched if coverage rises by 20% + 2 (so it's at least 2 more)
2072
        // nodes with higher state are kept connected together
2073
        // tests have shown that increase in coverage does not necessarily indicate 
2074
        // a multiplicity change
2075
        // however, we are trying to avoid false positives - false negatives will 
2076
        // most likely be cut off from the component or would be removed by remove_tips
2077
        // as a next step
2078
        // increasing these values will increase the number of removed edges
2079
        const COV_STATE_FACTOR: f32 = 0.2; 
2080
        const COV_STATE_ADD: f32 = 2.;
2081

2082
        // check we do indeed have the edge coverage
2083
        if self.get_node(0).data().edge_mults().is_none() { return Err(String::from("no quality scores available")); }
4✔
2084

2085
        let min_path = 2 * K::k() - 1;
4✔
2086
        let max_path = max_path_fac * K::k() - 1;
4✔
2087

2088
        // iterate over nodes
2089
        for (node_id, start_out_dir) in (0..self.len()).flat_map(|id| [(id, Dir::Right), (id, Dir::Left)]) {
572✔
2090

2091
            let node = self.get_node(node_id);
572✔
2092

2093
            // check if node has multile outs, at least one with bad quality and one with good quality
2094
            let node_out_coverages = node.data().edge_mults().expect("should have em").single_dir(start_out_dir).edge_mults;
572✔
2095

2096
            let Some((out_max_base, &out_max_cov)) = node_out_coverages.iter().rev().enumerate().filter(|&(_, &c)| c  > 0).max_by(|&(_b1, &c1), &(_b2, c2)| c1.cmp(c2)) else { continue };
2,288✔
2097
            let smaller_outs = node_out_coverages.iter().copied().rev().enumerate().filter(|&(_b, c)| (c > 0) & (out_max_cov > c)).collect::<Vec<_>>();
1,924✔
2098

2099
            if smaller_outs.is_empty() { continue; }
481✔
2100

2101
            // follow the bad quality paths until we reach nodes with high quality again (or max search radius)
2102
            let mut possible_paths = Vec::new();
8✔
2103
            let mut tips = Vec::new();
8✔
2104

2105
            let node_term_kmer = node.sequence().term_kmer::<K>(start_out_dir);
8✔
2106

2107
            for (start_out_base, start_out_cov) in smaller_outs {
8✔
2108
                // get nb id
2109
                let next_kmer = node_term_kmer.extend(start_out_base as u8, start_out_dir);
8✔
2110
                let (mut current_node_id, mut current_in_dir, _) = self.find_link(next_kmer, start_out_dir).expect("link should exist"); 
8✔
2111

2112
                let mut current_cov = start_out_cov as f32;
8✔
2113

2114
                let mut path_groups = vec![vec![(node_id, start_out_dir.flip())]];
8✔
2115
                let mut path_length = K::k() - 1;
8✔
2116

2117
                let mut state = LadderState::Singular;
8✔
2118
                let mut c_state_high = false;
8✔
2119

2120
                let mut cov_sum = start_out_cov;
8✔
2121
                let mut cov_count = 1;
8✔
2122

2123
                loop {
2124
                    let current_node = self.get_node(current_node_id);
86✔
2125
                    let out_edges = current_node.edges(current_in_dir.flip());
86✔
2126

2127
                    // add current node length to path
2128
                    path_length += current_node.len() - K::k() + 1;
86✔
2129

2130
                    // check if path has reached max length -> interrupt
2131
                    if path_length > max_path {
86✔
NEW
2132
                        break;
×
2133
                    }
86✔
2134
                    
2135
                    // check state and add current node to path
2136
                    let path_index = path_groups.len() - 1;
86✔
2137

2138
                    // if in singular state now or before, add node to path
2139
                    if matches!(state, LadderState::Singular) { 
86✔
2140
                        path_groups[path_index].push((current_node_id, current_in_dir));
72✔
2141
                    }
72✔
2142

2143
                    // choose next edge by choosing coverage closest to current coverage
2144

2145
                    // get coverage
2146
                    // edge cov must be available
2147
                    let edge_coverages = current_node.data().edge_mults().expect("must have edge mults");
86✔
2148
                    let out_edge_coverages = edge_coverages.single_dir(current_in_dir.flip());
86✔
2149

2150
                    let mut closest_out_cov = None;
86✔
2151
                    let mut next_node_id = None;
86✔
2152
                    let mut next_in_dir = None;
86✔
2153
                    let mut cov_diff = i32::MAX;
86✔
2154
                    for (base, id, in_dir, _) in out_edges.iter() {
96✔
2155
                        let cov = out_edge_coverages.edge_mult(*base) as i32;
96✔
2156
                        let new_cov_diff = (current_cov as i32 - cov).abs();
96✔
2157
                        if new_cov_diff < cov_diff {
96✔
2158
                            (closest_out_cov, next_node_id, next_in_dir, cov_diff) = (Some(cov as f32), Some(*id), Some(*in_dir), new_cov_diff);
92✔
2159
                        }
92✔
2160
                    }
2161

2162
                    // check if coverage increases on high path, similar to c_increase check but with highest outgoing coverage and start cov
2163
                    // TODO check if better with min_diff_factor
2164
                    // TODO check if we should replace consts with min diff factor
2165
                    let highest_cov = out_edge_coverages.edge_mults.iter().max().unwrap_or(&0);
86✔
2166
                    let coverage_req =  (*highest_cov as f32 > out_max_cov as f32 - out_max_cov as f32 * COV_STATE_FACTOR + COV_STATE_ADD) & !c_state_high; // higer coverage than last edge
86✔
2167

2168
                    // check if we have met end criterium -> save path
2169
                    let len_req = path_length >= min_path; // path long enough
86✔
2170
                    // path is a simple tip -> save as tip
2171
                    let is_tip = out_edges.is_empty() & (path_groups.len() == 1); // TODO maybe remove req 2 in future
86✔
2172

2173
                    if coverage_req & len_req {
86✔
2174
                        possible_paths.push((path_groups, cov_sum as f32 / cov_count as f32));
7✔
2175
                        break;
7✔
2176
                    } else if is_tip {
79✔
2177
                        tips.push((path_groups, cov_sum as f32 / cov_count as f32));
1✔
2178
                        break;
1✔
2179
                    }
78✔
2180

2181
                    // check if we have a next node: 
2182
                    let (Some(closest_out_cov), Some(next_node_id), Some(next_in_dir)) = (closest_out_cov, next_node_id, next_in_dir) else { 
78✔
NEW
2183
                        break; 
×
2184
                    };
2185

2186
                    // check if we increase ladder state
2187
                    let c_increase = (closest_out_cov > current_cov + current_cov * COV_STATE_FACTOR + COV_STATE_ADD) & !c_state_high;
78✔
2188
                    let mult_increase = current_node.edges(current_in_dir).len() > 1;
78✔
2189
                    
2190
                    if c_increase {
78✔
2191
                        c_state_high = true
6✔
2192
                    }
72✔
2193

2194
                    if c_increase | mult_increase {
78✔
2195
                        match state {
10✔
2196
                            LadderState::Singular => {
10✔
2197
                                state = LadderState::Double;
10✔
2198
                            }
10✔
NEW
2199
                            LadderState::Double => () // ignore
×
2200
                        };
2201
                    }
68✔
2202

2203
                    // check if we decrease ladder state
2204
                    // do not check if coverage was already high bc state could be increased for just one node by inc edge
2205
                    let c_decrease = closest_out_cov < current_cov + current_cov * COV_STATE_FACTOR + COV_STATE_ADD; //& c_state_high;
78✔
2206
                    let mult_decrease = out_edges.len() > 1;
78✔
2207

2208
                    if c_decrease {
78✔
2209
                        c_state_high = false;
64✔
2210
                    }
64✔
2211

2212
                    if c_decrease | mult_decrease {
78✔
2213
                        match state {
64✔
2214
                            LadderState::Singular => (), // ignore
54✔
2215
                            LadderState::Double => {
10✔
2216
                                state = LadderState::Singular;
10✔
2217
                                path_groups.push(vec![(current_node_id, current_in_dir)]); // start new path group
10✔
2218
                            }
10✔
2219
                        }
2220
                    }
14✔
2221

2222
                    // we have not met the conditions and keep moving
2223
                    current_node_id = next_node_id;
78✔
2224
                    current_in_dir = next_in_dir;
78✔
2225

2226
                    // if the ladder state is singular, use coverage for avg cov
2227
                    if matches!(state, LadderState::Singular) {
78✔
2228
                        current_cov = closest_out_cov;
64✔
2229
                        cov_sum += current_cov as u32;
64✔
2230
                        cov_count += 1;
64✔
2231
                    }
64✔
2232
                }
2233
            }
2234

2235
            let possible_targets = possible_paths.iter().map(|(p, _c)| p.last().unwrap().last().unwrap().0).collect::<Vec<_>>();
8✔
2236
            let mut confirmed_targets = Vec::new();
8✔
2237

2238
            // follow the high coverage until we reach a possible target node (or max search radius)
2239
            // if we reached a target node, send bad path to be removed from graph
2240
            
2241
            let next_kmer = node_term_kmer.extend(out_max_base as u8, start_out_dir);
8✔
2242
            let (mut current_node_id, mut current_in_dir, _) = self.find_link(next_kmer, start_out_dir).expect("link should exist"); 
8✔
2243

2244
            let mut path_length = K::k() - 1;
8✔
2245
            let mut current_cov = out_max_cov;
8✔
2246

2247
            // track coverage 
2248
            let mut cov_sum = current_cov;
8✔
2249
            let mut cov_count = 1;
8✔
2250

2251
            loop {
2252
                // check if we have exceeded the search radius
2253
                if path_length > max_path {
145✔
NEW
2254
                    break;
×
2255
                }
145✔
2256

2257
                // add current node length to path length
2258
                let current_node = self.get_node(current_node_id);
145✔
2259
                path_length += current_node.len() - K::k() + 1;
145✔
2260

2261
                // check if we have found a target
2262
                if possible_targets.contains(&current_node_id) {
145✔
2263
                    confirmed_targets.push(current_node_id);
7✔
2264
                    // keep going in case we find another one
7✔
2265
                }
138✔
2266

2267
                // look for next node
2268
                let out_coverages = current_node.data().edge_mults().expect("must have edge mults").single_dir(current_in_dir.flip()).edge_mults;
145✔
2269

2270
                // use node with highest coverage
2271
                // find highest coverage
2272
                let current_term_kmer = current_node.sequence().term_kmer::<K>(current_in_dir.flip());
145✔
2273
                let Some((next_out_base, next_out_cov)) = out_coverages.iter().rev().enumerate().filter(|&(_, &c)| c > 0).max_by(|&(_b1, &c1), &(_b2, c2)| c1.cmp(c2)) else { break };
580✔
2274
                let next_kmer = current_term_kmer.extend(next_out_base as u8, current_in_dir.flip());
137✔
2275
                let (next_node_id, next_in_dir, _) = self.find_link(next_kmer, current_in_dir.flip()).expect("link should exist"); 
137✔
2276

2277

2278
                current_node_id = next_node_id;
137✔
2279
                current_in_dir = next_in_dir;
137✔
2280
                current_cov = *next_out_cov;
137✔
2281
                cov_sum += current_cov;
137✔
2282
                cov_count += 1;
137✔
2283

2284
            }
2285

2286
            let avg_cov_high_path = (cov_sum as f32 / cov_count as f32) as u32;
8✔
2287

2288
            // check if we have found end nodes of possible paths by following high coverage paths
2289
            // if so, remove path
2290
            for (path_group, avg_low_cov) in possible_paths {
8✔
2291
                let target = path_group.last().unwrap().last().unwrap().0;
7✔
2292

2293
                let cov_valid = (avg_low_cov.round() as u32 * min_diff_factor <= avg_cov_high_path) & (avg_low_cov <= max_avg_low_cov);
7✔
2294

2295
                if confirmed_targets.contains(&target) & cov_valid {
7✔
2296
                    for path in path_group {
17✔
2297
                        if let Err(_err) = self.remove_path(path.clone()) {
17✔
NEW
2298
                            warn!("lq ladder partial path could not be removed, likely cause: loop, edges were already removed. parital path: {:?}", path)
×
2299
                        }
17✔
2300
                    }
NEW
2301
                }
×
2302
            }
2303

2304
            // remove tip paths
2305
            for (path_group, avg_tip_cov) in tips {
8✔
2306
                let path = path_group.into_iter().next().expect("empty tip path found");
1✔
2307
                let cov_valid = (avg_tip_cov.round() as u32 * min_diff_factor <= avg_cov_high_path) & (avg_tip_cov <= max_avg_low_cov);
1✔
2308

2309
                if cov_valid {
1✔
2310
                    if let Err(_err) = self.remove_path(path.clone()) {
1✔
NEW
2311
                        warn!("lq tip partial path could not be removed, likely cause: loop, edges were already removed. parital path: {:?}", path)
×
2312
                    }
1✔
NEW
2313
                }
×
2314
                
2315
            }
2316

2317
        }
2318

2319
        Ok(())
4✔
2320
    }
4✔
2321

2322
    /// remove simple ladder structures (bubbles) caused by 1-base sequencing errors from the graph
2323
    /// 
2324
    /// graph must contain edge mults and be stranded
2325
    /// this function will likely leave tips on the graph, so it is recommended to run
2326
    /// [`DebruijnGraph::remove_tips`] afterwards
2327
    /// 
2328
    /// ladder structure refers to bubbles where one side has been compressed into one node
2329
    /// but the other has low compression, due to differences in coverage and thus data variance,
2330
    /// leading to a ladder-like appearance
2331
    pub fn remove_ladders<DI, P>(&mut self, min_diff_factor: u32, max_avg_low_cov: f32, out_path: Option<P>) -> Result<(), String> 
4✔
2332
    where 
4✔
2333
        SD: SummaryData<DI>,
4✔
2334
        P: AsRef<Path>
4✔
2335
    {
2336
        // interrupt if we dont have edge mults
2337
        if self.get_node(0).data().edge_mults().is_none() { return Err(String::from("no edge mults available")) };
4✔
2338

2339
        let mut writer = out_path.map(|path| BufWriter::new(File::create(path).expect("error creating ladder stats file")));
4✔
2340
        if let Some(wtr) = writer.as_mut() {
4✔
2341
            writeln!(wtr, "avg high cov,avg low cov,high truth ratio,low truth ratio").unwrap();
4✔
2342
        }
4✔
2343

2344
        // iterate over nodes, check in both directions
2345
        for (node_id, out_dir) in (0..self.len()).flat_map(|id| [(id, Dir::Right), (id, Dir::Left)]) {
572✔
2346
            // check if node has outgoing edge(s) with both high and low coverage
2347
            let outs = self.get_node(node_id).data().edge_mults().expect("should have em").single_dir(out_dir).edge_mults;
572✔
2348

2349
            let Some((out_max_base, &out_max_cov)) = outs.iter().rev().enumerate().filter(|&(_, &c)| c  > 0).max_by(|&(_b1, &c1), &(_b2, c2)| c1.cmp(c2)) else { continue };
2,288✔
2350
            let smaller_outs = outs.iter().copied().rev().enumerate().filter(|&(_b, c)| (c > 0) & (out_max_cov > c)).collect::<Vec<_>>();
2,044✔
2351

2352
            // TODO add factor back in, remove writer
2353

2354
            if smaller_outs.is_empty() { continue; }
511✔
2355

2356
            // follow path with highest coverage until target length is reached
2357
            let Some((target_node, avg_high_cov, high_cc)) = self.follow_ladder_path_high(node_id, out_max_base as u8, out_max_cov, out_dir) else { continue; };
12✔
2358
            
2359
            // check all small outs
2360
            for (s_base, s_cov) in smaller_outs {
10✔
2361
                let Some((target_paths, avg_low_cov, low_cc)) = self.follow_ladder_path_low(node_id, s_base as u8, s_cov, out_dir) else { continue; };
10✔
2362
                let other_target_node = target_paths.last().expect("should have at least one element").last().expect("should have at least two elements").0;
8✔
2363

2364
                if target_node == other_target_node {
8✔
2365
                    // the two paths landed on the same node -> remove all edges in the low coverage path
2366
                    if (s_cov * min_diff_factor <= out_max_cov) & (avg_low_cov <= max_avg_low_cov) {
4✔
2367
                        for path in target_paths.iter() {
14✔
2368
                            if self.remove_path(path.clone()).is_err() {
14✔
2369
                                warn!("removing ladders: partial path could not be removed, likely cause: loop, edges were already removed. parital path: {:?}", path)
×
2370
                            }
14✔
2371
                        } 
2372
                    }
×
2373
                    
2374
                    if let Some(wtr) = writer.as_mut() {
4✔
2375
                        writeln!(wtr, "{},{},{},{}", avg_high_cov, avg_low_cov, high_cc, low_cc).unwrap();
4✔
2376
                    }
4✔
2377
                }
4✔
2378
            }
2379
        }
2380

2381
        Ok(())
4✔
2382
    }
4✔
2383

2384
    /// follow the presumably erroneous ladder path with low coverage
2385
    fn follow_ladder_path_low<DI>(&self, start_node_id: usize, start_ext: u8, start_cov: u32, start_out_dir: Dir) -> Option<(Vec<Vec<(usize, Dir)>>, f32, f32)> 
10✔
2386
    where SD: SummaryData<DI>
10✔
2387
    {
2388
        // state is switched if coverage rises by 20% + 2 (so it's at least 2 more)
2389
        // nodes with higher state are kept connected together
2390
        // tests have shown that increase in coverage does not necessarily indicate 
2391
        // a multiplicity change
2392
        // however, we are trying to avoid false positives - false negatives will 
2393
        // most likely be cut off from the component or would be removed by remove_tips
2394
        // as a next step
2395
        // increasing these values will increase the number of removed edges
2396
        const COV_STATE_FACTOR: f32 = 0.2; 
2397
        const COV_STATE_ADD: f32 = 2.;
2398

2399
        // path, including start and target node
2400
        let mut paths = Vec::new();
10✔
2401
        paths.push(Vec::new());
10✔
2402
        // add start node and start in dir as first element in path
2403
        paths[0].push((start_node_id, start_out_dir.flip()));
10✔
2404

2405
        let target_length = K::k();
10✔
2406
        let mut path_length = 0;
10✔
2407
        let mut n_correct_edges = 0;
10✔
2408

2409
        // get fist next node
2410
        let sequence = self.base.sequences.get(start_node_id);
10✔
2411
        let term_kmer: K = sequence.term_kmer(start_out_dir);
10✔
2412
        let next_kmer = term_kmer.extend(start_ext, start_out_dir);
10✔
2413
        let (mut current_node_id, mut current_in_dir, _) = self.find_link(next_kmer, start_out_dir).expect("link should exist"); 
10✔
2414
        paths[0].push((current_node_id, current_in_dir));
10✔
2415

2416
        if self.check_edge_truth(start_node_id, current_node_id) {
10✔
2417
            n_correct_edges += 1;
×
2418
        }
10✔
2419

2420
        let mut current_cov = start_cov as f32;
10✔
2421
        let mut sum_path_cov = start_cov as f32;
10✔
2422
        let mut coverage_counter = 1; 
10✔
2423

2424
        // state to track if nodes should be removed or we're suspecting there's another path here
2425
        // only track edges for removal while state is Singular
2426
        // only track one additional path, else too complicated, return None
2427
        // increase state if there is another incoming edge or if the coverage suddenly increases drastically
2428
        // decrease state if there is another outgoing edge ot if the coverage suddenly decreases drastically
2429
        // if state cannot be increased or decreased further, return None
2430
        let mut state = LadderState::Singular;
10✔
2431

2432
        loop {
2433
            // check if current node is "target node", i.e., the path has reached the desired length
2434
            match path_length {
108✔
2435
                len if len == target_length => return Some((paths, (sum_path_cov / coverage_counter as f32), (n_correct_edges as f32 / (path_length + 1) as f32))), // path has target length
108✔
2436
                len if len > target_length => return None, // path has surpassed desired length => invalid
100✔
2437
                _ => () // path has not yet reached desired length, continue
100✔
2438
            }
2439

2440
            // since current node is not target node, add length to path
2441
            let len = self.get_node(current_node_id).len() - (K::k() - 1);
100✔
2442
            path_length += len;
100✔
2443

2444
            // get current node
2445
            let current_node = self.get_node(current_node_id);
100✔
2446
            
2447
            // get edges
2448
            let in_edges = current_node.edges(current_in_dir);
100✔
2449
            let out_edges = current_node.edges(current_in_dir.flip());
100✔
2450

2451
            // get coverage
2452
            // edge cov must be available
2453
            let out_edge_coverages = current_node.data().edge_mults().expect("must have edge mults").single_dir(current_in_dir.flip());
100✔
2454

2455
            // choose next edge by choosing coverage closest to current coverage
2456
            let mut out_ext = None;
100✔
2457
            let mut next_node_id = None;
100✔
2458
            let mut next_in_dir = None;
100✔
2459
            let mut cov_diff = i32::MAX;
100✔
2460
            for (e, id, in_dir, _) in out_edges.iter() {
106✔
2461
                let cov = out_edge_coverages.edge_mult(*e) as i32;
106✔
2462
                let new_cov_diff = (current_cov as i32 - cov).abs();
106✔
2463
                if new_cov_diff < cov_diff {
106✔
2464
                    (out_ext, next_node_id, next_in_dir, cov_diff) = (Some(*e), Some(*id), Some(in_dir), new_cov_diff);
104✔
2465
                }
104✔
2466
            }
2467
            let (Some(out_ext), Some(next_node_id), Some(next_in_dir)) = (out_ext, next_node_id, next_in_dir) else { return None; };
100✔
2468

2469
            // ideally, low path nodes should have one incoming and one outgoing edge
2470
            // if two incoming edges, increase state from singular to double if possible
2471
            match in_edges.len() {
98✔
2472
                1 => (),
89✔
2473
                2 => match state {
9✔
2474
                    LadderState::Singular => state = LadderState::Double,
9✔
2475
                    LadderState::Double => return None // getting too complicated
×
2476
                }
2477
                _ => return None
×
2478
            }
2479

2480
            // if two outgoing edges, decrease state from double to singular if possible
2481
            // if already singular, 
2482
            match out_edges.len() {
98✔
2483
                1 => (),
90✔
2484
                2 => match state {
8✔
2485
                    LadderState::Singular => (), // could return None, but would kill if overlap is only one node long
2✔
2486
                    LadderState::Double => {
6✔
2487
                        state = LadderState::Singular;
6✔
2488
                        paths.push(vec![(current_node_id, current_in_dir)]); // start new path
6✔
2489
                    }
6✔
2490
                }
2491
                _ => return None
×
2492
            }
2493

2494
            // check coverage, in theoretical ladder coverage should be uniform
2495
            let coverage = out_edge_coverages.edge_mult(out_ext) as f32;
98✔
2496
            match state {
98✔
2497
                LadderState::Singular => {
2498
                    // single state: check if next coverage is similar enough to current coverage
2499
                    // if way bigger, increase state, else adapt current coverage
2500
                    if  coverage > current_cov + current_cov * COV_STATE_FACTOR + COV_STATE_ADD {
81✔
2501
                        // higher by too much, change state
2✔
2502
                        state = LadderState::Double;
2✔
2503
                    } else {
79✔
2504
                        // in acceptable frame
79✔
2505
                        current_cov = coverage;
79✔
2506
                    }
79✔
2507
                }
2508
                LadderState::Double => {
2509
                    // double state: if way smaller (back in acceptable frame), decrease state, else dont treat as current coverage
2510
                    if coverage < current_cov + current_cov * COV_STATE_FACTOR + COV_STATE_ADD {
17✔
2511
                        // back in acceptable range
4✔
2512
                        state = LadderState::Singular;
4✔
2513
                        paths.push(vec![(current_node_id, current_in_dir)]); // start new path
4✔
2514
                        current_cov = coverage;
4✔
2515
                    } // else continue on 
13✔
2516
                    // TODO check if better to also interrupt if coverage increases further
2517
                }
2518
            }
2519

2520
            // if state singular try to check if edge is "correct", add extra length of current node to it to account for compression
2521
            match state {
98✔
2522
                LadderState::Singular => {
2523
                    if self.check_edge_truth(current_node_id, next_node_id) {
83✔
2524
                        n_correct_edges += len;
×
2525
                    }
83✔
2526
                }
2527
                LadderState::Double => ()
15✔
2528
            }
2529

2530
            // set current node id to next node to be visited
2531
            current_node_id = next_node_id;
98✔
2532
            current_in_dir = *next_in_dir;
98✔
2533

2534
            // add current node to path, unless state is double
2535
            match state {
98✔
2536
                LadderState::Double => (),
15✔
2537
                LadderState::Singular => {
83✔
2538
                    sum_path_cov += current_cov;
83✔
2539
                    coverage_counter += 1;
83✔
2540
                    let last_path = paths.len() - 1;
83✔
2541
                    paths[last_path].push((current_node_id, current_in_dir)); // add node to latest path
83✔
2542
                }
83✔
2543
            }
2544
        }
2545
    }
10✔
2546

2547
    /// follow a path of the length 2*k - 1 by choosing the edges with the hightest coverage
2548
    /// requres the graph to have edge mults
2549
    fn follow_ladder_path_high<DI>(&self, start_node_id: usize, start_ext: u8, start_cov: u32, start_out_dir: Dir) -> Option<(usize, f32, f32)> 
12✔
2550
    where SD: SummaryData<DI>
12✔
2551
    {
2552

2553
        let target_length = K::k();
12✔
2554
        let mut path_length = 0;
12✔
2555
        let mut sum_path_cov = start_cov; 
12✔
2556
        let mut coverage_counter = 1;
12✔
2557
        let mut n_correct_edges = 0; 
12✔
2558

2559
        // get fist next node
2560
        let sequence = self.base.sequences.get(start_node_id);
12✔
2561
        let term_kmer: K = sequence.term_kmer(start_out_dir);
12✔
2562
        let next_kmer = term_kmer.extend(start_ext, start_out_dir);
12✔
2563
        let (mut current_node_id, mut current_in_dir, _) = self.find_link(next_kmer, start_out_dir).expect("link should exist"); 
12✔
2564

2565
        if self.check_edge_truth(start_node_id, current_node_id) {
12✔
2566
            n_correct_edges += 1;
12✔
2567
        }
12✔
2568

2569
        loop {
2570
            // check if current node is "target node", i.e., the path has reached the desired length
2571
            match path_length {
116✔
2572
                pl if pl == target_length => return Some((current_node_id, (sum_path_cov as f32 / coverage_counter as f32), (n_correct_edges as f32 / (path_length + 1) as f32))), // path has target length
116✔
2573
                pl if pl > target_length => return None, // path has surpassed desired length => invalid
106✔
2574
                _ => () // path has not yet reached desired length, continue
106✔
2575
            }
2576

2577
            // since current node is not target node, add length to path
2578
            let len = self.get_node(current_node_id).len() - (K::k() - 1);
106✔
2579
            path_length += len;
106✔
2580

2581
            // get current node
2582
            let current_node = self.get_node(current_node_id);
106✔
2583
        
2584
            // find outgoing edge with highest coverage
2585
            let em = current_node.data().edge_mults().expect("must have edge mults").single_dir(current_in_dir.flip()).edge_mults;
106✔
2586
            let (max_cov_base, &max_cov) = em.iter().rev().enumerate().filter(|&(_, &c)| c > 0).max_by(|&(_b1, &c1), &(_b2, c2)| c1.cmp(c2))?;
424✔
2587

2588
            // get next node id
2589
            let sequence = self.base.sequences.get(current_node_id);
104✔
2590
            let term_kmer: K = sequence.term_kmer(current_in_dir.flip());
104✔
2591
            let next_kmer = term_kmer.extend(max_cov_base as u8, current_in_dir.flip());
104✔
2592
            let (next_node_id, next_in_dir, _) = self.find_link(next_kmer, current_in_dir.flip()).expect("link should exist"); 
104✔
2593

2594
            // try to check if edge is "correct", add extra length of current node to it to account for compression
2595
            if self.check_edge_truth(current_node_id, next_node_id) {
104✔
2596
                n_correct_edges += len;
104✔
2597
            }
104✔
2598

2599
            // set current node id to next node to be visited
2600
            current_node_id = next_node_id;
104✔
2601
            current_in_dir = next_in_dir;
104✔
2602
            sum_path_cov += max_cov;
104✔
2603
            coverage_counter += 1;
104✔
2604
        }
2605
    }
12✔
2606

2607
    /// remove tips from the graph, reqires the graoh to have edge mults and be stranded
2608
    /// it is recommended to use this function after [`DebruijnGraph::remove_ladders`], since 
2609
    /// the latter will likely leave tips in the graph
2610
    pub fn remove_tips<DI, P>(&mut self, min_diff_factor: u32, max_avg_tip_cov: f32, out_path: Option<P>) -> Result<(), String> 
4✔
2611
    where 
4✔
2612
        SD: SummaryData<DI>,
4✔
2613
        P: AsRef<Path>
4✔
2614
    {
2615
        // interrupt if we dont have edge mults
2616
        if self.get_node(0).data().edge_mults().is_none() { return Err(String::from("no edge mults available")) };
4✔
2617

2618
        let mut writer = out_path.map(|path| BufWriter::new(File::create(path).expect("error creating ladder stats file")));
4✔
2619
        if let Some(wtr) = writer.as_mut() {
4✔
2620
            writeln!(wtr, "high cov,avg low cov,max true,tip truth ratio,tip len,dir").unwrap();
×
2621
        }
4✔
2622

2623
        let max_len = K::k();
4✔
2624

2625
        // iterate over nodes
2626
        for node_id in 0..self.len() {
132✔
2627
            // do for both left and right as "outgoing" direction 
2628
            // right for regular outgoing and tips from the read-end
2629
            // left for the reverse -> tips from the read-start
2630
            for dir in [Dir::Left, Dir::Right] {
264✔
2631
                let current_node = self.get_node(node_id);
264✔
2632
                // check if node has outgoing edge(s) with both high and low coverage
2633
                let outs = current_node.data().edge_mults().expect("should have em").single_dir(dir).edge_mults;
264✔
2634

2635
                let Some((out_max_base, &out_max_cov)) = outs.iter().rev().enumerate().filter(|&(_, &c)| c  > 0).max_by(|&(_b1, &c1), &(_b2, c2)| c1.cmp(c2)) else { continue };
1,056✔
2636
                let smaller_outs = outs.iter().copied().rev().enumerate().filter(|&(_b, c)| (c > 0) & (out_max_cov > c)).collect::<Vec<_>>();
916✔
2637
                
2638
                if smaller_outs.is_empty() { continue; }
229✔
2639

2640
                // try and check if max cov edge is correct
2641
                let max_connection_correct = {
8✔
2642
                    let sequence = self.base.sequences.get(node_id);
8✔
2643
                    let term_kmer: K = sequence.term_kmer(dir);
8✔
2644
                    let next_kmer = term_kmer.extend(out_max_base as u8, dir);
8✔
2645
                    let (next_node_id, _, _) = self.find_link(next_kmer, dir).expect("missing link");
8✔
2646

2647
                    self.check_edge_truth(node_id, next_node_id)
8✔
2648
                }; // if current node is incorrect, connection is incorrect in any case
2649
                
2650
                // check all small outs
2651
                for (s_base, s_cov) in smaller_outs {
8✔
2652
                    // path has to be short + unambiguous + consistently low coverage
2653
                    let Some((tip_path, avg_tip_coverage, truth_ratio, tip_len)) = self.follow_tip_path(node_id, s_base as u8, s_cov, dir) else { continue; };
8✔
2654

2655
                    // remove path if coverage below threshold
2656
                    if (s_cov * min_diff_factor <= out_max_cov) & (avg_tip_coverage <= max_avg_tip_cov) & (tip_len <= max_len) {
8✔
2657
                        self.remove_path(tip_path)?;
8✔
2658
                    }
×
2659
                        
2660
                    if let Some(wtr) = writer.as_mut() {
8✔
2661
                        writeln!(wtr, "{},{},{},{},{},{:?}", out_max_cov, avg_tip_coverage, max_connection_correct, truth_ratio, tip_len, dir).unwrap();
×
2662
                    }
8✔
2663
                }
2664
            }
2665
        }
2666
        
2667
        Ok(())
4✔
2668
    }
4✔
2669

2670
    /// follow a tip path, returns path, average coverage and ration of correct to incorrect connections
2671
    fn follow_tip_path<DI>(&self, start_node_id: usize, start_ext: u8, start_cov: u32, start_out_dir: Dir) -> Option<(Vec<(usize, Dir)>, f32, f32, usize)> 
8✔
2672
    where 
8✔
2673
        SD: SummaryData<DI>
8✔
2674
    {
2675
        const COV_MARGIN: f32 = 0.3;
2676
        const COV_ADD_MARGIN: f32 = 4.;
2677

2678
        // path, including start and target node
2679
        let mut path = Vec::new();
8✔
2680
        path.push((start_node_id, start_out_dir.flip()));
8✔
2681

2682
        let mut path_length = 0;
8✔
2683
        let mut n_correct_edges = 0;
8✔
2684

2685
        // get fist next node
2686
        let sequence = self.base.sequences.get(start_node_id);
8✔
2687
        let term_kmer: K = sequence.term_kmer(start_out_dir);
8✔
2688
        let next_kmer = term_kmer.extend(start_ext, start_out_dir);
8✔
2689
        let (mut current_node_id, mut current_in_dir, _) = self.find_link(next_kmer, start_out_dir).expect("link should exist"); 
8✔
2690
        path.push((current_node_id, current_in_dir));
8✔
2691

2692
        if self.check_edge_truth(start_node_id, current_node_id) {
8✔
2693
            n_correct_edges += 1;
×
2694
        }
8✔
2695

2696
        let mut current_cov = start_cov as f32;
8✔
2697
        let mut sum_path_cov = start_cov as f32;
8✔
2698
        let mut coverage_counter = 1;
8✔
2699

2700
        loop {     
2701
            // get current node
2702
            let current_node = self.get_node(current_node_id);
26✔
2703

2704
            // add length to path
2705
            let len = current_node.len() - (K::k() - 1);
26✔
2706
            path_length += len;
26✔
2707

2708
            // low path nodes should have only one incoming and one outgoing edge
2709
            let in_edges = current_node.edges(current_in_dir);
26✔
2710
            if in_edges.len() != 1 { return None; }
26✔
2711

2712
            let out_edges = current_node.edges(current_in_dir.flip());
26✔
2713
            match out_edges.len() {
26✔
2714
                oe if oe > 1 => return None,
26✔
2715
                0 => return Some((path, (sum_path_cov / coverage_counter as f32), (n_correct_edges as f32 / path_length as f32), path_length)),
8✔
2716
                _ => ()
18✔
2717
            }
2718

2719
            // if edge cov avalable, check for similarity
2720
            let (out_ext, next_node_id, next_in_dir, _) = out_edges[0];
18✔
2721
            if let Some(em) = current_node.data().edge_mults() {
18✔
2722
                let coverage = em.edge_mult(out_ext, current_in_dir.flip()) as f32;
18✔
2723
                if coverage < current_cov - current_cov * COV_MARGIN - COV_ADD_MARGIN // extra two for small values
18✔
2724
                    || coverage > current_cov + current_cov * COV_MARGIN + COV_ADD_MARGIN {
18✔
2725
                    return None;
×
2726
                } else {
18✔
2727
                    current_cov = coverage;
18✔
2728
                }
18✔
2729
            }
×
2730

2731

2732

2733
            // try to check if edge is "correct", add extra length of current node to it to account for compression
2734
            if self.check_edge_truth(current_node_id, next_node_id) {
18✔
2735
                n_correct_edges += len;
×
2736
            }
18✔
2737

2738
            // set current node id to next node to be visited
2739
            current_node_id = next_node_id;
18✔
2740
            current_in_dir = next_in_dir;
18✔
2741
            sum_path_cov += current_cov;
18✔
2742
            coverage_counter += 1;
18✔
2743
            // add current node to path
2744
            path.push((current_node_id, current_in_dir));
18✔
2745
        }
2746
    }
8✔
2747

2748
    /// remove the edges (not the nodes) of a path in the graph, the path has to contain 
2749
    /// the node ID and the direction the path is coming from moving through the node
2750
    fn remove_path<DI>(&mut self, path: Vec<(usize, Dir)>) -> Result<(), String> 
114✔
2751
    where SD: SummaryData<DI>
114✔
2752
    {
2753
        let mut path_iter = path.clone().into_iter();
114✔
2754
        let Some((mut current_node_id, mut current_in_dir)) = path_iter.next() else { return Ok(()); };
114✔
2755

2756
        loop {
2757
            // get next node id
2758
            let Some((next_node_id, next_in_dir)) = path_iter.next() else { return Ok(()); }; // whole path has been covered
498✔
2759
            // find base to remove ext leaving the current node
2760
            let Some((out_base, _, _, _)) = self.get_node(current_node_id).edges(current_in_dir.flip()).iter().find(|&(_, id, _, _)| *id == next_node_id).copied()
438✔
2761
                else {
NEW
2762
                    return Err( format!("no edge to remove (other dir), node 1:  {:?}, node 2: {:?}",
×
NEW
2763
                        self.get_node(current_node_id),
×
NEW
2764
                        self.get_node(next_node_id)
×
NEW
2765
                    ))
×
2766
                };
2767

2768
            // remove ext 
2769
            self.base.exts[current_node_id] = self.base.exts[current_node_id].remove(current_in_dir.flip(), out_base);
384✔
2770
        
2771

2772
            // find base to remove ext entering the next node
2773
            let Some((in_base, _, _, _)) = self.get_node(next_node_id).edges(next_in_dir).iter().find(|&(_, id, _, _)| *id == current_node_id).copied() 
409✔
2774
                else { 
NEW
2775
                    return Err( format!("no edge to remove (other dir), node 1:  {:?}, node 2: {:?}",
×
NEW
2776
                    self.get_node(current_node_id),
×
NEW
2777
                    self.get_node(next_node_id)
×
UNCOV
2778
                ))
×
2779
                };
2780

2781
            // remove ext
2782
            self.base.exts[next_node_id] = self.base.exts[next_node_id].remove(next_in_dir, in_base); 
384✔
2783

2784
            // use new exts to fix edge mults and edge maps
2785
            self.base.data[current_node_id].fix_edge_data(self.base.exts[current_node_id]);
384✔
2786
            self.base.data[next_node_id].fix_edge_data(self.base.exts[next_node_id]);
384✔
2787

2788

2789
            current_node_id = next_node_id;
384✔
2790
            current_in_dir = next_in_dir;
384✔
2791
        }
2792
    }
114✔
2793

2794
    /// use ids mapped to nodes to check if the nodes of an edge were mapped to the same id
2795
    /// returns false if mapped ids are not available
2796
    pub fn check_edge_truth<DI>(&self, node_id_1: usize, node_id_2: usize) -> bool
243✔
2797
    where 
243✔
2798
        SD: SummaryData<DI>
243✔
2799
    {
2800
        let mapped_ids_1 = self.get_node(node_id_1).data().mapped_ids();
243✔
2801
        let mapped_ids_2 = self.get_node(node_id_2).data().mapped_ids();
243✔
2802

2803
        if let (Some(mids1), Some(mids2)) = (mapped_ids_1, mapped_ids_2) {
243✔
2804
            let hashed_mi2 = mids2.iter().copied().collect::<HashSet<_>>();
243✔
2805
            for mid in mids1 {
243✔
2806
                if hashed_mi2.contains(mid) {
142✔
2807
                    return true;
124✔
2808
                }
18✔
2809
            }
2810
        }
×
2811

2812
        false
119✔
2813
    }
243✔
2814

2815
    /// use ids mapped to edges to check if the edge is a true edge
2816
    /// returns false if mapped ids are not available
NEW
2817
    pub fn check_edge_truth_emap<DI>(&self, node_id_1: usize, node_id_2: usize) -> bool
×
NEW
2818
    where 
×
NEW
2819
        SD: SummaryData<DI>
×
2820
    {
NEW
2821
        let mapped_ids_1 = self.get_node(node_id_1).data().mapped_edge_ids();
×
2822

NEW
2823
        if let Some(emap) = mapped_ids_1 {
×
NEW
2824
            let node_1 = self.get_node(node_id_1);
×
2825

2826
            // find node 2 in edges of node 1, check in both dirs
NEW
2827
            if let Some((out_base, _nb_id, _nb_inc_dir, _flip)) = node_1.l_edges().iter().find(|(_b, nb, _d, _f)| *nb == node_id_2) {
×
NEW
2828
                let edge_map = emap.edge_map(*out_base, Dir::Left);
×
NEW
2829
                if !edge_map.is_empty() {
×
NEW
2830
                    return true
×
NEW
2831
                }
×
NEW
2832
            } else if let Some((out_base, _nb_id, _nb_inc_dir, _flip)) = node_1.r_edges().iter().find(|(_b, nb, _d, _f)| *nb == node_id_2) {
×
NEW
2833
                let edge_map = emap.edge_map(*out_base, Dir::Right);
×
NEW
2834
                if !edge_map.is_empty() {
×
NEW
2835
                    return true
×
NEW
2836
                }
×
2837
            } else {
NEW
2838
                panic!("missing neighbor")
×
2839
            };
2840

NEW
2841
        }
×
2842

NEW
2843
        false
×
NEW
2844
    }
×
2845
}
2846

2847

2848
#[derive(Debug, Eq, PartialEq)]
2849
enum Status {
2850
    Active,
2851
    End,
2852
    Cycle,
2853
}
2854

2855
#[derive(Debug)]
2856
struct State {
2857
    path: SmallVec8<(u32, Dir)>,
2858
    score: f32,
2859
    status: Status,
2860
}
2861

2862
impl State {}
2863

2864
#[derive(Debug, PartialEq, Eq)]
2865
pub enum LadderState {
2866
    Singular,
2867
    Double
2868
}
2869

2870
/// Iterator over nodes in a `DeBruijnGraph`
2871
pub struct NodeIter<'a, K: Kmer + 'a, D: Debug + 'a> {
2872
    graph: &'a DebruijnGraph<K, D>,
2873
    node_id: usize,
2874
}
2875

2876
impl<'a, K: Kmer + 'a, D: Debug + 'a> Iterator for NodeIter<'a, K, D> {
2877
    type Item = Node<'a, K, D>;
2878

2879
    fn next(&mut self) -> Option<Node<'a, K, D>> {
20,029✔
2880
        if self.node_id < self.graph.len() {
20,029✔
2881
            let node = self.graph.get_node(self.node_id);
19,899✔
2882
            self.node_id += 1;
19,899✔
2883
            Some(node)
19,899✔
2884
        } else {
2885
            None
130✔
2886
        }
2887
    }
20,029✔
2888
}
2889

2890
impl<'a, K: Kmer + 'a, D: Debug + 'a> IntoIterator for &'a DebruijnGraph<K, D> {
2891
    type Item = NodeKmer<'a, K, D>;
2892
    type IntoIter = NodeIntoIter<'a, K, D>;
2893

2894
    fn into_iter(self) -> Self::IntoIter {
2,217✔
2895
        NodeIntoIter {
2,217✔
2896
            graph: self,
2,217✔
2897
            node_id: 0,
2,217✔
2898
        }
2,217✔
2899
    }
2,217✔
2900
}
2901

2902
/// Iterator over nodes in a `DeBruijnGraph`
2903
pub struct NodeIntoIter<'a, K: Kmer + 'a, D: Debug + 'a> {
2904
    graph: &'a DebruijnGraph<K, D>,
2905
    node_id: usize,
2906
}
2907

2908
impl<'a, K: Kmer + 'a, D: Debug + 'a> Iterator for NodeIntoIter<'a, K, D> {
2909
    type Item = NodeKmer<'a, K, D>;
2910

2911
    fn next(&mut self) -> Option<Self::Item> {
250,362✔
2912
        if self.node_id < self.graph.len() {
250,362✔
2913
            let node_id = self.node_id;
250,362✔
2914
            let node = self.graph.get_node(node_id);
250,362✔
2915
            let node_seq = node.sequence();
250,362✔
2916

2917
            self.node_id += 1;
250,362✔
2918
            Some(NodeKmer {
250,362✔
2919
                node_id,
250,362✔
2920
                node_seq_slice: node_seq,
250,362✔
2921
                phantom_d: PhantomData,
250,362✔
2922
                phantom_k: PhantomData,
250,362✔
2923
            })
250,362✔
2924
        } else {
2925
            None
×
2926
        }
2927
    }
250,362✔
2928
}
2929

2930
/// A `DebruijnGraph` node with a reference to the sequence of the node.
2931
#[derive(Clone)]
2932
pub struct NodeKmer<'a, K: Kmer + 'a, D: Debug + 'a> {
2933
    pub node_id: usize,
2934
    node_seq_slice: DnaStringSlice<'a>,
2935
    phantom_k: PhantomData<K>,
2936
    phantom_d: PhantomData<D>,
2937
}
2938

2939
/// An iterator over the kmers in a `DeBruijn graph node`
2940
pub struct NodeKmerIter<'a, K: Kmer + 'a, D: Debug + 'a> {
2941
    kmer_id: usize,
2942
    kmer: K,
2943
    num_kmers: usize,
2944
    node_seq_slice: DnaStringSlice<'a>,
2945
    phantom_k: PhantomData<K>,
2946
    phantom_d: PhantomData<D>,
2947
}
2948

2949
impl<'a, K: Kmer + 'a, D: Debug + 'a> IntoIterator for NodeKmer<'a, K, D> {
2950
    type Item = K;
2951
    type IntoIter = NodeKmerIter<'a, K, D>;
2952

2953
    fn into_iter(self) -> Self::IntoIter {
500,724✔
2954
        let num_kmers = self.node_seq_slice.len() - K::k() + 1;
500,724✔
2955

2956
        let kmer = if num_kmers > 0 {
500,724✔
2957
            self.node_seq_slice.get_kmer::<K>(0)
500,724✔
2958
        } else {
2959
            K::empty()
×
2960
        };
2961

2962
        NodeKmerIter {
500,724✔
2963
            kmer_id: 0,
500,724✔
2964
            kmer,
500,724✔
2965
            num_kmers,
500,724✔
2966
            node_seq_slice: self.node_seq_slice,
500,724✔
2967
            phantom_k: PhantomData,
500,724✔
2968
            phantom_d: PhantomData,
500,724✔
2969
        }
500,724✔
2970
    }
500,724✔
2971
}
2972

2973
impl<'a, K: Kmer + 'a, D: Debug + 'a> Iterator for NodeKmerIter<'a, K, D> {
2974
    type Item = K;
2975

2976
    fn next(&mut self) -> Option<Self::Item> {
3,385,742✔
2977
        if self.num_kmers == self.kmer_id {
3,385,742✔
2978
            None
×
2979
        } else {
2980
            let current_kmer = self.kmer;
3,385,742✔
2981

2982
            self.kmer_id += 1;
3,385,742✔
2983
            if self.kmer_id < self.num_kmers {
3,385,742✔
2984
                let next_base = self.node_seq_slice.get(self.kmer_id + K::k() - 1);
3,321,046✔
2985
                let new_kmer = self.kmer.extend_right(next_base);
3,321,046✔
2986
                self.kmer = new_kmer;
3,321,046✔
2987
            }
3,321,046✔
2988

2989
            Some(current_kmer)
3,385,742✔
2990
        }
2991
    }
3,385,742✔
2992

2993
    fn size_hint(&self) -> (usize, Option<usize>) {
250,362✔
2994
        (self.num_kmers, Some(self.num_kmers))
250,362✔
2995
    }
250,362✔
2996

2997
    /// Provide a 'fast-forward' capability for this iterator
2998
    /// MPHF will use this to reduce the number of kmers that
2999
    /// need to be produced.
3000
    fn nth(&mut self, n: usize) -> Option<Self::Item> {
2,441,424✔
3001
        if n <= 4 {
2,441,424✔
3002
            // for small skips forward, shift one base at a time
3003
            for _ in 0..n {
2,195,546✔
3004
                self.next();
944,318✔
3005
            }
944,318✔
3006
        } else {
245,878✔
3007
            self.kmer_id += n;
245,878✔
3008
            self.kmer = self.node_seq_slice.get_kmer::<K>(self.kmer_id);
245,878✔
3009
        }
245,878✔
3010

3011
        self.next()
2,441,424✔
3012
    }
2,441,424✔
3013
}
3014

3015
/// Marker signifying that NodeKmerIter has a known size.
3016
impl<'a, K: Kmer + 'a, D: Debug + 'a> ExactSizeIterator for NodeKmerIter<'a, K, D> {}
3017

3018
/// Unbranched sequence in the DeBruijn graph
3019
pub struct Node<'a, K: Kmer + 'a, D: 'a> {
3020
    pub node_id: usize,
3021
    pub graph: &'a DebruijnGraph<K, D>,
3022
}
3023

3024
impl<'a, K: Kmer, D: Debug> Node<'a, K, D> {
3025
    /// Length of the sequence of this node
3026
    pub fn len(&self) -> usize {
1,396,979✔
3027
        self.graph.base.sequences.get(self.node_id).len()
1,396,979✔
3028
    }
1,396,979✔
3029

3030
    pub fn is_empty(&self) -> bool {
×
3031
        self.graph.base.sequences.get(self.node_id).is_empty()
×
3032
    }
×
3033

3034
    /// Sequence of the node
3035
    pub fn sequence(&self) -> DnaStringSlice<'a> {
5,132,443✔
3036
        self.graph.base.sequences.get(self.node_id)
5,132,443✔
3037
    }
5,132,443✔
3038

3039
    /// Reference to auxiliarly data associated with the node
3040
    pub fn data(&self) -> &'a D {
3,469,250✔
3041
        &self.graph.base.data[self.node_id]
3,469,250✔
3042
    }
3,469,250✔
3043

3044
    /// Extension bases from this node
3045
    pub fn exts(&self) -> Exts {
3,481,101✔
3046
        self.graph.base.exts[self.node_id]
3,481,101✔
3047
    }
3,481,101✔
3048

3049
    /// Edges leaving the left side of the node in the format
3050
    /// (base, target_node id, incoming side of target node, whether target node is flipped)
3051
    pub fn l_edges(&self) -> SmallVec4<(u8, usize, Dir, bool)> {
7,769✔
3052
        self.graph.find_edges(self.node_id, Dir::Left)
7,769✔
3053
    }
7,769✔
3054

3055
    /// Edges leaving the right side of the node in the format
3056
    /// (base, target_node id, incoming side of target node, whether target node is flipped)
3057
    pub fn r_edges(&self) -> SmallVec4<(u8, usize, Dir, bool)> {
7,770✔
3058
        self.graph.find_edges(self.node_id, Dir::Right)
7,770✔
3059
    }
7,770✔
3060

3061
    /// Edges leaving the 'dir' side of the node in the format
3062
    /// (base, target_node id, incoming side of target node, whether target node is flipped)
3063
    pub fn edges(&self, dir: Dir) -> SmallVec4<(u8, usize, Dir, bool)> {
442,581✔
3064
        self.graph.find_edges(self.node_id, dir)
442,581✔
3065
    }
442,581✔
3066

3067
    fn to_json<F: Fn(&D) -> Value>(&self, func: &F, f: &mut dyn Write) {
×
3068
        write!(
×
3069
            f,
×
3070
            "{{\"id\":\"{}\",\"L\":{},\"D\":{},\"Se\":\"{:?}\"}}",
3071
            self.node_id,
3072
            self.sequence().len(),
×
3073
            (func)(self.data()),
×
3074
            self.sequence(),
×
3075
        )
3076
        .unwrap();
×
3077
    }
×
3078

3079
    fn edges_to_json(&self, f: &mut dyn Write) -> bool {
×
3080
        let mut wrote = false;
×
3081
        let edges = self.r_edges();
×
3082
        for (idx, &(_, id, incoming_dir, _)) in edges.iter().enumerate() {
×
3083
            write!(
×
3084
                f,
×
3085
                "{{\"source\":\"{}\",\"target\":\"{}\",\"D\":\"{}\"}}",
3086
                self.node_id,
3087
                id,
3088
                match incoming_dir {
×
3089
                    Dir::Left => "L",
×
3090
                    Dir::Right => "R",
×
3091
                }
3092
            )
3093
            .unwrap();
×
3094

3095
            if idx < edges.len() - 1 {
×
3096
                write!(f, ",").unwrap();
×
3097
            }
×
3098

3099
            wrote = true;
×
3100
        }
3101
        wrote
×
3102
    }
×
3103
}
3104

3105
// TODO make generic instead of u8 (u8 is sufficient for dbg)
3106
impl<K: Kmer, SD: Debug> Node<'_, K, SD>  {
3107
    /// get default format for dot edges based on node data
3108
    pub fn edge_dot_default<DI>(&self, colors: &Colors<SD, DI>, base: u8, incoming_dir: Dir, flipped: bool) -> String 
2,728✔
3109
    where SD: SummaryData<DI>
2,728✔
3110
    {
3111
        // set color based on dir
3112
        let color = match incoming_dir {
2,728✔
3113
            Dir::Left => "blue",
1,370✔
3114
            Dir::Right => "red"
1,358✔
3115
        };
3116
        
3117
        if let Some(em) = self.data().edge_mults() {
2,728✔
3118
            
3119
            let dir = if flipped { 
2,728✔
3120
                incoming_dir 
492✔
3121
            } else {
3122
                incoming_dir.flip()
2,236✔
3123
            };
3124

3125
            // set penwidth based on count
3126
            let count = em.edge_mult(base, dir);
2,728✔
3127
            let penwidth = colors.edge_width(count);
2,728✔
3128

3129
            if let Some(emap) = self.data().mapped_edge_ids() {
2,728✔
3130
                // if available, also print IDs mapped to edges
NEW
3131
                let mapped_ids = emap.edge_map(base, dir);
×
3132

3133
                // change color according to edge truth
NEW
3134
                let color = match incoming_dir {
×
NEW
3135
                    Dir::Left => if mapped_ids.is_empty() {"deepskyblue"} else {"blue"},
×
NEW
3136
                    Dir::Right => if mapped_ids.is_empty() {"salmon"} else {"red"},
×
3137
                };
3138

NEW
3139
                format!("[color={color}, penwidth={penwidth}, label=\"{}: {count}, {:?}\", weight={count}]", bits_to_base(base), mapped_ids)
×
3140
            } else {
3141
                format!("[color={color}, penwidth={penwidth}, label=\"{}: {count}\", weight={count}]", bits_to_base(base))
2,728✔
3142
            }
3143
            
3144
        } else {
3145
            format!("[color={color}, penwidth={}]", colors.edge_width(1)) // since there should be no edge mults, this will return default value
×
3146
        }
3147
    }
2,728✔
3148

3149
    /// get default format for dot nodes, based on node data
3150
    pub fn node_dot_default<DI>(&self, colors: &Colors<SD, DI>, config: &SummaryConfig, translator: &Translator, outline: bool, translate_id_groups: bool) -> String
1,475✔
3151
    where SD: SummaryData<DI>
1,475✔
3152
    {
3153
        // set color based on labels/fold change/p-value
3154
        let color = colors.node_color_dot(self.data(), config, outline);
1,475✔
3155
        let translate_id_groups = if translate_id_groups { colors.id_group_ids() } else { None };
1,475✔
3156

3157
        let data_info = self.data().print(translator, config, translate_id_groups);
1,475✔
3158
        const MIN_TEXT_WIDTH: usize = 40;
3159
        let wrap = if self.len() > MIN_TEXT_WIDTH { self.len() } else { MIN_TEXT_WIDTH };
1,475✔
3160

3161
        let label = textwrap::fill(&format!("id: {}, len: {}, exts: {:?}, seq: {}\n{}", 
1,475✔
3162
            self.node_id,
1,475✔
3163
            self.len(),
1,475✔
3164
            self.exts(),
1,475✔
3165
            self.sequence(),
1,475✔
3166
            data_info
1,475✔
3167
        ), wrap);
1,475✔
3168

3169
        format!("[{color}, label=\"{label}\"]")
1,475✔
3170
    }
1,475✔
3171

3172
    // get the default properties for json edges
3173
    pub fn edge_json_default<DI>(&self, target_node_id: usize, base: u8, incoming_dir: Dir, flipped: bool) -> String 
106✔
3174
    where SD: SummaryData<DI>
106✔
3175
    {
3176
        // set color based on dir
3177
        let dir = match incoming_dir {
106✔
3178
            Dir::Right => 0,
1✔
3179
            Dir::Left => 1
105✔
3180
        };
3181

3182
        // get base in other dir
3183
        let target_node = self.graph.get_node(target_node_id);
106✔
3184
        let nb_base = if self.graph.base.stranded {
106✔
3185
            // only look at left edges of target node
3186
            let Some(nb_base) = target_node.l_edges().iter().filter_map(|(b, id, _in_dir, _flip)|
106✔
3187
                if *id == self.node_id {
110✔
3188
                    Some(*b)
106✔
3189
                } else { None }
110✔
3190
            ).next() else { panic!("missing neighbor") };
106✔
3191
            nb_base
106✔
3192
        } else {
3193
            // look at edges in either direction
3194
            let Some(nb_base) = target_node.r_edges().iter().chain(target_node.l_edges().iter()).filter_map(|(b, id, _in_dir, _flip)|
×
3195
                if *id == self.node_id {
×
3196
                    Some(*b)
×
3197
                } else { None }
×
3198
            ).next() else { panic!("missing neighbor") };
×
3199
            nb_base
×
3200
        };
3201
        
3202
        let value = if let Some(em) = self.data().edge_mults() {
106✔
3203
            
3204
            let dir = if flipped { 
106✔
3205
                incoming_dir 
1✔
3206
            } else {
3207
                incoming_dir.flip()
105✔
3208
            };
3209

3210
            let count = em.edge_mult(base, dir);
106✔
3211

3212
            format!(", \"strength\": {count}")
106✔
3213
        } else {
3214
            String::from("")
×
3215
        };
3216

3217
        format!("\"source\": {}, \"target\": {target_node_id}, \"source_b\": \"{}\", \"target_b\": \"{}\", \"dir\": {dir}{value}",
106✔
3218
            self.node_id,
3219
            bits_to_base(base),
106✔
3220
            bits_to_base(nb_base),
106✔
3221
        )
3222
    }
106✔
3223

3224
    /// get default properties for json nodes, based on node data
3225
    pub fn node_json_default<DI>(&self, colors: &Colors<SD, DI>, config: &SummaryConfig, translator: &Translator, translate_id_groups: bool) -> String
111✔
3226
    where SD: SummaryData<DI>
111✔
3227
    {
3228
        // set hue based on node data
3229
        let hue = colors.hue_json(self.data(), config);
111✔
3230
        let translate_id_groups = if translate_id_groups { colors.id_group_ids() } else { None };
111✔
3231

3232
        let data_info = self.data().print_json(translator, config, translate_id_groups);
111✔
3233

3234
        format!("\"id\": {}, \"len\": {}, \"seq\": \"{}\", \"hue\": {hue}, {data_info}",
111✔
3235
            self.node_id,
3236
            self.len(),
111✔
3237
            self.sequence(),
111✔
3238
        )
3239
    }
111✔
3240
}
3241

3242
impl<K: Kmer, D> fmt::Debug for Node<'_, K, D>
3243
where
3244
    D: Debug,
3245
{
3246
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
145✔
3247
        write!(
145✔
3248
            f,
145✔
3249
            "Node {{ id:{}, Exts: {:?}, L:{:?} R:{:?}, Seq: {:?}, Data: {:?} }}",
3250
            self.node_id,
3251
            self.exts(),
145✔
3252
            self.l_edges(),
145✔
3253
            self.r_edges(),
145✔
3254
            self.sequence().len(),
145✔
3255
            self.data()
145✔
3256
        )
3257
    }
145✔
3258
}
3259

3260
pub struct IterComponents<'a, K: Kmer, D> {
3261
    graph: &'a DebruijnGraph<K, D>,
3262
    visited: Vec<bool>,
3263
    pos: usize,
3264
}
3265

3266
impl<K: Kmer, D: Debug> Iterator for IterComponents<'_, K, D> {
3267
    type Item = Vec<usize>;
3268
    fn next(&mut self) -> Option<Self::Item> {
24✔
3269
        while self.pos < self.graph.len() {
652✔
3270
            if !self.visited[self.pos] {
645✔
3271
                let comp = self.graph.component_i(&mut self.visited, self.pos);
17✔
3272
                self.pos += 1;
17✔
3273
                return Some(comp)
17✔
3274
            } else {
628✔
3275
                self.pos += 1;
628✔
3276
            }
628✔
3277
        }
3278
        assert!(self.visited.iter().map(|x| *x as usize).sum::<usize>() == self.graph.len());
645✔
3279
        None
7✔
3280
    }
24✔
3281
    
3282
}
3283

3284
pub struct PathCompIter<'a, K: Kmer, D: Debug, F, F2> 
3285
where 
3286
F: Fn(&D) -> f32,
3287
F2: Fn(&D) -> bool
3288
{
3289
    graph: &'a DebruijnGraph<K, D>,
3290
    component_iterator: IterComponents<'a, K, D>,
3291
    graph_pos: usize,
3292
    score: F,
3293
    solid_path: F2,
3294
}
3295

3296
/// returns the component and the "best" path in the component
3297
impl<K: Kmer, D: Debug, F, F2> Iterator for PathCompIter<'_, K, D, F, F2> 
3298
where 
3299
F: Fn(&D) -> f32,
3300
F2: Fn(&D) -> bool
3301
{
3302
    type Item = (Vec<usize>, VecDeque<(usize, Dir)>,);
3303
    fn next(&mut self) -> Option<Self::Item> {
10✔
3304
        match self.component_iterator.next() {
10✔
3305
            Some(component) => {
7✔
3306
                let current_comp = component;
7✔
3307
                
3308
    
3309
                let mut best_node = current_comp[0];
7✔
3310
                let mut best_score = f32::MIN;
7✔
3311
                for c in current_comp.iter() {
371✔
3312
                    let node = self.graph.get_node(*c);
371✔
3313
                    let node_score = (self.score)(node.data());
371✔
3314
    
3315
                    if node_score > best_score {
371✔
3316
                        best_node = *c;
13✔
3317
                        best_score = node_score;
13✔
3318
                    }
358✔
3319
                }
3320
    
3321
                let oscore = |state| match state {
662✔
3322
                    None => 0.0,
327✔
3323
                    Some((id, _)) => (self.score)(self.graph.get_node(id).data()),
335✔
3324
                };
662✔
3325
    
3326
                let osolid_path = |state| match state {
331✔
3327
                    None => false,
×
3328
                    Some((id, _)) => (self.solid_path)(self.graph.get_node(id).data()),
331✔
3329
                };
331✔
3330
    
3331
                // Now expand in each direction, greedily taking the best path. Stop if we hit a node we've
3332
                // already put into the path
3333
                let mut used_nodes = HashSet::new();
7✔
3334
                let mut path = VecDeque::new();
7✔
3335
    
3336
                // Start w/ initial state
3337
                used_nodes.insert(best_node);
7✔
3338
                path.push_front((best_node, Dir::Left));
7✔
3339
    
3340
                for init in [(best_node, Dir::Left, false), (best_node, Dir::Right, true)].iter() {
14✔
3341
                    let &(start_node, dir, do_flip) = init;
14✔
3342
                    let mut current = (start_node, dir);
14✔
3343
    
3344
                    loop {
3345
                        let mut next = None;
341✔
3346
                        let (cur_id, incoming_dir) = current;
341✔
3347
                        let node = self.graph.get_node(cur_id);
341✔
3348
                        let edges = node.edges(incoming_dir.flip());
341✔
3349
    
3350
                        let mut solid_paths = 0;
341✔
3351
                        for (_, id, dir, _) in edges {
341✔
3352
                            let cand = Some((id, dir));
331✔
3353
                            if osolid_path(cand) {
331✔
3354
                                solid_paths += 1;
331✔
3355

3356
                                // second if clause is outside of first in original code (see max_path) 
3357
                                // but would basically ignore path validity.
3358
                                if oscore(cand) > oscore(next) {
331✔
3359
                                    next = cand;
328✔
3360
                                }
328✔
3361
                            }
×
3362
                        }
3363
                        
3364
                        // break if multiple solid paths are available
3365
                        /* if solid_paths > 1 {
3366
                            break;
3367
                        } */
3368
    
3369
                        match next {
327✔
3370
                            Some((next_id, next_incoming)) if !used_nodes.contains(&next_id) => {
327✔
3371
                                if do_flip {
327✔
3372
                                    path.push_front((next_id, next_incoming.flip()));
136✔
3373
                                } else {
191✔
3374
                                    path.push_back((next_id, next_incoming));
191✔
3375
                                }
191✔
3376
    
3377
                                used_nodes.insert(next_id);
327✔
3378
                                current = (next_id, next_incoming);
327✔
3379
                            }
3380
                            _ => break,
14✔
3381
                        }
3382
                    }
3383
                }
3384
                
3385
                
3386
                Some((current_comp, path))
7✔
3387
            }, 
3388
            None => {
3389
                // should technically not need graph_pos after this 
3390
                self.graph_pos += 1;
3✔
3391
                None
3✔
3392
            }
3393
        }
3394
    }
10✔
3395
}
3396

3397

3398
/// iterator over the edges of the de bruijn graph
3399
pub struct EdgeIter<'a, K: Kmer, D: Debug> {
3400
    graph: &'a DebruijnGraph<K, D>,
3401
    visited_edges: HashSet<(usize, usize)>,
3402
    current_node: usize,
3403
    current_dir: Dir,
3404
    node_edge_iter: smallvec::IntoIter<[(u8, usize, Dir, bool); 4]>
3405
}
3406

3407
impl<K: Kmer, D: Debug> EdgeIter<'_, K, D> {
3408
    pub fn new(graph: &DebruijnGraph<K, D>) -> EdgeIter<'_, K, D>{
41✔
3409
        let node_edge_iter = graph.get_node(0).l_edges().into_iter();
41✔
3410

3411
        EdgeIter { 
41✔
3412
            graph, 
41✔
3413
            visited_edges: HashSet::new(), 
41✔
3414
            current_node: 0, 
41✔
3415
            current_dir: Dir::Left, 
41✔
3416
            node_edge_iter
41✔
3417
        }
41✔
3418
    }
41✔
3419
}
3420

3421
impl<K: Kmer, D: Debug> Iterator for EdgeIter<'_, K, D> {
3422
    type Item = (usize, Dir, u8, usize); // node, direction leaving node, base, target node
3423

3424
    fn next(&mut self) -> Option<Self::Item> {
3,491✔
3425
        loop {
3426
            if let Some((base, nb_node_id, _, _)) = self.node_edge_iter.next() {
14,428✔
3427
                let edge = if self.current_node > nb_node_id { (nb_node_id, self.current_node) } else { (self.current_node, nb_node_id) };
6,900✔
3428

3429
                if self.visited_edges.insert(edge) { return Some((self.current_node, self.current_dir, base, nb_node_id)); } // else simply skip and move on
6,900✔
3430

3431
            } else {
3432
                match self.current_dir {
7,528✔
3433
                    Dir::Left => {
3,764✔
3434
                        // no left edges, switch to right edges
3,764✔
3435
                        self.current_dir = Dir::Right;
3,764✔
3436
                        self.node_edge_iter = self.graph.get_node(self.current_node).r_edges().into_iter();
3,764✔
3437
                        
3,764✔
3438
                    }
3,764✔
3439
                    Dir::Right => {
3440
                        // no right edges, switch to next node left edges
3441
                        self.current_node += 1;
3,764✔
3442

3443
                        // quit if end of graph is reached
3444
                        if self.current_node == self.graph.len() { return None }
3,764✔
3445

3446
                        self.current_dir = Dir::Left;
3,723✔
3447
                        self.node_edge_iter = self.graph.get_node(self.current_node).l_edges().into_iter();
3,723✔
3448
                    }
3449
                }
3450
            }
3451
            
3452
        }
3453
    }
3,491✔
3454
}
3455

3456
#[cfg(test)]
3457
mod test {
3458
    use std::fs::remove_file;
3459

3460
    use crate::{BaseQuality, Exts, build_test_graph, colors::Colors, compression::{CheckCompress, ScmapCompress, compress_kmers_with_hash, uncompressed_graph}, dna_string::DnaString, filter::filter_kmers, kmer::{Kmer6, Kmer16, Kmer22}, reads::{Reads, ReadsPaired, Strandedness}, summarizer::{IDMapEMData, IDMapEMQualityData, IDTag, SampleInfo, SummaryConfig, TagsCountsData, TagsCountsSumData, Translator}, test::random_dna};
3461

3462
    use crate::{summarizer::SummaryData, Dir};
3463

3464

3465
    #[test]
3466
    #[cfg(not(feature = "sample128"))]
3467
    fn test_components() {
3468
        use crate::{kmer::Kmer16, test::build_test_graph};
3469

3470
        let (_, _, ser_graph) = build_test_graph::<Kmer16, TagsCountsSumData, _>();
3471
        let graph = ser_graph.graph();
3472

3473
        let components = graph.iter_components();
3474

3475
        let check_components = [
3476
            vec![0, 7, 43, 24, 47, 22, 37, 89, 25, 79, 63, 95, 64, 9, 96, 13, 11, 86, 74, 71, 92, 51, 94, 45, 12, 76, 21],
3477
            vec![1, 54, 44, 5, 57, 65, 84, 10, 58, 35, 42, 73, 30, 83, 77, 15, 80, 72, 81, 78, 67, 49, 69, 91, 2, 90, 33, 87, 55, 8, 17, 88, 31, 56, 52, 27, 4, 6, 99, 40, 93, 28, 26, 62, 59, 97, 82, 46],
3478
            vec![3, 41, 36, 34, 38, 85, 75, 19, 48, 16, 61, 66, 23, 20, 14, 18, 39, 29, 70, 32, 50, 53, 68, 60, 98],
3479
        ];
3480

3481
        let mut counter = 0;
3482

3483
        for component in components {
3484
            if component.len() > 1 { 
3485
                println!("component: {:?}", component);
3486
                assert_eq!(component, check_components[counter]);
3487
                counter += 1;
3488
            }
3489
        }
3490

3491
        assert_eq!(
3492
            vec![(88, Dir::Left), (17, Dir::Left), (8, Dir::Left), (55, Dir::Left), (87, Dir::Left), (33, Dir::Left), (90, Dir::Left), (2, Dir::Left), (91, Dir::Left), (69, Dir::Left), (49, Dir::Left), (78, Dir::Left), (81, Dir::Left), (72, Dir::Left), (1, Dir::Left), (54, Dir::Left), (44, Dir::Left), (5, Dir::Left), (57, Dir::Left)], 
3493
            graph.max_path(|data| data.sum().unwrap_or(1) as f32, |_| true)
3494
        );
3495
    }
3496

3497
    #[test]
3498
    fn test_iter_edges() {
1✔
3499
        use crate::{compression::uncompressed_graph, filter::filter_kmers, reads::{Reads, ReadsPaired}, summarizer::{SampleInfo, SummaryConfig, TagsData}};
3500

3501
        let read1 = "CAGCATCGATGCGACGAGCGCTCGCATCGA".as_bytes();
1✔
3502
        let read2 = "ACGATCGTACGTAGCTAGCTGACTGAGC".as_bytes();
1✔
3503

3504
        let mut reads = Reads::new(crate::reads::Strandedness::Forward);
1✔
3505
        reads.add_from_bytes(read1, None, 0u8);
1✔
3506
        reads.add_from_bytes(read2, None, 1);
1✔
3507

3508
        let reads_paired = ReadsPaired::Unpaired { reads };
1✔
3509

3510
        let sample_info = SampleInfo::new(0b1, 0b10, vec![12, 12]);
1✔
3511
        let summary_config = SummaryConfig::new(sample_info);
1✔
3512
        let (kmers, _) = filter_kmers::<TagsData, Kmer16, _>(&reads_paired, &summary_config, false, 1., false);
1✔
3513

3514
        let graph = uncompressed_graph(kmers, true).finish();
1✔
3515

3516
        let check_edges: Vec<(usize, Dir, u8, usize)> = vec![(0, Dir::Left, 2, 16), (0, Dir::Right, 1, 2), (1, Dir::Left, 0, 11), 
1✔
3517
        (1, Dir::Right, 0, 16), (3, Dir::Left, 0, 13), (3, Dir::Right, 3, 10), (4, Dir::Left, 2, 21), (4, Dir::Right, 1, 17), (5, Dir::Left, 1, 27), 
1✔
3518
        (5, Dir::Right, 2, 19), (6, Dir::Left, 1, 24), (6, Dir::Right, 2, 12), (7, Dir::Left, 2, 23), (7, Dir::Right, 3, 11), (8, Dir::Left, 0, 12), 
1✔
3519
        (9, Dir::Left, 3, 17), (9, Dir::Right, 3, 24), (10, Dir::Right, 1, 21), (13, Dir::Left, 1, 18), (14, Dir::Right, 0, 26), 
1✔
3520
        (15, Dir::Left, 2, 20), (15, Dir::Right, 3, 22), (18, Dir::Left, 2, 19), (20, Dir::Left, 1, 26), (22, Dir::Right, 2, 25), (23, Dir::Left, 1, 25)];
1✔
3521

3522
        let edges = graph.iter_edges().collect::<Vec<_>>();
1✔
3523

3524
        assert_eq!(check_edges, edges);
1✔
3525
    }
1✔
3526

3527
    #[test]
3528
    fn test_map_transcripts() {
1✔
3529
        let t_ref_path = "test_data/test_transcriptome_reference.fasta";
1✔
3530
        let (_, ser_kmers, _) = build_test_graph::<Kmer22, u32, _>();
1✔
3531
        let (kmers, mut translator, _) = ser_kmers.dissolve();
1✔
3532

3533
        let unc_graph = uncompressed_graph(kmers, true).finish();
1✔
3534

3535
        let mut id_strings = translator.id_translator().clone().unwrap().into_iter().map(|(name, _id)| name).collect::<Vec<_>>();
1✔
3536
        id_strings.sort();
1✔
3537

3538
        let t_map = unc_graph.map_transcripts(t_ref_path, &mut translator).unwrap();
1✔
3539
        assert_eq!(t_map.len(), unc_graph.len());
1✔
3540
        assert_eq!(t_map.iter().filter(|&ids| !ids.is_empty()).collect::<Vec<_>>().len(), 439);
487✔
3541

3542
        // repeat the same without a previous existing translator
3543
        let mut new_translator = Translator::empty();
1✔
3544
        let t_map = unc_graph.map_transcripts(t_ref_path, &mut new_translator).unwrap();
1✔
3545
        assert_eq!(t_map.len(), unc_graph.len());
1✔
3546
        assert_eq!(t_map.iter().filter(|&ids| !ids.is_empty()).collect::<Vec<_>>().len(), 439);
487✔
3547
        let mut new_id_strings = new_translator.id_translator().clone().unwrap().into_iter().map(|(name, _id)| name).collect::<Vec<_>>();
1✔
3548
        new_id_strings.sort();
1✔
3549

3550
        assert_eq!(id_strings, new_id_strings);
1✔
3551
    }
1✔
3552

3553

3554
    #[test]
3555
    fn test_map_transcripts_to_edges() {
1✔
3556
        let t_ref_path = "test_data/test_transcriptome_reference.fasta";
1✔
3557
        let (_, ser_kmers, _) = build_test_graph::<Kmer22, u32, _>();
1✔
3558
        let (kmers, mut translator, _) = ser_kmers.dissolve();
1✔
3559

3560
        let unc_graph = uncompressed_graph(kmers, true).finish();
1✔
3561

3562
        let mut id_strings = translator.id_translator().clone().unwrap().into_iter().map(|(name, _id)| name).collect::<Vec<_>>();
1✔
3563
        id_strings.sort();
1✔
3564

3565
        let t_map = unc_graph.map_transcripts_to_edges(t_ref_path, &mut translator).unwrap();
1✔
3566
        assert_eq!(t_map.len(), unc_graph.len());
1✔
3567
        assert_eq!(t_map.iter().filter(|&emap| !emap.is_empty()).collect::<Vec<_>>().len(), 439);
487✔
3568

3569
        // repeat the same without a previous existing translator
3570
        let mut new_translator = Translator::empty();
1✔
3571
        let t_map = unc_graph.map_transcripts(t_ref_path, &mut new_translator).unwrap();
1✔
3572
        assert_eq!(t_map.len(), unc_graph.len());
1✔
3573
        assert_eq!(t_map.iter().filter(|&ids| !ids.is_empty()).collect::<Vec<_>>().len(), 439);
487✔
3574
        let mut new_id_strings = new_translator.id_translator().clone().unwrap().into_iter().map(|(name, _id)| name).collect::<Vec<_>>();
1✔
3575
        new_id_strings.sort();
1✔
3576

3577
        assert_eq!(id_strings, new_id_strings);
1✔
3578
    }
1✔
3579

3580
    fn build_reads_quality_test(strand: Strandedness) -> ReadsPaired<IDTag> {
4✔
3581
        let correct1 = "CGATGCTGCTGATGCTGAGTCTGACGTATGCGATCGATCGACGATCGTACTAGCTGACTGTGCAGCTAGCTGACTGATCGTAGCTAGCTACGTGCTAGCTACTAGCACTGATGC";
4✔
3582
        let qu_corr1 = "CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC";
4✔
3583
        let incorrect1 =                                     "CGACGATCGTACTAGCTGACTGTGCAGCTAGCTGACTGATCGTGGCTAGCTACGTGCTAGCTA";
4✔
3584
        let qu_incorr1 =                                     "CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC-CCCCCCCCCCCCCCCCCCC";
4✔
3585
        let correct2 =         "GCATCGATCGACGACGTTACGTACGATCTACGTAGCTAGCTAGCTGACATGCTAGCTAGCTCTGACTGATCGTGGCTAGCTGACTGACTGTAGCT";
4✔
3586
        let qu_corr2 =         "CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC";
4✔
3587
        let incorrect2 = "ATGCTGCTGATGCTGAGTCTGACGTAGGCGATCGATCGACGATCGTACTAGCTGACT";
4✔
3588
        let qu_incorr2 = "CCCCCCCCCCCCCCCCCCCCCCCCCC-CCCCCCCCCCCCCCCCCCCCCCCCCCCCCC";
4✔
3589
        let incorrect3 = "ATGCTGCTGATGCTGACTCTGACGTAGGCGATCGATCGATGATCGTACTAGCTGACT";
4✔
3590
        let qu_incorr3 = "CCCCCCCCCCCCCCCC-CCCCCCCCC-CCCCCCCCCCCC-CCCCCCCCCCCCCCCCC";
4✔
3591

3592
        let incorrect4_ins =                 "GACGTATGCGATCGATCGACGATCGTACTAGCTGACTTGTGCAGCTAGCTGACTGAT";
4✔
3593
        let qu_incorr4_ins =                 "CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC-CCCCCCCCCCCCCCCCCCC";
4✔
3594

3595
        let incorrect5_tip =                                                             "AGCTGACTGATCGTAGCTAGCTACGTGCTAGCTACTATCACTGATGC";
4✔
3596
        let qu_incorr5_tip =                                                             "CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC-CCCCCCCCC";
4✔
3597

3598

3599
        let mut reads = Reads::new_with_quality(strand);
4✔
3600

3601
        for _i in 0..20 {
80✔
3602
            reads.add_read(DnaString::from_acgt_bytes(correct1.as_bytes()), None, IDTag::new(1, 0), Some(qu_corr1.as_bytes()));
80✔
3603
        }
80✔
3604

3605
        for _i in 0..40 {
160✔
3606
            reads.add_read(DnaString::from_acgt_bytes(correct2.as_bytes()), None, IDTag::new(2, 0), Some(qu_corr2.as_bytes()));
160✔
3607
        }
160✔
3608

3609
        reads.add_read(DnaString::from_acgt_bytes(incorrect1.as_bytes()), None, IDTag::new(3, 1), Some(qu_incorr1.as_bytes()));
4✔
3610
        reads.add_read(DnaString::from_acgt_bytes(incorrect2.as_bytes()), None, IDTag::new(4, 1), Some(qu_incorr2.as_bytes()));
4✔
3611
        reads.add_read(DnaString::from_acgt_bytes(incorrect3.as_bytes()), None, IDTag::new(5, 1), Some(qu_incorr3.as_bytes()));
4✔
3612
        reads.add_read(DnaString::from_acgt_bytes(incorrect4_ins.as_bytes()), None, IDTag::new(6, 1), Some(qu_incorr4_ins.as_bytes()));
4✔
3613
        reads.add_read(DnaString::from_acgt_bytes(incorrect5_tip.as_bytes()), None, IDTag::new(7, 1), Some(qu_incorr5_tip.as_bytes()));
4✔
3614

3615

3616
        ReadsPaired::Unpaired { reads }
4✔
3617
    }
4✔
3618

3619
    #[test]
3620
    fn test_remove_lq_splits_s() {
1✔
3621
        test_remove_lq_splits(true);
1✔
3622
    }
1✔
3623

3624
    #[test]
3625
    fn test_remove_lq_splits_us() {
1✔
3626
        test_remove_lq_splits(false);
1✔
3627
    }
1✔
3628

3629
    fn test_remove_lq_splits(stranded: bool) {
2✔
3630
        let print = false;
2✔
3631
        let strandedness = if stranded { Strandedness::Forward } else { Strandedness::Unstranded };
2✔
3632

3633

3634
        type K = Kmer16;
3635

3636
        let seqs = build_reads_quality_test(strandedness);
2✔
3637
        let sample_info = SampleInfo::new(1, 0b111110, vec![1000, 10, 20, 20, 20, 20]);
2✔
3638
        let summary_config = SummaryConfig::new(sample_info);
2✔
3639
        let (kmers, _) = filter_kmers::<IDMapEMQualityData, K, IDTag>(&seqs, &summary_config, false, 1., false);
2✔
3640

3641

3642
        // make uncompressed graph
3643
        let mut unc_graph = uncompressed_graph(kmers.clone(), stranded).finish();
2✔
3644

3645
        // add "mapped" ids to graph
3646
        for i in 0..unc_graph.len() {
526✔
3647
            let data = unc_graph.mut_data(i);
526✔
3648
            if let Some(ids) = data.ids() {
526✔
3649
                if ids.contains(&1) {
526✔
3650
                    data.set_mapped_ids(vec![1].into());
198✔
3651
                }
198✔
3652
                else if ids.contains(&2) {
328✔
3653
                    data.set_mapped_ids(vec![2].into());
160✔
3654
                }
168✔
3655
            }
×
3656
        }
3657

3658
        let colors = Colors::new(&unc_graph, &summary_config, crate::colors::ColorMode::IDS { n_ids: 7 });
2✔
3659

3660
        if print { unc_graph.to_dot("uncompressed_bf-lq-splits.dot", &|node| node.node_dot_default(&colors, &summary_config, &Translator::empty(), false, false), &|node, base, dir, flip| node.edge_dot_default(&colors, base, dir, flip)); }
2✔
3661
        let n_edges = unc_graph.iter_edges().count();
2✔
3662
        unc_graph.remove_lq_splits(BaseQuality::Medium).unwrap();
2✔
3663
        if print { unc_graph.to_dot("uncompressed_af-lq-splits.dot", &|node| node.node_dot_default(&colors, &summary_config, &Translator::empty(), false, false), &|node, base, dir, flip| node.edge_dot_default(&colors, base, dir, flip)); }
2✔
3664
    
3665
        let n_edges_af = unc_graph.iter_edges().count();
2✔
3666
        assert_eq!(n_edges, n_edges_af + 11);
2✔
3667

3668
        // make compressed graph
3669
        let spec = CheckCompress::new(|d: IDMapEMQualityData, _| d, |d, d1| d.join_test(d1));
518✔
3670
        let mut c_graph = compress_kmers_with_hash(stranded, &spec, kmers, false, false).finish();
2✔
3671
    
3672
         // add "mapped" ids to graph
3673
        for i in 0..c_graph.len() {
56✔
3674
            let data = c_graph.mut_data(i);
56✔
3675
            if let Some(ids) = data.ids() {
56✔
3676
                if ids.contains(&1) {
56✔
3677
                    data.set_mapped_ids(vec![1].into());
32✔
3678
                }
32✔
3679
                else if ids.contains(&2) {
24✔
3680
                    data.set_mapped_ids(vec![2].into());
6✔
3681
                }
18✔
3682
            }
×
3683
        }
3684

3685
        let colors = Colors::new(&c_graph, &summary_config, crate::colors::ColorMode::IDS { n_ids: 7 });
2✔
3686

3687
        if print { c_graph.to_dot("compressed_bf-lq-splits.dot", &|node| node.node_dot_default(&colors, &summary_config, &Translator::empty(), false, false), &|node, base, dir, flip| node.edge_dot_default(&colors, base, dir, flip)); }
2✔
3688
        let n_edges = c_graph.iter_edges().count();
2✔
3689
        c_graph.remove_lq_splits(BaseQuality::Medium).unwrap();
2✔
3690
        if print { c_graph.to_dot("compressed_af-lq-splits.dot", &|node| node.node_dot_default(&colors, &summary_config, &Translator::empty(), false, false), &|node, base, dir, flip| node.edge_dot_default(&colors, base, dir, flip)); }
2✔
3691
    
3692
        let n_edges_af = c_graph.iter_edges().count();
2✔
3693
        assert_eq!(n_edges, n_edges_af + 11);
2✔
3694
    }
2✔
3695

3696
    #[test]
3697
    fn test_remove_lq_paths_s() {
1✔
3698
        test_remove_lq_paths(true);
1✔
3699
    }
1✔
3700

3701
    #[test]
3702
    fn test_remove_lq_paths_us() {
1✔
3703
        test_remove_lq_paths(false);
1✔
3704
    }
1✔
3705

3706
    fn test_remove_lq_paths(stranded: bool) {
2✔
3707
        let print = false;
2✔
3708
        let strandedness = if stranded { Strandedness::Forward } else { Strandedness::Unstranded };
2✔
3709

3710
        type K = Kmer16;
3711

3712
        let seqs = build_reads_quality_test(strandedness);
2✔
3713
        let sample_info = SampleInfo::new(1, 0b111110, vec![1000, 10, 20, 20, 20, 20]);
2✔
3714
        let summary_config = SummaryConfig::new(sample_info);
2✔
3715
        let (kmers, _) = filter_kmers::<IDMapEMQualityData, K, IDTag>(&seqs, &summary_config, false, 1., false);
2✔
3716

3717

3718
        // make uncompressed graph
3719
        let mut unc_graph = uncompressed_graph(kmers.clone(), stranded).finish();
2✔
3720

3721
        // add "mapped" ids to graph
3722
        for i in 0..unc_graph.len() {
526✔
3723
            let data = unc_graph.mut_data(i);
526✔
3724
            if let Some(ids) = data.ids() {
526✔
3725
                if ids.contains(&1) {
526✔
3726
                    data.set_mapped_ids(vec![1].into());
198✔
3727
                }
198✔
3728
                else if ids.contains(&2) {
328✔
3729
                    data.set_mapped_ids(vec![2].into());
160✔
3730
                }
168✔
3731
            }
×
3732
        }
3733

3734
        let colors = Colors::new(&unc_graph, &summary_config, crate::colors::ColorMode::IDS { n_ids: 7 });
2✔
3735

3736
        if print { unc_graph.to_dot("uncompressed_bf-lq.dot", &|node| node.node_dot_default(&colors, &summary_config, &Translator::empty(), false, false), &|node, base, dir, flip| node.edge_dot_default(&colors, base, dir, flip)); }
2✔
3737
        let n_edges = unc_graph.iter_edges().count();
2✔
3738
        unc_graph.remove_lq_paths(BaseQuality::Medium, 4).unwrap();
2✔
3739
        if print { unc_graph.to_dot("uncompressed_af-lq.dot", &|node| node.node_dot_default(&colors, &summary_config, &Translator::empty(), false, false), &|node, base, dir, flip| node.edge_dot_default(&colors, base, dir, flip)); }
2✔
3740
    
3741
        let n_edges_af = unc_graph.iter_edges().count();
2✔
3742
        assert_eq!(n_edges, n_edges_af + 90);
2✔
3743

3744
        // make compressed graph
3745
        let spec = CheckCompress::new(|d: IDMapEMQualityData, _| d, |d, d1| d.join_test(d1));
518✔
3746
        let mut c_graph = compress_kmers_with_hash(stranded, &spec, kmers, false, false).finish();
2✔
3747
    
3748
         // add "mapped" ids to graph
3749
        for i in 0..c_graph.len() {
56✔
3750
            let data = c_graph.mut_data(i);
56✔
3751
            if let Some(ids) = data.ids() {
56✔
3752
                if ids.contains(&1) {
56✔
3753
                    data.set_mapped_ids(vec![1].into());
32✔
3754
                }
32✔
3755
                else if ids.contains(&2) {
24✔
3756
                    data.set_mapped_ids(vec![2].into());
6✔
3757
                }
18✔
3758
            }
×
3759
        }
3760

3761
        let colors = Colors::new(&c_graph, &summary_config, crate::colors::ColorMode::IDS { n_ids: 7 });
2✔
3762

3763
        if print { c_graph.to_dot("compressed_bf-lq.dot", &|node| node.node_dot_default(&colors, &summary_config, &Translator::empty(), false, false), &|node, base, dir, flip| node.edge_dot_default(&colors, base, dir, flip)); }
2✔
3764
        let n_edges = c_graph.iter_edges().count();
2✔
3765
        c_graph.remove_lq_paths(BaseQuality::Medium, 4).unwrap();
2✔
3766
        if print { c_graph.to_dot("compressed_af-lq.dot", &|node| node.node_dot_default(&colors, &summary_config, &Translator::empty(), false, false), &|node, base, dir, flip| node.edge_dot_default(&colors, base, dir, flip)); }
2✔
3767
    
3768
        let n_edges_af = c_graph.iter_edges().count();
2✔
3769
        assert_eq!(n_edges, n_edges_af + 15);
2✔
3770
    }
2✔
3771

3772
    #[test]
3773
    fn test_remove_lc_paths_s() {
1✔
3774
        test_remove_lc_paths(true);
1✔
3775
    }
1✔
3776

3777
    #[test]
3778
    fn test_remove_lc_paths_us() {
1✔
3779
        test_remove_lc_paths(false);
1✔
3780
    }
1✔
3781
    
3782
    fn test_remove_lc_paths(stranded: bool) {
2✔
3783
        let print = true; 
2✔
3784
        let strandedness = if stranded { Strandedness::Forward } else { Strandedness::Unstranded };
2✔
3785

3786
        let (n_diff_uc, n_diff_c) = if stranded { (27, 8) } else { (27, 10) };
2✔
3787

3788
        let   correct = "ACGATCGATCGCGATCGTAGCTGACTGCTGACGTCTGACTACTGACTGATGCTAGCTATCGTGAC".as_bytes();
2✔
3789
        let incorrect = "ACGATCGATCGCGATCGTAGCTGACTGCTGACGGCTGACTACTGACTGATGCTAGCTATCGTGAC".as_bytes();
2✔
3790
        let incorrec2 = "TGACAGCTGACGGCTGACTACTACGTCACTGACGATGCTGACAC".as_bytes();
2✔
3791
        let incorrec3 = "AAAAAAAAAGCTGACTGCTGACGGCTG".as_bytes();
2✔
3792
        let incorrec4 = "ACGGCTGACTACTGACTGAAAAAAAAAAA".as_bytes();
2✔
3793

3794
        let insertion = "ACGATCGATCGCGATCGATAGCTGACTGCTGACGTCTGACTACTGACTGATGCTAGCTATCGTGAC".as_bytes();
2✔
3795

3796
        let mut reads = Reads::new(strandedness);
2✔
3797
        for _i in 0..1000 {
2,000✔
3798
            reads.add_from_bytes(correct, None, IDTag::new(0, 0));
2,000✔
3799
        }
2,000✔
3800

3801
        for _i in 0..2 {
4✔
3802
            reads.add_from_bytes(incorrect, None, IDTag::new(1, 1)); // should be removed
4✔
3803
        }
4✔
3804

3805
        for _i in 0..15 {
30✔
3806
            reads.add_from_bytes(incorrec2, None, IDTag::new(2, 2)); // should be removed
30✔
3807
        }
30✔
3808

3809
        for _i in 0..25 {
50✔
3810
            reads.add_from_bytes(incorrec3, None, IDTag::new(3, 3)); // should be removed
50✔
3811
        }
50✔
3812

3813
        for _i in 0..30 {
60✔
3814
            reads.add_from_bytes(incorrec4, None, IDTag::new(4, 4)); // should be removed
60✔
3815
        }
60✔
3816

3817

3818
        for _i in 0..1 {
2✔
3819
            reads.add_from_bytes(insertion, None, IDTag::new(1, 3)); // should not be removed
2✔
3820
        }
2✔
3821

3822
        let seqs = ReadsPaired::Unpaired { reads };
2✔
3823
        let sample_info = SampleInfo::new(1, 0b111110, vec![1000, 10, 20, 20, 20, 20]);
2✔
3824
        let summary_config = SummaryConfig::new(sample_info);
2✔
3825
        let (kmers, _) = filter_kmers::<IDMapEMData, Kmer16, IDTag>(&seqs, &summary_config, false, 1., false);
2✔
3826

3827

3828
        // test with uncompressed graph
3829
        let mut unc_graph = uncompressed_graph(kmers.clone(), stranded).finish();
2✔
3830
        // add ids to graph
3831
        for i in 0..unc_graph.len() {
254✔
3832
            let data = unc_graph.mut_data(i);
254✔
3833
            if let Some(ids) = data.ids() {
254✔
3834
                if ids.contains(&0) {
254✔
3835
                    data.set_mapped_ids(vec![0].into());
100✔
3836
                }
154✔
NEW
3837
            }
×
3838
        }
3839
        let colors = Colors::new(&unc_graph, &summary_config, crate::colors::ColorMode::IDS { n_ids: 5 });
2✔
3840
        if print { unc_graph.to_dot("uncompressed_bf-lcp.dot", &|node| node.node_dot_default(&colors, &summary_config, &Translator::empty(), false, false), &|node, base, dir, flip| node.edge_dot_default(&colors, base, dir, flip)); }
512✔
3841
        let n_edges = unc_graph.iter_edges().count();
2✔
3842
        unc_graph.remove_lc_paths(10, 10, 10.).unwrap();
2✔
3843
        if print { unc_graph.to_dot("uncompressed_af-lcp.dot", &|node| node.node_dot_default(&colors, &summary_config, &Translator::empty(), false, false), &|node, base, dir, flip| node.edge_dot_default(&colors, base, dir, flip)); }
404✔
3844
        assert_eq!(n_edges - n_diff_uc, unc_graph.iter_edges().count());
2✔
3845

3846
        // test with compressed graph
3847
        let spec = CheckCompress::new(|d: IDMapEMData, _| d, |d, d1| d.join_test(d1));
245✔
3848
        let mut c_graph = compress_kmers_with_hash(stranded, &spec, kmers, false, false).finish();
2✔
3849
        // add ids to graph
3850
        for i in 0..c_graph.len() {
32✔
3851
            let data = c_graph.mut_data(i);
32✔
3852
            if let Some(ids) = data.ids() {
32✔
3853
                if ids.contains(&0) {
32✔
3854
                    data.set_mapped_ids(vec![0].into());
10✔
3855
                }
22✔
NEW
3856
            }
×
3857
        }
3858
        let colors = Colors::new(&c_graph, &summary_config, crate::colors::ColorMode::IDS { n_ids: 5 });
2✔
3859
        if print { c_graph.to_dot("compressed_bf-lcp.dot", &|node| node.node_dot_default(&colors, &summary_config, &Translator::empty(), false, false), &|node, base, dir, flip| node.edge_dot_default(&colors, base, dir, flip)); }
68✔
3860
        let n_edges = c_graph.iter_edges().count();
2✔
3861
        c_graph.remove_lc_paths(10, 10, 10.).unwrap();
2✔
3862
        if print { c_graph.to_dot("compressed_af-lcp.dot", &|node| node.node_dot_default(&colors, &summary_config, &Translator::empty(), false, false), &|node, base, dir, flip| node.edge_dot_default(&colors, base, dir, flip)); }
32✔
3863
        assert_eq!(n_edges - n_diff_c, c_graph.iter_edges().count());
2✔
3864
    }
2✔
3865

3866
    #[test]
3867
    fn test_remove_ladders_s() {
1✔
3868
        test_remove_ladders(true);
1✔
3869
    }
1✔
3870

3871
    #[test]
3872
    fn test_remove_ladders_us() {
1✔
3873
        test_remove_ladders(false);
1✔
3874
    }
1✔
3875

3876
    fn test_remove_ladders(stranded: bool) {
2✔
3877
        let print = true; 
2✔
3878
        let strandedness = if stranded { Strandedness::Forward } else { Strandedness::Unstranded };
2✔
3879

3880
        let c_csv = if print { Some("c_ladders.csv") } else { None };
2✔
3881
        let uc_csv = if print { Some("uc_ladders.csv") } else { None };
2✔
3882

3883
        let   correct = "ACGATCGATCGCGATCGTAGCTGACTGCTGACGTCTGACTACTGACTGATGCTAGCTATCGTGAC".as_bytes();
2✔
3884
        let incorrect = "ACGATCGATCGCGATCGTAGCTGACTGCTGACGGCTGACTACTGACTGATGCTAGCTATCGTGAC".as_bytes();
2✔
3885
        let incorrec2 = "TGACAGCTGACGGCTGACTACTACGTCACTGACGATGCTGACAC".as_bytes();
2✔
3886
        let incorrec3 = "AAAAAAAAAGCTGACTGCTGACGGCTG".as_bytes();
2✔
3887
        let incorrec4 = "ACGGCTGACTACTGACTGAAAAAAAAAAA".as_bytes();
2✔
3888

3889
        let insertion = "ACGATCGATCGCGATCGATAGCTGACTGCTGACGTCTGACTACTGACTGATGCTAGCTATCGTGAC".as_bytes();
2✔
3890

3891
        let mut reads = Reads::new(strandedness);
2✔
3892
        for _i in 0..1000 {
2,000✔
3893
            reads.add_from_bytes(correct, None, IDTag::new(0, 0));
2,000✔
3894
        }
2,000✔
3895

3896
        for _i in 0..2 {
4✔
3897
            reads.add_from_bytes(incorrect, None, IDTag::new(1, 1)); // should be removed
4✔
3898
        }
4✔
3899

3900
        for _i in 0..15 {
30✔
3901
            reads.add_from_bytes(incorrec2, None, IDTag::new(2, 2)); // should be removed
30✔
3902
        }
30✔
3903

3904
        for _i in 0..25 {
50✔
3905
            reads.add_from_bytes(incorrec3, None, IDTag::new(3, 3)); // should be removed
50✔
3906
        }
50✔
3907

3908
        for _i in 0..30 {
60✔
3909
            reads.add_from_bytes(incorrec4, None, IDTag::new(4, 4)); // should be removed
60✔
3910
        }
60✔
3911

3912

3913
        for _i in 0..1 {
2✔
3914
            reads.add_from_bytes(insertion, None, IDTag::new(1, 3)); // should not be removed
2✔
3915
        }
2✔
3916

3917
        let seqs = ReadsPaired::Unpaired { reads };
2✔
3918
        let sample_info = SampleInfo::new(1, 0b111110, vec![1000, 10, 20, 20, 20, 20]);
2✔
3919
        let summary_config = SummaryConfig::new(sample_info);
2✔
3920
        let (kmers, _) = filter_kmers::<IDMapEMData, Kmer16, IDTag>(&seqs, &summary_config, false, 1., false);
2✔
3921

3922

3923
        // test with uncompressed graph
3924
        let mut unc_graph = uncompressed_graph(kmers.clone(), stranded).finish();
2✔
3925
        // add ids to graph
3926
        for i in 0..unc_graph.len() {
254✔
3927
            let data = unc_graph.mut_data(i);
254✔
3928
            if let Some(ids) = data.ids() {
254✔
3929
                if ids.contains(&0) {
254✔
3930
                    data.set_mapped_ids(vec![0].into());
100✔
3931
                }
154✔
3932
            }
×
3933
        }
3934
        let colors = Colors::new(&unc_graph, &summary_config, crate::colors::ColorMode::IDS { n_ids: 5 });
2✔
3935
        if print { unc_graph.to_dot("uncompressed_bf-lad.dot", &|node| node.node_dot_default(&colors, &summary_config, &Translator::empty(), false, false), &|node, base, dir, flip| node.edge_dot_default(&colors, base, dir, flip)); }
512✔
3936
        let n_edges = unc_graph.iter_edges().count();
2✔
3937
        unc_graph.remove_ladders(10, 10., uc_csv).unwrap();
2✔
3938
        if print { unc_graph.to_dot("uncompressed_af-lad.dot", &|node| node.node_dot_default(&colors, &summary_config, &Translator::empty(), false, false), &|node, base, dir, flip| node.edge_dot_default(&colors, base, dir, flip)); }
472✔
3939
        assert_eq!(n_edges - 10, unc_graph.iter_edges().count());
2✔
3940

3941
        // test with compressed graph
3942
        let spec = CheckCompress::new(|d: IDMapEMData, _| d, |d, d1| d.join_test(d1));
245✔
3943
        let mut c_graph = compress_kmers_with_hash(stranded, &spec, kmers, false, false).finish();
2✔
3944
        // add ids to graph
3945
        for i in 0..c_graph.len() {
32✔
3946
            let data = c_graph.mut_data(i);
32✔
3947
            if let Some(ids) = data.ids() {
32✔
3948
                if ids.contains(&0) {
32✔
3949
                    data.set_mapped_ids(vec![0].into());
10✔
3950
                }
22✔
3951
            }
×
3952
        }
3953
        let colors = Colors::new(&c_graph, &summary_config, crate::colors::ColorMode::IDS { n_ids: 5 });
2✔
3954
        if print { c_graph.to_dot("compressed_bf-lad.dot", &|node| node.node_dot_default(&colors, &summary_config, &Translator::empty(), false, false), &|node, base, dir, flip| node.edge_dot_default(&colors, base, dir, flip)); }
68✔
3955
        let n_edges = c_graph.iter_edges().count();
2✔
3956
        c_graph.remove_ladders(10, 10., c_csv).unwrap();
2✔
3957
        if print { c_graph.to_dot("compressed_af-lad.dot", &|node| node.node_dot_default(&colors, &summary_config, &Translator::empty(), false, false), &|node, base, dir, flip| node.edge_dot_default(&colors, base, dir, flip)); }
44✔
3958
        assert_eq!(n_edges - 6, c_graph.iter_edges().count());
2✔
3959
    }
2✔
3960

3961

3962
    #[test]
3963
    fn test_remove_tips_s() {
1✔
3964
        test_remove_tips(true);
1✔
3965
    }
1✔
3966

3967
    #[test]
3968
    fn test_remove_tips_us() {
1✔
3969
        test_remove_tips(false);
1✔
3970
    }
1✔
3971

3972
    fn test_remove_tips(stranded: bool) {
2✔
3973
        let print = false;
2✔
3974
        let strandedness = if stranded { Strandedness::Forward } else { Strandedness::Unstranded };
2✔
3975

3976
        let c_csv = if print { Some("c_tips.csv") } else { None };
2✔
3977
        let uc_csv = if print { Some("uc_tips.csv") } else { None };
2✔
3978

3979
        let     correct = "ACGATCGATCGCGATCGTAGCTGACTGCTGACGTCTGACTACTGACTGATGCTAGCTATCGTGAC".as_bytes();
2✔
3980
        let incorrect_r = "ACGATCGATCGCGATCGTAGCTGACTGCTGACGTCTGACTACTGACTGATGCTAGCTAACGTGAC".as_bytes();
2✔
3981
        let incorrect_l = "ACGTTCGATCGCGATCGTAGCTGACTGCTGACGTCTGACTACTGACTGATGCTAGCTATCGTGAC".as_bytes();
2✔
3982

3983

3984

3985
        let mut reads = Reads::new(strandedness);
2✔
3986
        for _i in 0..1000 {
2,000✔
3987
            reads.add_from_bytes(correct, None, IDTag::new(0, 0));
2,000✔
3988
        }
2,000✔
3989

3990
        for _i in 0..10 {
20✔
3991
            reads.add_from_bytes(incorrect_r, None, IDTag::new(1, 1)); // should be removed
20✔
3992
        }
20✔
3993

3994
        for _i in 0..10 {
20✔
3995
            reads.add_from_bytes(incorrect_l, None, IDTag::new(2, 2)); // should be removed
20✔
3996
        }
20✔
3997

3998
        let seqs = ReadsPaired::Unpaired { reads };
2✔
3999
        let sample_info = SampleInfo::new(1, 6, vec![1000, 10, 20]);
2✔
4000
        let summary_config = SummaryConfig::new(sample_info);
2✔
4001
        let (kmers, _) = filter_kmers::<IDMapEMData, Kmer16, IDTag>(&seqs, &summary_config, false, 1., false);
2✔
4002

4003

4004
        // test with uncompressed graph
4005
        let mut unc_graph = uncompressed_graph(kmers.clone(), stranded).finish();
2✔
4006
        // add ids to graph
4007
        for i in 0..unc_graph.len() {
122✔
4008
            let data = unc_graph.mut_data(i);
122✔
4009
            if let Some(ids) = data.ids() {
122✔
4010
                if ids.contains(&0) {
122✔
4011
                    data.set_mapped_ids(vec![0].into());
100✔
4012
                }
100✔
4013
            }
×
4014
        }
4015

4016
        let colors = Colors::new(&unc_graph, &summary_config, crate::colors::ColorMode::IDS { n_ids: 3 });
2✔
4017
        if print { unc_graph.to_dot("uncompressed_bf-tips.dot", &|node| node.node_dot_default(&colors, &summary_config, &Translator::empty(), false, false), &|node, base, dir, flip| node.edge_dot_default(&colors, base, dir, flip)); }
2✔
4018
        let n_edges = unc_graph.iter_edges().count();
2✔
4019
        
4020
        unc_graph.remove_tips(10, 10., uc_csv).unwrap();
2✔
4021
        if print { unc_graph.to_dot("uncompressed_af-tips.dot", &|node| node.node_dot_default(&colors, &summary_config, &Translator::empty(), false, false), &|node, base, dir, flip| node.edge_dot_default(&colors, base, dir, flip)); }
2✔
4022
        assert_eq!(n_edges - 11, unc_graph.iter_edges().count());
2✔
4023

4024
        // test with compressed graph
4025
        let spec = CheckCompress::new(|d: IDMapEMData, _| d, |d, d1| d.join_test(d1));
113✔
4026
        let mut c_graph = compress_kmers_with_hash(stranded, &spec, kmers, false, false).finish();
2✔
4027
        // add ids to graph
4028
        for i in 0..c_graph.len() {
10✔
4029
            let data = c_graph.mut_data(i);
10✔
4030
            if let Some(ids) = data.ids() {
10✔
4031
                if ids.contains(&0) {
10✔
4032
                    data.set_mapped_ids(vec![0].into());
6✔
4033
                }
6✔
4034
            }
×
4035
        }
4036

4037
        let colors = Colors::new(&c_graph, &summary_config, crate::colors::ColorMode::IDS { n_ids: 3 });
2✔
4038
        if print { c_graph.to_dot("compressed_bf-tips.dot", &|node| node.node_dot_default(&colors, &summary_config, &Translator::empty(), false, false), &|node, base, dir, flip| node.edge_dot_default(&colors, base, dir, flip)); }
2✔
4039
        let n_edges = c_graph.iter_edges().count();
2✔
4040
        
4041
        c_graph.remove_tips(10, 10., c_csv).unwrap();
2✔
4042
        if print { c_graph.to_dot("compressed_af-tips.dot", &|node| node.node_dot_default(&colors, &summary_config, &Translator::empty(), false, false), &|node, base, dir, flip| node.edge_dot_default(&colors, base, dir, flip)); }
2✔
4043
        assert_eq!(n_edges - 2, c_graph.iter_edges().count());
2✔
4044

4045
    }
2✔
4046

4047
    #[test]
4048
    fn test_to_tsv() {
1✔
4049
        let reads_us = Reads::from_vmer_vec(
1✔
4050
            (0..10).map(|i| (DnaString::from_bytes(&random_dna(100)), Exts::empty(), i as u8)).collect::<Vec<_>>(), 
10✔
4051
            crate::reads::Strandedness::Unstranded
1✔
4052
        );
4053

4054
        let reads_paired = ReadsPaired::Unpaired { reads: reads_us };
1✔
4055

4056
        let sample_info = SampleInfo::new(0b1111100000, 0b0000011111, vec![1, 2, 3, 4, 5, 6, 7, 8, 9, 10]);
1✔
4057
        let summary_config = SummaryConfig::new(sample_info);
1✔
4058
        let (kmers, _) = filter_kmers::<TagsCountsData, Kmer6, _>(&reads_paired, &summary_config, false, 1., false);
1✔
4059

4060
        let graph = compress_kmers_with_hash(false, &ScmapCompress::new(), kmers, false, false).finish();
1✔
4061

4062
        graph.to_tsv("test_graph_unstranded.tsv", |node| node.data().print_ol(&Translator::empty(), &summary_config, None)).unwrap();
319✔
4063

4064

4065
        let reads_us = Reads::from_vmer_vec(
1✔
4066
            (0..10).map(|i| (DnaString::from_bytes(&random_dna(100)), Exts::empty(), i as u8)).collect::<Vec<_>>(), 
10✔
4067
            crate::reads::Strandedness::Forward
1✔
4068
        );
4069

4070
        let reads_paired = ReadsPaired::Unpaired { reads: reads_us };
1✔
4071

4072
        let sample_info = SampleInfo::new(0b1111100000, 0b0000011111, vec![1, 2, 3, 4, 5, 6, 7, 8, 9, 10]);
1✔
4073
        let summary_config = SummaryConfig::new(sample_info);
1✔
4074
        let (kmers, _) = filter_kmers::<TagsCountsData, Kmer6, _>(&reads_paired, &summary_config, false, 1., false);
1✔
4075

4076
        let graph = compress_kmers_with_hash(true, &ScmapCompress::new(), kmers, false, false).finish();
1✔
4077

4078
        graph.to_tsv("test_graph_stranded.tsv", |node| node.data().print_ol(&Translator::empty(), &summary_config, None)).unwrap();
208✔
4079
    
4080
        remove_file("test_graph_unstranded.tsv").unwrap();
1✔
4081
        remove_file("test_graph_stranded.tsv").unwrap();
1✔
4082
    }
1✔
4083
}
4084

STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2026 Coveralls, Inc