• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

jlab / rust-debruijn / 16652259051

31 Jul 2025 02:46PM UTC coverage: 87.212% (-0.08%) from 87.287%
16652259051

Pull #27

github

evolp
add DebruijnGraph::remove_tips
Pull Request #27: more `Kmer`s, transcript mapping, edge filters

845 of 936 new or added lines in 7 files covered. (90.28%)

7 existing lines in 3 files now uncovered.

7318 of 8391 relevant lines covered (87.21%)

4517901.38 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

79.47
/src/graph.rs
1
// Copyright 2017 10x Genomics
2

3
//! Containers for path-compressed De Bruijn graphs
4

5
use bimap::BiHashMap;
6
use bio::io::fasta;
7
use bit_set::BitSet;
8
use indicatif::ProgressBar;
9
use indicatif::ProgressIterator;
10
use indicatif::ProgressStyle;
11
use itertools::enumerate;
12
use log::{debug, trace};
13
use rayon::prelude::*;
14
use rayon::current_num_threads;
15
use serde_derive::{Deserialize, Serialize};
16
use smallvec::SmallVec;
17
use std::borrow::Borrow;
18

19
use std::collections::HashSet;
20
use std::collections::VecDeque;
21
use std::f32;
22
use std::fmt::{self, Debug, Display};
23
use std::fs::{remove_file, File};
24
use std::hash::Hash;
25
use std::io::BufWriter;
26
use std::io::{BufReader, Error, Read};
27
use std::io::Write;
28
use std::iter::FromIterator;
29
use std::marker::PhantomData;
30
use std::path::Path;
31
use std::process::id;
32

33
use boomphf::hashmap::BoomHashMap;
34

35
use serde_json;
36
use serde_json::Value;
37

38
type SmallVec4<T> = SmallVec<[T; 4]>;
39
type SmallVec8<T> = SmallVec<[T; 8]>;
40

41
use crate::bits_to_base;
42
use crate::colors::ColorMode;
43
use crate::colors::Colors;
44
use crate::compression::CompressionSpec;
45
use crate::dna_string::{DnaString, DnaStringSlice, PackedDnaStringSet};
46
use crate::summarizer::SummaryConfig;
47
use crate::summarizer::SummaryData;
48
use crate::summarizer::Translator;
49
use crate::summarizer::ID;
50
use crate::BUF;
51
use crate::PROGRESS_STYLE;
52
use crate::{Dir, Exts, Kmer, Mer, Vmer};
53

54
/// A compressed DeBruijn graph carrying auxiliary data on each node of type `D`.
55
/// This type does not carry the sorted index arrays the allow the graph
56
/// to be walked efficiently. The `DeBruijnGraph` type wraps this type and add those
57
/// vectors.
58
#[derive(Serialize, Deserialize, Clone, Debug)]
59
pub struct BaseGraph<K, D> {
60
    pub sequences: PackedDnaStringSet,
61
    pub exts: Vec<Exts>,
62
    pub data: Vec<D>,
63
    pub stranded: bool,
64
    phantom: PhantomData<K>,
65
}
66

67
impl<K, D> BaseGraph<K, D> {
68
    pub fn new(stranded: bool) -> Self {
2,199✔
69
        BaseGraph {
2,199✔
70
            sequences: PackedDnaStringSet::new(),
2,199✔
71
            exts: Vec::new(),
2,199✔
72
            data: Vec::new(),
2,199✔
73
            phantom: PhantomData,
2,199✔
74
            stranded,
2,199✔
75
        }
2,199✔
76
    }
2,199✔
77

78
    pub fn len(&self) -> usize {
806,499✔
79
        self.sequences.len()
806,499✔
80
    }
806,499✔
81

82
    pub fn is_empty(&self) -> bool {
1✔
83
        self.sequences.is_empty()
1✔
84
    }
1✔
85

86
    pub fn combine<I: Iterator<Item = BaseGraph<K, D>>>(graphs: I) -> Self {
12✔
87
        let mut sequences = PackedDnaStringSet::new();
12✔
88
        let mut exts = Vec::new();
12✔
89
        let mut data = Vec::new();
12✔
90
        let mut stranded = Vec::new();
12✔
91

92
        for g in graphs {
1,514✔
93
            for s in 0..g.sequences.len() {
6,619✔
94
                sequences.add(&g.sequences.get(s));
6,619✔
95
            }
6,619✔
96

97
            exts.extend(g.exts);
1,502✔
98
            data.extend(g.data);
1,502✔
99
            stranded.push(g.stranded);
1,502✔
100
        }
101

102
        let out_stranded = stranded.iter().all(|x| *x);
12✔
103

104
        if !out_stranded && !stranded.iter().all(|x| !*x) {
1,502✔
105
            panic!("attempted to combine stranded and unstranded graphs");
×
106
        }
12✔
107

108
        BaseGraph {
12✔
109
            sequences,
12✔
110
            stranded: out_stranded,
12✔
111
            exts,
12✔
112
            data,
12✔
113
            phantom: PhantomData,
12✔
114
        }
12✔
115
    }
12✔
116
}
117

118
impl<K: Kmer, D> BaseGraph<K, D> {
119
    pub fn add<R: Borrow<u8>, S: IntoIterator<Item = R>>(
1,548,498✔
120
        &mut self,
1,548,498✔
121
        sequence: S,
1,548,498✔
122
        exts: Exts,
1,548,498✔
123
        data: D,
1,548,498✔
124
    ) {
1,548,498✔
125
        self.sequences.add(sequence);
1,548,498✔
126
        self.exts.push(exts);
1,548,498✔
127
        self.data.push(data);
1,548,498✔
128
    }
1,548,498✔
129
}
130

131
impl<K: Kmer + Send + Sync, D> BaseGraph<K, D> {
132
    pub fn finish(self) -> DebruijnGraph<K, D> {
586✔
133
        let indices: Vec<u32> = (0..self.len() as u32).collect();
586✔
134

135
        let left_order = {
586✔
136
            let mut kmers: Vec<K> = Vec::with_capacity(self.len());
586✔
137
            for idx in &indices {
822,399✔
138
                kmers.push(self.sequences.get(*idx as usize).first_kmer());
821,813✔
139
                
821,813✔
140
            }
821,813✔
141
            
142
            BoomHashMap::new_parallel(kmers, indices.clone())
586✔
143
        };
144

145
        let right_order = {
586✔
146
            let mut kmers: Vec<K> = Vec::with_capacity(self.len());
586✔
147
            for idx in &indices {
822,399✔
148
                kmers.push(self.sequences.get(*idx as usize).last_kmer());
821,813✔
149
            }
821,813✔
150
            
151
            BoomHashMap::new_parallel(kmers, indices)
586✔
152
        };
153
        debug!("finish graph loops: 2x {}", self.len());
586✔
154

155
        DebruijnGraph {
586✔
156
            base: self,
586✔
157
            left_order,
586✔
158
            right_order,
586✔
159
        }
586✔
160
    }
586✔
161
}
162

163
impl<K: Kmer, D> BaseGraph<K, D> {
164
    pub fn finish_serial(self) -> DebruijnGraph<K, D> {
111✔
165
        let indices: Vec<u32> = (0..self.len() as u32).collect();
111✔
166

167
        let left_order = {
111✔
168
            let mut kmers: Vec<K> = Vec::with_capacity(self.len());
111✔
169
            let mut sequences: Vec<String> = Vec::new();
111✔
170
            for idx in &indices {
724,972✔
171
                kmers.push(self.sequences.get(*idx as usize).first_kmer());
724,861✔
172
                sequences.push(self.sequences.get(*idx as usize).to_dna_string());
724,861✔
173
            }
724,861✔
174
            println!("left kmers: {:?}", kmers);
111✔
175
            println!("left seqs: {:?}", sequences);
111✔
176
            BoomHashMap::new(kmers, indices.clone())
111✔
177
        };
178

179
        let right_order = {
111✔
180
            let mut kmers: Vec<K> = Vec::with_capacity(self.len());
111✔
181
            let mut sequences: Vec<String> = Vec::new();
111✔
182
            for idx in &indices {
724,972✔
183
                kmers.push(self.sequences.get(*idx as usize).last_kmer());
724,861✔
184
                sequences.push(self.sequences.get(*idx as usize).to_dna_string());
724,861✔
185
            }
724,861✔
186
            println!("right kmers: {:?}", kmers);
111✔
187
            println!("right seqs: {:?}", sequences);
111✔
188
            BoomHashMap::new(kmers, indices)
111✔
189
        };
190

191
        DebruijnGraph {
111✔
192
            base: self,
111✔
193
            left_order,
111✔
194
            right_order,
111✔
195
        }
111✔
196
    }
111✔
197
}
198

199
/// A compressed DeBruijn graph carrying auxiliary data on each node of type `D`.
200
/// The struct carries sorted index arrays the allow the graph
201
/// to be walked efficiently.
202
#[derive(Serialize, Deserialize, Debug)]
203
pub struct DebruijnGraph<K: Hash, D> {
204
    pub base: BaseGraph<K, D>,
205
    left_order: BoomHashMap<K, u32>,
206
    right_order: BoomHashMap<K, u32>,
207
}
208

209
impl<K: Kmer, D: Debug> DebruijnGraph<K, D> {
210
    /// Total number of nodes in the DeBruijn graph
211
    pub fn len(&self) -> usize {
804,384✔
212
        self.base.len()
804,384✔
213
    }
804,384✔
214

215
    pub fn is_empty(&self) -> bool {
1✔
216
        self.base.is_empty()
1✔
217
    }
1✔
218

219
    /// Get a node given it's `node_id`
220
    pub fn get_node(&self, node_id: usize) -> Node<K, D> {
5,077,075✔
221
        Node {
5,077,075✔
222
            node_id,
5,077,075✔
223
            graph: self,
5,077,075✔
224
        }
5,077,075✔
225
    }
5,077,075✔
226

227
    /// Get a node given it's `node_id`
228
    pub fn get_node_kmer(&self, node_id: usize) -> NodeKmer<K, D> {
×
229
        let node = self.get_node(node_id);
×
230
        let node_seq = node.sequence();
×
231

232
        NodeKmer {
×
233
            node_id,
×
234
            node_seq_slice: node_seq,
×
235
            phantom_d: PhantomData,
×
236
            phantom_k: PhantomData,
×
237
        }
×
238
    }
×
239

240
    /// Return an iterator over all nodes in the graph
241
    pub fn iter_nodes(&self) -> NodeIter<K, D> {
129✔
242
        NodeIter {
129✔
243
            graph: self,
129✔
244
            node_id: 0,
129✔
245
        }
129✔
246
    }
129✔
247

248
    /// Find the edges leaving node `node_id` in direction `Dir`. Should generally be
249
    /// accessed via a Node wrapper object
250
    fn find_edges(&self, node_id: usize, dir: Dir) -> SmallVec4<(u8, usize, Dir, bool)> {
1,089,502✔
251
        let exts = self.base.exts[node_id];
1,089,502✔
252
        let sequence = self.base.sequences.get(node_id);
1,089,502✔
253
        let kmer: K = sequence.term_kmer(dir);
1,089,502✔
254
        let mut edges = SmallVec4::new();
1,089,502✔
255

256
        for i in 0..4 {
5,447,510✔
257
            if exts.has_ext(dir, i) {
4,358,008✔
258
                let link = self.find_link(kmer.extend(i, dir), dir).expect("missing link");
1,455,544✔
259
                edges.push((i, link.0, link.1, link.2));
1,455,544✔
260
            }
2,902,464✔
261
        }
262

263
        edges
1,089,502✔
264
    }
1,089,502✔
265

266
    /// Find the edges leaving node `node_id` in direction `Dir`. Should generally be
267
    /// accessed via a Node wrapper object
268
    /// 
269
    /// allows missing links
270
    fn _find_edges_sharded(&self, node_id: usize, dir: Dir) -> SmallVec4<(u8, usize, Dir, bool)> {
×
271
        let exts = self.base.exts[node_id];
×
272
        let sequence = self.base.sequences.get(node_id);
×
273
        let kmer: K = sequence.term_kmer(dir);
×
274
        let mut edges = SmallVec4::new();
×
275

276
        for i in 0..4 {
×
277
            if exts.has_ext(dir, i) {
×
278
                let link = self.find_link(kmer.extend(i, dir), dir); //.expect("missing link");
×
279
                if let Some(l) = link {
×
280
                    edges.push((i, l.0, l.1, l.2));
×
281
                }
×
282
                // Otherwise, this edge doesn't exist within this shard, so ignore it.
283
                // NOTE: this should be allowed in a 'complete' DBG
284
            }
×
285
        }
286

287
        edges
×
288
    }
×
289

290
    /// Seach for the kmer `kmer`, appearing at the given `side` of a node sequence.
291
    fn search_kmer(&self, kmer: K, side: Dir) -> Option<usize> {
5,332,151✔
292
        match side {
5,332,151✔
293
            Dir::Left => self.left_order.get(&kmer).map(|pos| *pos as usize),
2,659,029✔
294
            Dir::Right => self.right_order.get(&kmer).map(|pos| *pos as usize),
2,673,122✔
295
        }
296
    }
5,332,151✔
297

298
    /// Find a link in the graph, possibly handling a RC switch.
299
    pub fn find_link(&self, kmer: K, dir: Dir) -> Option<(usize, Dir, bool)> {
3,723,637✔
300
        // Only test self-consistent paths through
301
        // the edges
302
        // Avoids problems due to single kmer edges
303
        // (dir, next_side_incoming, rc)
304
        // (Dir::Left, Dir::Right, false) => true,
305
        // (Dir::Left, Dir::Left,  true) => true,
306
        // (Dir::Right, Dir::Left, false) => true,
307
        // (Dir::Right, Dir::Right, true) => true,
308

309
        let rc = kmer.rc();
3,723,637✔
310

311
        match dir {
3,723,637✔
312
            Dir::Left => {
313
                if let Some(idx) = self.search_kmer(kmer, Dir::Right) {
1,862,308✔
314
                    return Some((idx, Dir::Right, false));
1,064,608✔
315
                }
797,700✔
316

317
                if !self.base.stranded {
797,700✔
318
                    if let Some(idx) = self.search_kmer(rc, Dir::Left) {
797,700✔
319
                        return Some((idx, Dir::Left, true));
797,700✔
320
                    }
×
321
                }
×
322
            }
323

324
            Dir::Right => {
325
                if let Some(idx) = self.search_kmer(kmer, Dir::Left) {
1,861,329✔
326
                    return Some((idx, Dir::Left, false));
1,064,663✔
327
                }
796,666✔
328

329
                if !self.base.stranded {
796,666✔
330
                    if let Some(idx) = self.search_kmer(rc, Dir::Right) {
796,666✔
331
                        return Some((idx, Dir::Right, true));
796,666✔
332
                    }
×
333
                }
×
334
            }
335
        }
336

337
        None
×
338
    }
3,723,637✔
339

340
    /// Check whether the graph is fully compressed. Return `None` if it's compressed,
341
    /// otherwise return `Some(node1, node2)` representing a pair of node that could
342
    /// be collapsed. Probably only useful for testing.
343
    pub fn is_compressed<S: CompressionSpec<D>>(&self, spec: &S) -> Option<(usize, usize)> {
789✔
344
        for i in 0..self.len() {
118,095✔
345
            let n = self.get_node(i);
118,095✔
346

347
            for dir in &[Dir::Left, Dir::Right] {
354,285✔
348
                let dir_edges = n.edges(*dir);
236,190✔
349
                if dir_edges.len() == 1 {
236,190✔
350
                    let (_, next_id, return_dir, _) = dir_edges[0];
166,890✔
351
                    let next = self.get_node(next_id);
166,890✔
352

353
                    let ret_edges = next.edges(return_dir);
166,890✔
354
                    if ret_edges.len() == 1 {
166,890✔
355
                        // Test for us being a palindrome: this makes it OK
356
                        if n.len() == K::k() && n.sequence().first_kmer::<K>().is_palindrome() {
26✔
357
                            continue;
16✔
358
                        }
10✔
359

360
                        // Test for a neighbor being a palindrome: this makes it OK
361
                        if next.len() == K::k() && next.sequence().first_kmer::<K>().is_palindrome()
10✔
362
                        {
363
                            continue;
8✔
364
                        }
2✔
365

366
                        // Test for this edge representing a smooth circle (biting it's own tail):
367
                        if n.node_id == next_id {
2✔
368
                            continue;
2✔
369
                        }
×
370

371
                        if spec.join_test(n.data(), next.data()) {
×
372
                            // Found a unbranched edge that should have been eliminated
373
                            return Some((i, next_id));
×
374
                        }
×
375
                    }
166,864✔
376
                }
69,300✔
377
            }
378
        }
379

380
        None
789✔
381
    }
789✔
382

383
    /// Remove non-existent extensions that may be created due to filtered kmers
384
    /// 
385
    /// if `valid_nodes` if `None`, all nodes are valid
386
    pub fn fix_exts(&mut self, valid_nodes: Option<&BitSet>) {
247✔
387
        for i in 0..self.len() {
749,607✔
388
            let valid_exts = self.get_valid_exts(i, valid_nodes);
749,607✔
389
            self.base.exts[i] = valid_exts;
749,607✔
390
        }
749,607✔
391
    }
247✔
392

393
    pub fn get_valid_exts(&self, node_id: usize, valid_nodes: Option<&BitSet>) -> Exts {
749,607✔
394
        let mut new_exts = Exts::empty();
749,607✔
395
        let node = self.get_node(node_id);
749,607✔
396
        let exts = node.exts();
749,607✔
397
        let l_kmer: K = node.sequence().first_kmer();
749,607✔
398
        let r_kmer: K = node.sequence().last_kmer();
749,607✔
399

400
        let check_node = |id| match valid_nodes {
1,529,146✔
401
            Some(bs) => bs.contains(id),
1,477,928✔
402
            None => true,
51,218✔
403
        };
1,529,146✔
404

405
        for i in 0..4 {
3,748,035✔
406
            if exts.has_ext(Dir::Left, i) {
2,998,428✔
407
                match self.find_link(l_kmer.extend_left(i), Dir::Left) {
764,593✔
408
                    Some((target, _, _)) if check_node(target) => {
764,593✔
409
                        new_exts = new_exts.set(Dir::Left, i)
764,593✔
410
                    }
411
                    _ => (),
×
412
                }
413
            }
2,233,835✔
414

415
            if exts.has_ext(Dir::Right, i) {
2,998,428✔
416
                match self.find_link(r_kmer.extend_right(i), Dir::Right) {
764,553✔
417
                    Some((target, _, _)) if check_node(target) => {
764,553✔
418
                        new_exts = new_exts.set(Dir::Right, i)
764,553✔
419
                    }
420
                    _ => (),
×
421
                }
422
            }
2,233,875✔
423
        }
424

425
        new_exts
749,607✔
426
    }
749,607✔
427

428
    /// mutable reference to the auxiliary data of the node node_id
429
    pub fn mut_data(&mut self, node_id: usize) -> &mut D {
147✔
430
        &mut self.base.data[node_id]
147✔
431
    }
147✔
432

433
    /// Find the highest-scoring, unambiguous path in the graph. Each node get a score
434
    /// given by `score`. Any node where `solid_path(node) == True` are valid paths -
435
    /// paths will be terminated if there are multiple valid paths emanating from a node.
436
    pub fn max_path<F, F2>(&self, score: F, solid_path: F2) -> Vec<(usize, Dir)>
×
437
    where
×
438
        F: Fn(&D) -> f32,
×
439
        F2: Fn(&D) -> bool,
×
440
    {
441
        if self.is_empty() {
×
442
            return Vec::default();
×
443
        }
×
444

445
        let mut best_node = 0;
×
446
        let mut best_score = f32::MIN;
×
447
        for i in 0..self.len() {
×
448
            let node = self.get_node(i);
×
449
            let node_score = score(node.data());
×
450

451
            if node_score > best_score {
×
452
                best_node = i;
×
453
                best_score = node_score;
×
454
            }
×
455
        }
456

457
        let oscore = |state| match state {
×
458
            None => 0.0,
×
459
            Some((id, _)) => score(self.get_node(id).data()),
×
460
        };
×
461

462
        let osolid_path = |state| match state {
×
463
            None => false,
×
464
            Some((id, _)) => solid_path(self.get_node(id).data()),
×
465
        };
×
466

467
        // Now expand in each direction, greedily taking the best path. Stop if we hit a node we've
468
        // already put into the path
469
        let mut used_nodes = HashSet::new();
×
470
        let mut path = VecDeque::new();
×
471

472
        // Start w/ initial state
473
        used_nodes.insert(best_node);
×
474
        path.push_front((best_node, Dir::Left));
×
475

476
        for init in [(best_node, Dir::Left, false), (best_node, Dir::Right, true)].iter() {
×
477
            let &(start_node, dir, do_flip) = init;
×
478
            let mut current = (start_node, dir);
×
479

480
            loop {
481
                let mut next = None;
×
482
                let (cur_id, incoming_dir) = current;
×
483
                let node = self.get_node(cur_id);
×
484
                let edges = node.edges(incoming_dir.flip());
×
485

486
                let mut solid_paths = 0;
×
487
                for (_, id, dir, _) in edges {
×
488
                    let cand = Some((id, dir));
×
489
                    if osolid_path(cand) {
×
490
                        solid_paths += 1;
×
491
                    }
×
492

493
                    if oscore(cand) > oscore(next) {
×
494
                        next = cand;
×
495
                    }
×
496
                }
497

498
                if solid_paths > 1 {
×
499
                    break;
×
500
                }
×
501

502
                match next {
×
503
                    Some((next_id, next_incoming)) if !used_nodes.contains(&next_id) => {
×
504
                        if do_flip {
×
505
                            path.push_front((next_id, next_incoming.flip()));
×
506
                        } else {
×
507
                            path.push_back((next_id, next_incoming));
×
508
                        }
×
509

510
                        used_nodes.insert(next_id);
×
511
                        current = (next_id, next_incoming);
×
512
                    }
513
                    _ => break,
×
514
                }
515
            }
516
        }
517

518
        Vec::from_iter(path)
×
519
    }
×
520

521

522
    /// Find the highest-scoring, unambiguous path in the graph. Each node get a score
523
    /// given by `score`. Any node where `solid_path(node) == True` are valid paths -
524
    /// paths will be terminated if there are multiple valid paths emanating from a node.
525
    /// Returns vec with path for each component
526
    pub fn max_path_comp<F, F2>(&self, score: F, solid_path: F2) -> Vec<VecDeque<(usize, Dir)>>
1✔
527
    where
1✔
528
        F: Fn(&D) -> f32,
1✔
529
        F2: Fn(&D) -> bool,
1✔
530
    {
531
        if self.is_empty() {
1✔
532
            let vec: Vec<VecDeque<(usize, Dir)>> = Vec::new();
×
533
            return vec;
×
534
        }
1✔
535

536
        let components = self.iter_components();
1✔
537
        let mut paths: Vec<VecDeque<(usize, Dir)>> = Vec::new();
1✔
538

539
        for component in components {
3✔
540

541
            let current_comp = &component;
2✔
542
            
543

544
            let mut best_node = current_comp[0];
2✔
545
            let mut best_score = f32::MIN;
2✔
546
            for c in current_comp.iter() {
139✔
547
                let node = self.get_node(*c);
139✔
548
                let node_score = score(node.data());
139✔
549

550
                if node_score > best_score {
139✔
551
                    best_node = *c;
2✔
552
                    best_score = node_score;
2✔
553
                }
137✔
554
            }
555

556
            let oscore = |state| match state {
170✔
557
                None => 0.0,
84✔
558
                Some((id, _)) => score(self.get_node(id).data()),
86✔
559
            };
170✔
560

561
            let osolid_path = |state| match state {
85✔
562
                None => false,
×
563
                Some((id, _)) => solid_path(self.get_node(id).data()),
85✔
564
            };
85✔
565

566
            // Now expand in each direction, greedily taking the best path. Stop if we hit a node we've
567
            // already put into the path
568
            let mut used_nodes = HashSet::new();
2✔
569
            let mut path = VecDeque::new();
2✔
570

571
            // Start w/ initial state
572
            used_nodes.insert(best_node);
2✔
573
            path.push_front((best_node, Dir::Left));
2✔
574

575
            for init in [(best_node, Dir::Left, false), (best_node, Dir::Right, true)].iter() {
4✔
576
                let &(start_node, dir, do_flip) = init;
4✔
577
                let mut current = (start_node, dir);
4✔
578

579
                loop {
580
                    let mut next = None;
87✔
581
                    let (cur_id, incoming_dir) = current;
87✔
582
                    let node = self.get_node(cur_id);
87✔
583
                    let edges = node.edges(incoming_dir.flip());
87✔
584

585
                    let mut solid_paths = 0;
87✔
586
                    for (_, id, dir, _) in edges {
172✔
587
                        let cand = Some((id, dir));
85✔
588
                        if osolid_path(cand) {
85✔
589
                            solid_paths += 1;
85✔
590
                        }
85✔
591

592
                        if oscore(cand) > oscore(next) {
85✔
593
                            next = cand;
84✔
594
                        }
84✔
595
                    }
596

597
                    if solid_paths > 1 {
87✔
598
                        break;
1✔
599
                    }
86✔
600

601
                    match next {
83✔
602
                        Some((next_id, next_incoming)) if !used_nodes.contains(&next_id) => {
83✔
603
                            if do_flip {
83✔
604
                                path.push_front((next_id, next_incoming.flip()));
59✔
605
                            } else {
59✔
606
                                path.push_back((next_id, next_incoming));
24✔
607
                            }
24✔
608

609
                            used_nodes.insert(next_id);
83✔
610
                            current = (next_id, next_incoming);
83✔
611
                        }
612
                        _ => break,
3✔
613
                    }
614
                }
615
            }
616
            
617
            paths.push(path);
2✔
618
            //paths.push(Vec::from_iter(path));
619
        }
620

621
        paths
1✔
622
    
623
    }
1✔
624

625
    pub fn iter_max_path_comp<F, F2>(&self, score: F, solid_path: F2) -> PathCompIter<K, D, F, F2> 
3✔
626
    where 
3✔
627
    F: Fn(&D) -> f32,
3✔
628
    F2: Fn(&D) -> bool
3✔
629
    {
630
        let component_iterator = self.iter_components();
3✔
631
        PathCompIter { graph: self, component_iterator, graph_pos: 0, score, solid_path }
3✔
632
    }
3✔
633

634
    /// write the paths from `iter_max_path_comp` to a fasta file
635
    pub fn path_to_fasta<F, F2>(&self, f: &mut dyn std::io::Write, path_iter: PathCompIter<K, D, F, F2>, return_lens: bool) -> (Vec<usize>, Vec<usize>)
1✔
636
    where 
1✔
637
    F: Fn(&D) -> f32,
1✔
638
    F2: Fn(&D) -> bool
1✔
639
    {
640
        // width of fasta lines
641
        let columns = 80;
1✔
642

643
        // sizes of components and of paths
644
        let mut comp_sizes = Vec::new();
1✔
645
        let mut path_lens = Vec::new();
1✔
646

647
        for (seq_counter, (component, path)) in path_iter.enumerate() {
2✔
648
            // get dna sequence from path
649
            let seq = self.sequence_of_path(path.iter());
2✔
650

651
            //write header with length & start node
652
            writeln!(f, ">path{} len={} start_node={}", seq_counter, seq.len(), path[0].0).unwrap();
2✔
653

654
            // calculate how sequence has to be split up
655
            let slices = (seq.len() / columns) + 1;
2✔
656
            let mut ranges = Vec::with_capacity(slices);
2✔
657

658
            let mut start = 0;
2✔
659
            while start < seq.len() {
6✔
660
                ranges.push(start..start + columns);
4✔
661
                start += columns;
4✔
662
            }
4✔
663

664
            let last_start = ranges.pop().expect("no kmers in parallel ranges").start;
2✔
665
            ranges.push(last_start..seq.len());
2✔
666

667
            // split up sequence and write to file accordingly
668
            for range in ranges {
6✔
669
                writeln!(f, "{:?}", seq.slice(range.start, range.end)).unwrap();
4✔
670
            }
4✔
671

672
            if return_lens {
2✔
673
                comp_sizes.push(component.len());
×
674
                path_lens.push(path.len());
×
675
            }
2✔
676
        }    
677

678
        (comp_sizes, path_lens)
1✔
679
        
680
    }
1✔
681

682

683
    /// Get the sequence of a path through the graph. The path is given as a sequence of node_id integers
684
    pub fn sequence_of_path<'a, I: 'a + Iterator<Item = &'a (usize, Dir)>>(
17,994✔
685
        &self,
17,994✔
686
        path: I,
17,994✔
687
    ) -> DnaString {
17,994✔
688
        let mut seq = DnaString::new();
17,994✔
689

690
        for (idx, &(node_id, dir)) in path.enumerate() {
731,825✔
691
            let start = if idx == 0 { 0 } else { K::k() - 1 };
731,825✔
692

693
            let node_seq = match dir {
731,825✔
694
                Dir::Left => self.get_node(node_id).sequence(),
378,244✔
695
                Dir::Right => self.get_node(node_id).sequence().rc(),
353,581✔
696
            };
697

698
            for p in start..node_seq.len() {
1,338,961✔
699
                seq.push(node_seq.get(p))
1,338,961✔
700
            }
701
        }
702

703
        seq
17,994✔
704
    }
17,994✔
705

706
    /// map sequences from a fasta file to a completely uncompressed (!!!) and stranded (!!!!!) debruijn graph
707
    pub fn map_transcripts<P>(&self, path: P, translator: &mut Translator) -> Result<Vec<Box<[ID]>>, String> 
1✔
708
    where 
1✔
709
        P: AsRef<Path>
1✔
710
    {
711
        if !self.base.stranded { return Err("graph has to be stranded".to_string()) };
1✔
712

713
        let reader = fasta::Reader::new(BufReader::new(File::open(path).unwrap()));
1✔
714
        let mut node_transcript_ids = vec![Vec::new(); self.len()];
1✔
715

716
        let mut backup_id_tr = BiHashMap::new();
1✔
717

718
        let id_tr = if let Some(id_tr) = translator.mut_id_translator() {
1✔
719
            id_tr
1✔
720
        } else {
NEW
721
            &mut backup_id_tr
×
722
        };
723

724
        // go through each transcript and map to graph
725
        for result in reader.records() {
15✔
726
            let record = result.expect("error parsing transcripts fasta");
15✔
727

728
            // get gene id or make new id
729
            let gene_id = match id_tr.get_by_left(&record.id().to_string()) {
15✔
NEW
730
                Some(id) => *id,
×
731
                None => {
732
                    let new_id = id_tr.len() as ID;
15✔
733
                    id_tr.insert(record.id().to_string(), new_id);
15✔
734
                    new_id
15✔
735
                }
736
            };
737

738
            // iterate over k-mers in transcript and find each one in the graph
739
            let sequence = DnaString::from_acgt_bytes(record.seq());
15✔
740
            for kmer in sequence.iter_kmers::<K>() {
14,148✔
741
                if let Some(node) = self.search_kmer(kmer, Dir::Right) {
14,148✔
742
                    node_transcript_ids[node].push(gene_id);
13,284✔
743
                    //println!("node: {node}, seq: {}, kmer: {:?}, ids: {:?}, tr id: {}", self.get_node(node).sequence(), kmer, self.get_node(node).data(), gene_id);                
13,284✔
744
                }
13,284✔
745
            }
746
        }
747

748
        // change to boxed slices to reduce memory
749
        let boxed_transcripts = node_transcript_ids.into_iter().map(|vec| vec.into()).collect();
21,993✔
750

751
        Ok(boxed_transcripts)
1✔
752
    }
1✔
753

754
    /// write a node to a dot file
755
    /// 
756
    /// ### Arguments: 
757
    /// 
758
    /// * `node`: [`Node<K, D>`] which will be written to a dot file
759
    /// * `node_label`: closure taking [`Node<K, D>`] and returning a string containing commands for dot nodes 
760
    /// * `edge_label`: closure taking [`Node<K, D>`], the base as a [`u8`], the incoming [`Dir`] of the edge 
761
    ///    and if the neighbor is flipped - returns a string containing commands for dot edges, 
762
    /// * `f`: writer
763
    fn node_to_dot<FN: Fn(&Node<K, D>) -> String, FE: Fn(&Node<K, D>, u8, Dir, bool) -> String>(
97,978✔
764
        &self,
97,978✔
765
        node: &Node<'_, K, D>,
97,978✔
766
        node_label: &FN,
97,978✔
767
        edge_label: &FE,
97,978✔
768
        f: &mut dyn Write,
97,978✔
769
    ) {
97,978✔
770
        writeln!(f, "n{} {}", node.node_id, node_label(node)).unwrap();
97,978✔
771
        assert_eq!(node.exts().val.count_ones() as usize, node.l_edges().len() + node.r_edges().len());
97,978✔
772

773
        for (base, id, incoming_dir, flipped) in node.l_edges() {
97,978✔
774
            writeln!(f, "n{} -> n{} {}", id, node.node_id, edge_label(node, base, incoming_dir, flipped)).unwrap();
97,591✔
775
        }
97,591✔
776

777
        for (base, id, incoming_dir, flipped) in node.r_edges() {
97,978✔
778
            writeln!(f, "n{} -> n{} {}", node.node_id, id, edge_label(node, base, incoming_dir, flipped)).unwrap();
97,165✔
779
        }
97,165✔
780
    }
97,978✔
781

782
    /// Write the graph to a dot file
783
    /// 
784
    /// ### Arguments: 
785
    /// 
786
    /// * `path`: path to the output file
787
    /// * `node_label`: closure taking [`Node<K, D>`] and returning a string containing commands for dot nodes, e.g. [`Node::node_dot_default`]
788
    /// * `edge_label`: closure taking [`Node<K, D>`], the base as a [`u8`], the incoming [`Dir`] of the edge, e.g. [`Node::edge_dot_default`]
789
    ///    and if the neighbor is flipped - returns a string containing commands for dot edges, 
790
    pub fn to_dot<P, FN, FE>(&self, path: P, node_label: &FN, edge_label: &FE) 
1✔
791
    where 
1✔
792
    P: AsRef<Path>,
1✔
793
    FN: Fn(&Node<K, D>) -> String,
1✔
794
    FE: Fn(&Node<K, D>, u8, Dir, bool) -> String,
1✔
795
    {
796
        let mut f = BufWriter::with_capacity(BUF, File::create(path).expect("error creating dot file"));
1✔
797

798
        let pb = ProgressBar::new(self.len() as u64);
1✔
799
        pb.set_style(ProgressStyle::with_template(PROGRESS_STYLE).unwrap().progress_chars("#/-"));
1✔
800
        pb.set_message(format!("{:<32}", "writing graph to DOT file"));
1✔
801

802
        writeln!(&mut f, "digraph {{\nrankdir=\"LR\"\nmodel=subset\noverlap=scalexy").unwrap();
1✔
803
        for i in (0..self.len()).progress_with(pb) {
32,658✔
804
            self.node_to_dot(&self.get_node(i), node_label, edge_label, &mut f);
32,658✔
805
        }
32,658✔
806
        writeln!(&mut f, "}}").unwrap();
1✔
807
        
808
        f.flush().unwrap();
1✔
809
        debug!("large to dot loop: {}", self.len());
1✔
810
    }
1✔
811

812
    /// Write the graph to a dot file, highlight the nodes which form the 
813
    /// "best" path, according to [`PathCompIter`], with the number of occurences 
814
    /// as the score and `solid_path` always `true`.
815
    /// The nodes are formatted according to [`Node::node_dot_default`].
816
    /// 
817
    /// ### Arguments: 
818
    /// 
819
    /// * `path`: path to the output file
820
    /// * `edge_label`: closure taking [`Node<K, D>`], the base as a [`u8`], the incoming [`Dir`] of the edge, e.g. [`Node::edge_dot_default`]
821
    ///    and if the neighbor is flipped - returns a string containing commands for dot edges, 
822
    /// * `colors`: a [`Colors`] with the color settings for the graph
823
    /// * `translator`: a [`Translator`] which translates tags or IDs to strings
824
    /// * `config`: a [`SummaryConfig`] which contains settings for the graph
825
    pub fn to_dot_with_path<P, FE, DI>(&self, path: P, edge_label: &FE, colors: &Colors<'_, D, DI>, translator: &Translator, config: &SummaryConfig, translate_id_groups: bool)
1✔
826
    where 
1✔
827
    P: AsRef<Path>,
1✔
828
    D: SummaryData<DI>,
1✔
829
    FE: Fn(&Node<K, D>, u8, Dir, bool) -> String,
1✔
830
    {
831
        let mut f = BufWriter::with_capacity(BUF, File::create(path).expect("error creating dot file"));
1✔
832

833
        writeln!(&mut f, "digraph {{\nrankdir=\"LR\"\nmodel=subset\noverlap=scalexy").unwrap();
1✔
834

835
        // iterate over components
836
        for (component, path) in self.iter_max_path_comp(|d| d.sum().unwrap_or(1) as f32, |_| true) {
62,565✔
837
            let hashed_path = path.into_iter().map(|(id, _)| id).collect::<HashSet<usize>>();
233✔
838
            for node_id in component {
32,891✔
839
                self.node_to_dot(
32,658✔
840
                    &self.get_node(node_id),
32,658✔
841
                    &|node| node.node_dot_default(colors, config, translator, hashed_path.contains(&node_id), translate_id_groups), 
32,658✔
842
                    edge_label, 
32,658✔
843
                    &mut f
32,658✔
844
                );
845
            }
846
        }
847

848
        writeln!(&mut f, "}}").unwrap();
1✔
849
        
850
        f.flush().unwrap();
1✔
851
        debug!("large to dot loop: {}", self.len());
1✔
852
    }
1✔
853

854
    /// Write the graph to a dot file in parallel
855
    /// Will write in to n_threads files simultaniously,
856
    /// then go though the files and add the contents to a larger file, 
857
    /// and delete the small files.
858
    /// 
859
    /// ### Arguments: 
860
    /// 
861
    /// * `path`: path to the output file
862
    /// * `node_label`: closure taking [`Node<K, D>`] and returning a string containing commands for dot nodes 
863
    /// * `edge_label`: closure taking [`Node<K, D>`], the base as a [`u8`], the incoming [`Dir`] of the edge 
864
    ///    and if the neighbor is flipped - returns a string containing commands for dot edges, 
865
    pub fn to_dot_parallel<P, FN, FE>(&self, path: P, node_label: &FN, edge_label: &FE) 
1✔
866
    where 
1✔
867
        D: Sync,
1✔
868
        K: Sync,
1✔
869
        P: AsRef<Path> + Display + Sync,
1✔
870
        FN: Fn(&Node<K, D>) -> String + Sync,
1✔
871
        FE: Fn(&Node<K, D>, u8, Dir, bool) -> String + Sync,
1✔
872
    {        
873
        let slices = current_num_threads();
1✔
874
        let n_nodes = self.len();
1✔
875
        let sz = n_nodes / slices + 1;
1✔
876

877
        debug!("n_nodes: {}", n_nodes);
1✔
878
        debug!("sz: {}", sz);
1✔
879

880
        let mut parallel_ranges = Vec::with_capacity(slices);
1✔
881
        let mut start = 0;
1✔
882
        while start < n_nodes {
5✔
883
            parallel_ranges.push(start..start + sz);
4✔
884
            start += sz;
4✔
885
        }
4✔
886

887
        let last_start = parallel_ranges.pop().expect("no kmers in parallel ranges").start;
1✔
888
        parallel_ranges.push(last_start..n_nodes);
1✔
889
        debug!("parallel ranges: {:?}", parallel_ranges);
1✔
890

891
        let mut files = Vec::with_capacity(current_num_threads());
1✔
892

893
        for i in 0..parallel_ranges.len() {
4✔
894
            files.push(format!("{}-{}.dot", path, i));
4✔
895
        } 
4✔
896

897
        let pb = ProgressBar::new(self.len() as u64);
1✔
898
        pb.set_style(ProgressStyle::with_template(PROGRESS_STYLE).unwrap().progress_chars("#/-"));
1✔
899
        pb.set_message(format!("{:<32}", "writing graph to DOT files"));
1✔
900
    
901
        parallel_ranges.into_par_iter().enumerate().for_each(|(i, range)| {
4✔
902
            let mut f = BufWriter::with_capacity(BUF, File::create(&files[i]).expect("error creating parallel dot file"));
4✔
903

904
            for i in range {
32,662✔
905
                self.node_to_dot(&self.get_node(i), node_label, edge_label, &mut f);
32,658✔
906
                pb.inc(1);
32,658✔
907
            }
32,658✔
908

909
            f.flush().unwrap();
4✔
910
        });
4✔
911
        pb.finish_and_clear();
1✔
912

913
        let mut out_file = BufWriter::with_capacity(BUF, File::create(path).expect("error creating combined dot file"));
1✔
914

915
        writeln!(&mut out_file, "digraph {{\nrankdir=\"LR\"\nmodel=subset\noverlap=scalexy").unwrap();
1✔
916

917
        let pb = ProgressBar::new(files.len() as u64);
1✔
918
        pb.set_style(ProgressStyle::with_template(PROGRESS_STYLE).unwrap().progress_chars("#/-"));
1✔
919
        pb.set_message(format!("{:<32}", "combining files"));
1✔
920

921
        for file in files.iter().progress_with(pb) {
4✔
922
            let open_file = File::open(file).expect("error opening parallel dot file");
4✔
923
            let mut reader = BufReader::new(open_file);
4✔
924
            let mut buffer = [0; BUF];
4✔
925

926
            loop {
927
                let linecount = reader.read(&mut buffer).unwrap();
219✔
928
                if linecount == 0 { break }
219✔
929
                out_file.write_all(&buffer[..linecount]).unwrap();
215✔
930
            }
931

932
            remove_file(file).unwrap();
4✔
933
        }
934

935
        writeln!(&mut out_file, "}}").unwrap();
1✔
936

937
        out_file.flush().unwrap();
1✔
938

939

940
    }
1✔
941

942

943
    /// Write part of the graph to a dot file
944
    /// 
945
    /// ### Arguments: 
946
    /// 
947
    /// * `path`: path to the output file
948
    /// * `node_label`: closure taking [`Node<K, D>`] and returning a string containing commands for dot nodes 
949
    /// * `edge_label`: closure taking [`Node<K, D>`], the base as a [`u8`], the incoming [`Dir`] of the edge 
950
    ///    and if the neighbor is flipped - returns a string containing commands for dot edges, 
951
    /// * `nodes`: [`Vec<usize>`] listing all IDs of nodes which should be included
952
    pub fn to_dot_partial<P, FN, FE>(&self, path: P, node_label: &FN, edge_label: &FE, nodes: Vec<usize>) 
1✔
953
    where 
1✔
954
        P: AsRef<Path>,
1✔
955
        FN: Fn(&Node<K, D>) -> String,
1✔
956
        FE: Fn(&Node<K, D>, u8, Dir, bool) -> String,
1✔
957
    {
958
        let mut f = BufWriter::with_capacity(BUF, File::create(path).expect("error creating dot file"));
1✔
959

960
        let pb = ProgressBar::new(nodes.len() as u64);
1✔
961
        pb.set_style(ProgressStyle::with_template(PROGRESS_STYLE).unwrap().progress_chars("#/-"));
1✔
962
        pb.set_message(format!("{:<32}", "writing graph to DOT file"));
1✔
963

964
        writeln!(&mut f, "digraph {{\nrankdir=\"LR\"\nmodel=subset\noverlap=scalexy").unwrap();
1✔
965
        for i in nodes.into_iter().progress_with(pb) {
4✔
966
            self.node_to_dot(&self.get_node(i), node_label, edge_label, &mut f);
4✔
967
        }
4✔
968
        writeln!(&mut f, "}}").unwrap();
1✔
969

970
        f.flush().unwrap();
1✔
971

972
        debug!("large to dot loop: {}", self.len());
1✔
973
    }
1✔
974

975
    fn node_to_gfa<F: Fn(&Node<'_, K, D>) -> String>(
97,978✔
976
        &self,
97,978✔
977
        node: &Node<'_, K, D>,
97,978✔
978
        w: &mut dyn Write,
97,978✔
979
        tag_func: Option<&F>,
97,978✔
980
    ) -> Result<(), Error> {
97,978✔
981
        match tag_func {
97,978✔
982
            Some(f) => {
65,320✔
983
                let tags = (f)(node);
65,320✔
984
                writeln!(
65,320✔
985
                    w,
65,320✔
986
                    "S\t{}\t{}\t{}",
65,320✔
987
                    node.node_id,
988
                    node.sequence().to_dna_string(),
65,320✔
989
                    tags
UNCOV
990
                )?;
×
991
            }
992
            _ => writeln!(
32,658✔
993
                w,
32,658✔
994
                "S\t{}\t{}",
32,658✔
995
                node.node_id,
996
                node.sequence().to_dna_string()
32,658✔
UNCOV
997
            )?,
×
998
        }
999

1000
        for (_, target, dir, _) in node.l_edges() {
97,978✔
1001
            if target >= node.node_id {
97,591✔
1002
                let to_dir = match dir {
48,748✔
1003
                    Dir::Left => "+",
19,433✔
1004
                    Dir::Right => "-",
29,315✔
1005
                };
1006
                writeln!(
48,748✔
1007
                    w,
48,748✔
1008
                    "L\t{}\t-\t{}\t{}\t{}M",
48,748✔
1009
                    node.node_id,
1010
                    target,
1011
                    to_dir,
1012
                    K::k() - 1
48,748✔
UNCOV
1013
                )?;
×
1014
            }
48,843✔
1015
        }
1016

1017
        for (_, target, dir, _) in node.r_edges() {
97,978✔
1018
            if target > node.node_id {
97,165✔
1019
                let to_dir = match dir {
48,634✔
1020
                    Dir::Left => "+",
29,415✔
1021
                    Dir::Right => "-",
19,219✔
1022
                };
1023
                writeln!(
48,634✔
1024
                    w,
48,634✔
1025
                    "L\t{}\t+\t{}\t{}\t{}M",
48,634✔
1026
                    node.node_id,
1027
                    target,
1028
                    to_dir,
1029
                    K::k() - 1
48,634✔
UNCOV
1030
                )?;
×
1031
            }
48,531✔
1032
        }
1033

1034
        Ok(())
97,978✔
1035
    }
97,978✔
1036

1037
    /// Write the graph to GFA format
1038
    pub fn to_gfa<P: AsRef<Path>>(&self, gfa_out: P) -> Result<(), Error> {
1✔
1039
        let wtr = BufWriter::with_capacity(BUF, File::create(gfa_out).expect("error creating gfa file"));
1✔
1040
        self.write_gfa(&mut std::io::BufWriter::new(wtr))
1✔
1041
    }
1✔
1042

1043
    pub fn write_gfa(&self, wtr: &mut impl Write) -> Result<(), Error> {
1✔
1044
        writeln!(wtr, "H\tVN:Z:debruijn-rs")?;
1✔
1045

1046
        type DummyFn<K, D> = fn(&Node<'_, K, D>) -> String;
1047
        let dummy_opt: Option<&DummyFn<K, D>> = None;
1✔
1048

1049
        let pb = ProgressBar::new(self.len() as u64);
1✔
1050
        pb.set_style(ProgressStyle::with_template(PROGRESS_STYLE).unwrap().progress_chars("#/-"));
1✔
1051
        pb.set_message(format!("{:<32}", "writing graph to GFA file"));
1✔
1052

1053
        for i in (0..self.len()).progress_with(pb) {
32,658✔
1054
            let n = self.get_node(i);
32,658✔
1055
            self.node_to_gfa(&n, wtr, dummy_opt)?;
32,658✔
1056
        }
1057

1058
        wtr.flush().unwrap();
1✔
1059

1060
        Ok(())
1✔
1061
    }
1✔
1062

1063
    /// Write the graph to GFA format
1064
    pub fn to_gfa_with_tags<P: AsRef<Path>, F: Fn(&Node<'_, K, D>) -> String>(
1✔
1065
        &self,
1✔
1066
        gfa_out: P,
1✔
1067
        tag_func: F,
1✔
1068
    ) -> Result<(), Error> {
1✔
1069
        let mut wtr = BufWriter::with_capacity(BUF, File::create(gfa_out).expect("error creatinf gfa file"));
1✔
1070
        writeln!(wtr, "H\tVN:Z:debruijn-rs")?;
1✔
1071

1072
        let pb = ProgressBar::new(self.len() as u64);
1✔
1073
        pb.set_style(ProgressStyle::with_template(PROGRESS_STYLE).unwrap().progress_chars("#/-"));
1✔
1074
        pb.set_message(format!("{:<32}", "writing graph to GFA file"));
1✔
1075

1076
        for i in (0..self.len()).progress_with(pb) {
32,658✔
1077
            let n = self.get_node(i);
32,658✔
1078
            self.node_to_gfa(&n, &mut wtr, Some(&tag_func))?;
32,658✔
1079
        }
1080

1081
        wtr.flush().unwrap();
1✔
1082

1083
        Ok(())
1✔
1084
    }
1✔
1085

1086
    /// Write the graph to GFA format, with multithreading, 
1087
    /// pass `tag_func=None` to write without tags
1088
    pub fn to_gfa_otags_parallel<P: AsRef<Path> + Display + Sync, F: Fn(&Node<'_, K, D>) -> String + Sync>(
1✔
1089
        &self,
1✔
1090
        gfa_out: P,
1✔
1091
        tag_func: Option<&F>,
1✔
1092
    ) -> Result<(), Error> 
1✔
1093
    where 
1✔
1094
    K: Sync,
1✔
1095
    D: Sync,
1✔
1096
    {
1097
        // split into ranges according to thread count
1098
        let slices = current_num_threads();
1✔
1099
        let n_nodes = self.len();
1✔
1100
        let sz = n_nodes / slices + 1;
1✔
1101

1102
        debug!("n_nodes: {}", n_nodes);
1✔
1103
        debug!("sz: {}", sz);
1✔
1104

1105
        let mut parallel_ranges = Vec::with_capacity(slices);
1✔
1106
        let mut start = 0;
1✔
1107
        while start < n_nodes {
5✔
1108
            parallel_ranges.push(start..start + sz);
4✔
1109
            start += sz;
4✔
1110
        }
4✔
1111

1112
        let last_start = parallel_ranges.pop().expect("no kmers in parallel ranges").start;
1✔
1113
        parallel_ranges.push(last_start..n_nodes);
1✔
1114
        debug!("parallel ranges: {:?}", parallel_ranges);
1✔
1115

1116
        let mut files = Vec::with_capacity(current_num_threads());
1✔
1117

1118
        for i in 0..parallel_ranges.len() {
4✔
1119
            files.push(format!("{}-{}.gfa", gfa_out, i));
4✔
1120
        } 
4✔
1121

1122
        let pb = ProgressBar::new(self.len() as u64);
1✔
1123
        pb.set_style(ProgressStyle::with_template(PROGRESS_STYLE).unwrap().progress_chars("#/-"));
1✔
1124
        pb.set_message(format!("{:<32}", "writing graph to GFA file"));
1✔
1125
        
1126
        
1127
        parallel_ranges.into_par_iter().enumerate().for_each(|(i, range)| {
4✔
1128
            let mut wtr = BufWriter::with_capacity(BUF, File::create(&files[i]).expect("error creating parallel gfa file"));
4✔
1129

1130
            for i in range {
32,662✔
1131
                let n = self.get_node(i);
32,658✔
1132
                self.node_to_gfa(&n, &mut wtr, tag_func).unwrap();
32,658✔
1133
                pb.inc(1);
32,658✔
1134
            }
32,658✔
1135

1136
            wtr.flush().unwrap();
4✔
1137
        });
4✔
1138

1139
        pb.finish_and_clear();
1✔
1140

1141
        // combine files
1142
        let mut out_file = BufWriter::with_capacity(BUF, File::create(format!("{}.gfa", gfa_out)).expect("error creating combined gfa file"));
1✔
1143
        writeln!(out_file, "H\tVN:Z:debruijn-rs")?;
1✔
1144

1145
        let pb = ProgressBar::new(files.len() as u64);
1✔
1146
        pb.set_style(ProgressStyle::with_template(PROGRESS_STYLE).unwrap().progress_chars("#/-"));
1✔
1147
        pb.set_message(format!("{:<32}", "combining files"));
1✔
1148

1149
        for file in files.iter() {
4✔
1150
            let open_file = File::open(file).expect("error opening parallel gfa file");
4✔
1151
            let mut reader = BufReader::new(open_file);
4✔
1152
            let mut buffer = [0; BUF];
4✔
1153

1154
            loop {
1155
                let linecount = reader.read(&mut buffer).unwrap();
113✔
1156
                if linecount == 0 { break }
113✔
1157
                out_file.write_all(&buffer[..linecount]).unwrap();
109✔
1158
            }
1159

1160
            remove_file(file).unwrap();
4✔
1161
        }
1162

1163
        out_file.flush().unwrap();
1✔
1164

1165
        Ok(())
1✔
1166
    }
1✔
1167

1168
    /// Write the graph to GFA format
1169
    pub fn to_gfa_partial<P: AsRef<Path>, F: Fn(&Node<'_, K, D>) -> String>(&self, gfa_out: P, tag_func: Option<&F>, nodes: Vec<usize>) -> Result<(), Error> {
1✔
1170
        let mut wtr = BufWriter::with_capacity(BUF, File::create(gfa_out).expect("error creating gfa file"));
1✔
1171
        writeln!(wtr, "H\tVN:Z:debruijn-rs")?;
1✔
1172

1173
        let pb = ProgressBar::new(self.len() as u64);
1✔
1174
        pb.set_style(ProgressStyle::with_template(PROGRESS_STYLE).unwrap().progress_chars("#/-"));
1✔
1175
        pb.set_message(format!("{:<32}", "writing graph to GFA file"));
1✔
1176

1177
        for i in nodes.into_iter().progress_with(pb) {
4✔
1178
            let n = self.get_node(i);
4✔
1179
            self.node_to_gfa(&n, &mut wtr, tag_func)?;
4✔
1180
        }
1181

1182
        wtr.flush().unwrap();
1✔
1183

1184
        Ok(())    
1✔
1185
    }
1✔
1186

1187
    pub fn to_json_rest<W: Write, F: Fn(&D) -> Value>(
×
1188
        &self,
×
1189
        fmt_func: F,
×
1190
        mut writer: &mut W,
×
1191
        rest: Option<Value>,
×
1192
    ) {
×
1193
        writeln!(writer, "{{\n\"nodes\": [").unwrap();
×
1194
        for i in 0..self.len() {
×
1195
            let node = self.get_node(i);
×
1196
            node.to_json(&fmt_func, writer);
×
1197
            if i == self.len() - 1 {
×
1198
                writeln!(writer).unwrap();
×
1199
            } else {
×
1200
                writeln!(writer, ",").unwrap();
×
1201
            }
×
1202
        }
1203
        writeln!(writer, "],").unwrap();
×
1204

1205
        writeln!(writer, "\"links\": [").unwrap();
×
1206
        for i in 0..self.len() {
×
1207
            let node = self.get_node(i);
×
1208
            match node.edges_to_json(writer) {
×
1209
                true => {
1210
                    if i == self.len() - 1 {
×
1211
                        writeln!(writer).unwrap();
×
1212
                    } else {
×
1213
                        writeln!(writer, ",").unwrap();
×
1214
                    }
×
1215
                }
1216
                _ => continue,
×
1217
            }
1218
        }
1219
        writeln!(writer, "]").unwrap();
×
1220

1221
        match rest {
×
1222
            Some(Value::Object(v)) => {
×
1223
                for (k, v) in v.iter() {
×
1224
                    writeln!(writer, ",").expect("io error");
×
1225
                    write!(writer, "\"{}\": ", k).expect("io error");
×
1226
                    serde_json::to_writer(&mut writer, v).expect("io error");
×
1227
                    writeln!(writer).expect("io error");
×
1228
                }
×
1229
            }
1230
            _ => {
×
1231
                writeln!(writer).expect("io error");
×
1232
            }
×
1233
        }
1234

1235
        writeln!(writer, "}}").expect("io error");
×
1236
    }
×
1237

1238
    /// Write the graph to JSON
1239
    pub fn to_json<W: Write, F: Fn(&D) -> Value, RF: Fn(&mut W)>(
×
1240
        &self,
×
1241
        fmt_func: F,
×
1242
        writer: &mut W,
×
1243
    ) {
×
1244
        self.to_json_rest(fmt_func, writer, None);
×
1245
    }
×
1246

1247
    /// Print a text representation of the graph.
1248
    pub fn print(&self) {
2✔
1249
        println!("DebruijnGraph {{ len: {}, K: {} }} :", self.len(), K::k());
2✔
1250
        for node in self.iter_nodes() {
141✔
1251
            println!("{:?}", node);
141✔
1252
        }
141✔
1253
    }
2✔
1254

1255
    pub fn print_with_data(&self) {
×
1256
        println!("DebruijnGraph {{ len: {}, K: {} }} :", self.len(), K::k());
×
1257
        for node in self.iter_nodes() {
×
1258
            println!("{:?} ({:?})", node, node.data());
×
1259
        }
×
1260
    }
×
1261

1262
    pub fn max_path_beam<F, F2>(&self, beam: usize, score: F, _solid_path: F2) -> Vec<(usize, Dir)>
×
1263
    where
×
1264
        F: Fn(&D) -> f32,
×
1265
        F2: Fn(&D) -> bool,
×
1266
    {
1267
        if self.is_empty() {
×
1268
            return Vec::default();
×
1269
        }
×
1270

1271
        let mut states = Vec::new();
×
1272

1273
        for i in 0..self.len() {
×
1274
            let node = self.get_node(i);
×
1275

1276
            // Initialize beam search on terminal nodes
1277
            if node.exts().num_exts_l() == 0 || node.exts().num_exts_r() == 0 {
×
1278
                let dir = if node.exts().num_exts_l() > 0 {
×
1279
                    Dir::Right
×
1280
                } else {
1281
                    Dir::Left
×
1282
                };
1283

1284
                let status = if node.exts().num_exts_l() == 0 && node.exts().num_exts_r() == 0 {
×
1285
                    Status::End
×
1286
                } else {
1287
                    Status::Active
×
1288
                };
1289

1290
                let mut path = SmallVec8::new();
×
1291
                path.push((i as u32, dir));
×
1292

1293
                let s = State {
×
1294
                    path,
×
1295
                    status,
×
1296
                    score: score(node.data()),
×
1297
                };
×
1298
                states.push(s);
×
1299
            }
×
1300
        }
1301

1302
        // No end nodes -- just start on the first node
1303
        if states.is_empty() {
×
1304
            // Make a start
×
1305
            let node = self.get_node(0);
×
1306
            let mut path = SmallVec8::new();
×
1307
            path.push((0, Dir::Left));
×
1308
            states.push(State {
×
1309
                path,
×
1310
                status: Status::Active,
×
1311
                score: score(node.data()),
×
1312
            });
×
1313
        }
×
1314

1315
        // Beam search until we can't find any more expansions
1316
        let mut active = true;
×
1317
        while active {
×
1318
            let mut new_states = Vec::with_capacity(states.len());
×
1319
            active = false;
×
1320

1321
            for s in states {
×
1322
                if s.status == Status::Active {
×
1323
                    active = true;
×
1324
                    let expanded = self.expand_state(&s, &score);
×
1325
                    new_states.extend(expanded);
×
1326
                } else {
×
1327
                    new_states.push(s)
×
1328
                }
1329
            }
1330

1331
            // workaround to sort by descending score - will panic if there are NaN scores
1332
            new_states.sort_by(|a, b| (-(a.score)).partial_cmp(&-(b.score)).unwrap());
×
1333
            new_states.truncate(beam);
×
1334
            states = new_states;
×
1335
        }
1336

1337
        for (i, state) in states.iter().take(5).enumerate() {
×
1338
            trace!("i:{}  -- {:?}", i, state);
×
1339
        }
1340

1341
        // convert back to using usize for node_id
1342
        states[0]
×
1343
            .path
×
1344
            .iter()
×
1345
            .map(|&(node, dir)| (node as usize, dir))
×
1346
            .collect()
×
1347
    }
×
1348

1349
    fn expand_state<F>(&self, state: &State, score: &F) -> SmallVec4<State>
×
1350
    where
×
1351
        F: Fn(&D) -> f32,
×
1352
    {
1353
        if state.status != Status::Active {
×
1354
            panic!("only attempt to expand active states")
×
1355
        }
×
1356

1357
        let (node_id, dir) = state.path[state.path.len() - 1];
×
1358
        let node = self.get_node(node_id as usize);
×
1359
        let mut new_states = SmallVec4::new();
×
1360

1361
        for (_, next_node_id, incoming_dir, _) in node.edges(dir.flip()) {
×
1362
            let next_node = self.get_node(next_node_id);
×
1363
            let new_score = state.score + score(next_node.data());
×
1364

1365
            let cycle = state
×
1366
                .path
×
1367
                .iter()
×
1368
                .any(|&(prev_node, _)| prev_node == (next_node_id as u32));
×
1369

1370
            let status = if cycle {
×
1371
                Status::Cycle
×
1372
            } else if next_node.edges(incoming_dir.flip()).is_empty() {
×
1373
                Status::End
×
1374
            } else {
1375
                Status::Active
×
1376
            };
1377

1378
            let mut new_path = state.path.clone();
×
1379
            new_path.push((next_node_id as u32, incoming_dir));
×
1380

1381
            let next_state = State {
×
1382
                path: new_path,
×
1383
                score: new_score,
×
1384
                status,
×
1385
            };
×
1386

1387
            new_states.push(next_state);
×
1388
        }
1389

1390
        new_states
×
1391
    }
×
1392

1393

1394
    pub fn iter_components(&self) -> IterComponents<K, D> {
5✔
1395
        let mut visited: Vec<bool> = Vec::with_capacity(self.len());
5✔
1396
        let pos = 0;
5✔
1397

1398
        for _i in 0..self.len() {
33,214✔
1399
            visited.push(false);
33,214✔
1400
        }
33,214✔
1401

1402
        IterComponents { 
5✔
1403
            graph: self, 
5✔
1404
            visited, 
5✔
1405
            pos }
5✔
1406
    }
5✔
1407

1408

1409
    /// iteratively returns 2D Vec with node_ids grouped according to the connected components they form
1410
    pub fn components_i(&self) -> Vec<Vec<usize>> {
1✔
1411
        let mut components: Vec<Vec<usize>> = Vec::with_capacity(self.len());
1✔
1412
        let mut visited: Vec<bool> = Vec::with_capacity(self.len());
1✔
1413

1414
        for _i in 0..self.len() {
139✔
1415
            visited.push(false);
139✔
1416
        }
139✔
1417

1418
        for i in 0..self.len() {
139✔
1419
            if !visited[i] {
139✔
1420
                let comp = self.component_i(&mut visited, i);
2✔
1421
                components.push(comp);
2✔
1422
            }
137✔
1423
        }
1424

1425
        components
1✔
1426
    }
1✔
1427

1428
    /// recursively detects which nodes form separate graph components
1429
    /// returns 2D vector with node ids per component
1430
    /// (may lead to stack overflow)
1431
    pub fn components_r(&self) -> Vec<Vec<usize>> {
2✔
1432
        let mut components: Vec<Vec<usize>> = Vec::with_capacity(self.len());
2✔
1433
        let mut visited: Vec<bool> = Vec::with_capacity(self.len());
2✔
1434

1435
        for _i in 0..self.len() {
141✔
1436
            visited.push(false);
141✔
1437
        }
141✔
1438

1439
        for i in 0..self.len() {
141✔
1440
            if !visited[i] {
141✔
1441
                let mut comp: Vec<usize> = Vec::new();
4✔
1442
                self.component_r(&mut visited, i, &mut comp);
4✔
1443
                components.push(comp);
4✔
1444
            }
137✔
1445
        }
1446

1447
        components
2✔
1448

1449
    }
2✔
1450

1451
    fn component_r<'a>(&'a self, visited: &'a mut Vec<bool>, i: usize, comp: &'a mut Vec<usize>) {
141✔
1452
        
1453
        visited[i] = true;
141✔
1454
        comp.push(i);
141✔
1455
        let mut edges = self.find_edges(i, Dir::Left);
141✔
1456
        let mut r_edges = self.find_edges(i, Dir::Right);
141✔
1457

1458
        edges.append(&mut r_edges);
141✔
1459

1460
        for (_, edge, _, _) in edges.iter() {
274✔
1461
            if !visited[*edge] {
274✔
1462
                self.component_r(visited, *edge, comp);
137✔
1463
            }
137✔
1464
        }
1465
    }
141✔
1466

1467
    fn component_i<'a>(&'a self, visited: &'a mut [bool], i: usize) -> Vec<usize> {
243✔
1468
        let mut edges: Vec<usize> = Vec::new();
243✔
1469
        let mut comp: Vec<usize> = Vec::new();
243✔
1470

1471
        edges.push(i);
243✔
1472

1473
        while let Some(current_edge) = edges.pop() {
33,627✔
1474
            if !visited[current_edge] { 
33,384✔
1475
                comp.push(current_edge);
33,353✔
1476
                visited[current_edge] = true;
33,353✔
1477

1478
                let mut l_edges = self.find_edges(current_edge, Dir::Left);
33,353✔
1479
                let mut r_edges = self.find_edges(current_edge, Dir::Right);
33,353✔
1480

1481
                l_edges.append(&mut r_edges);
33,353✔
1482

1483
                for (_, new_edge, _, _) in l_edges.into_iter() {
66,286✔
1484
                    if !visited[new_edge] {
66,286✔
1485
                        edges.push(new_edge);
33,141✔
1486
                    }
33,145✔
1487
                }
1488
            }
31✔
1489
        }
1490
        comp
243✔
1491
    }
243✔
1492

1493
    /// iterate over all edges of the graph, item: (node, ext base, ext dir, target node)
1494
    pub fn iter_edges(&self) -> EdgeIter<'_, K, D> {
8✔
1495
        EdgeIter::new(self)
8✔
1496
    }
8✔
1497

1498
    pub fn find_bad_nodes<F: Fn(&Node<'_, K, D>) -> bool>(&self, valid: F) -> Vec<usize> {
×
1499
        let mut bad_nodes = Vec::new();
×
1500

1501
        for (i, node) in enumerate(self.iter_nodes()) {
×
1502
            if !valid(&node) { bad_nodes.push(i); }
×
1503
        }
1504
        
1505
        bad_nodes
×
1506
    }
×
1507
}
1508

1509
impl<K: Kmer, SD: Debug> DebruijnGraph<K, SD> {
1510
    pub fn create_colors<'a, 'b: 'a, DI>(&'a self, config: &SummaryConfig, color_mode: ColorMode<'b>) -> Colors<'b, SD, DI> 
×
1511
    where 
×
1512
    SD: SummaryData<DI>,
×
1513
    {
1514
        Colors::new(self, config, color_mode)
×
1515
    }
×
1516
    
1517
    /// edge mults will contain hanging edges if the nodes were filtered
1518
    pub fn fix_edge_mults<DI>(&mut self) 
2✔
1519
    where 
2✔
1520
        SD: SummaryData<DI>
2✔
1521
    {
1522
        if self.get_node(0).data().edge_mults().is_some() {
2✔
1523
            for i in 0..self.len() {
278✔
1524
                self.base.data[i].fix_edge_mults(self.base.exts[i]);
278✔
1525
            }
278✔
UNCOV
1526
        }
×
1527
    }
2✔
1528

1529
    /// if there are edge mults in the data, prune the graph by removing edges that have a low coverage
1530
    pub fn filter_edges<DI>(&mut self, min: u32) -> Result<(), String>
1✔
1531
    where 
1✔
1532
        SD: SummaryData<DI>
1✔
1533
    {
1534
        // return if there is no edge coverage available
1535
        if self.get_node(0).data().edge_mults().is_none() { return Err(String::from("no edge mults available")) };
1✔
1536

1537
        for i in 0..self.len() {
139✔
1538
            let em = self.get_node(i).data().edge_mults().expect("shold have em").clone();
139✔
1539
            let edges = [(Dir::Left, 0), (Dir::Left, 1), (Dir::Left, 2), (Dir::Left, 3), 
139✔
1540
                (Dir::Right, 0), (Dir::Right, 1), (Dir::Right, 2), (Dir::Right, 3)];
139✔
1541
            
1542
            for (dir, base) in edges {
1,251✔
1543
                if min > em.edge_mult(base, dir) {
1,112✔
1544
                    // remove invalid ext from node
838✔
1545
                    let ext = self.base.exts[i].remove(dir, base);
838✔
1546
                    self.base.exts[i] = ext;
838✔
1547
                }
838✔
1548
            }
1549
        }
1550

1551
        // now that exts are removed, fix hanging edge mults
1552
        self.fix_edge_mults();
1✔
1553

1554
        Ok(())
1✔
1555
    }
1✔
1556

1557
    /// remove simple ladder structure caused by 1 base sequencing errors from the graph
1558
    /// graph must contain edge mults and be stranded
1559
    pub fn remove_ladders<DI, P>(&mut self, min_diff_factor: u32, out_path: Option<P>) -> Result<(), String> 
2✔
1560
    where 
2✔
1561
        SD: SummaryData<DI>,
2✔
1562
        P: AsRef<Path>
2✔
1563
    {
1564
        // interrupt if we dont have edge mults
1565
        if self.get_node(0).data().edge_mults().is_none() { return Err(String::from("no edge mults available")) };
2✔
1566
        if !self.base.stranded { return Err(String::from("must be stranded")) };
2✔
1567

1568
        let mut writer = out_path.map(|path| BufWriter::new(File::create(path).expect("error creating ladder stats file")));
2✔
1569
        if let Some(wtr) = writer.as_mut() {
2✔
1570
            writeln!(wtr, "avg high cov,avg low cov,log2(correct high/correct low)").unwrap();
2✔
1571
        }
2✔
1572

1573
        // iterate over nodes
1574
        for node_id in 0..self.len() {
87✔
1575
            // check if node has outgoing edge(s) with both high and low coverage
1576
            let outs = self.get_node(node_id).data().edge_mults().expect("should have em").right();
87✔
1577

1578
            let Some((out_max_base, &out_max_cov)) = outs.iter().rev().enumerate().filter(|&(_, &c)| c  > 0).max_by(|&(_b1, &c1), &(_b2, c2)| c1.cmp(c2)) else { continue };
348✔
1579
            let smaller_outs = outs.iter().copied().rev().enumerate().filter(|&(_b, c)| (c > 0) & (out_max_cov > c)).collect::<Vec<_>>();
340✔
1580

1581
            // TODO add factor back in, remove writer
1582

1583
            if smaller_outs.is_empty() { continue; }
85✔
1584

1585
            // follow path with highest coverage until target length is reached
1586
            let Some((target_node, avg_high_cov, high_cc)) = self.follow_ladder_path_high(node_id, out_max_base as u8, out_max_cov) else { continue; };
4✔
1587
            
1588
            // check all small outs
1589
            for (s_base, s_cov) in smaller_outs {
6✔
1590
                let Some((target_path, avg_low_cov, low_cc)) = self.follow_ladder_path_low(node_id, s_base as u8, s_cov) else { continue; };
3✔
1591
                let other_target_node = *target_path.last().expect("should have at least two/tree elements");
2✔
1592

1593
                if target_node == other_target_node {
2✔
1594
                    // the two paths landed on the same node -> remove all edges in the low coverage path
1595
                    if s_cov * min_diff_factor <= out_max_cov {
2✔
1596
                        self.remove_path(target_path);
2✔
1597
                    }
2✔
1598
                    
1599
                    if let Some(wtr) = writer.as_mut() {
2✔
1600
                        writeln!(wtr, "{},{},{}", avg_high_cov, avg_low_cov, (high_cc as f32/low_cc as f32).log2()).unwrap();
2✔
1601
                    }
2✔
NEW
1602
                }
×
1603
            }
1604
        }
1605

1606
        Ok(())
2✔
1607
    }
2✔
1608

1609
    /// follow the presumably erroneous ladder path with low coverage
1610
    fn follow_ladder_path_low<DI>(&self, start_node_id: usize, start_ext: u8, start_cov: u32) -> Option<(Vec<usize>, f32, usize)> 
3✔
1611
    where SD: SummaryData<DI>
3✔
1612
    {
1613
        const COV_MARGIN: f32 = 0.3;
1614
        const COV_ADD_MARGIN: f32 = 4.;
1615

1616
        // path, including start and target node
1617
        let mut path = Vec::new();
3✔
1618
        path.push(start_node_id);
3✔
1619

1620
        let target_length = 2 * K::k() - 1;
3✔
1621
        let mut path_length = K::k() - 1;
3✔
1622
        let mut n_correct_edges = 0;
3✔
1623

1624
        // get fist next node
1625
        let sequence = self.base.sequences.get(start_node_id);
3✔
1626
        let term_kmer: K = sequence.term_kmer(Dir::Right);
3✔
1627
        let next_kmer = term_kmer.extend(start_ext, Dir::Right);
3✔
1628
        let (mut current_node_id, _, _) = self.find_link(next_kmer, Dir::Right).expect("link should exist"); 
3✔
1629
        path.push(current_node_id);
3✔
1630

1631
        if self.check_edge_truth(start_node_id, current_node_id) {
3✔
NEW
1632
            n_correct_edges += 1;
×
1633
        }
3✔
1634

1635
        let mut current_cov = start_cov as f32;
3✔
1636
        let mut sum_path_cov = start_cov as f32;
3✔
1637
        let mut coverage_counter = 1;
3✔
1638

1639

1640
        loop {
1641
            // check if current node is "target node", i.e., the path has reached the desired length
1642
            match path_length {
35✔
1643
                len if len == target_length => return Some((path, (sum_path_cov / coverage_counter as f32), n_correct_edges)), // path has target length
35✔
1644
                len if len > target_length => return None, // path has surpassed desired length => invalid
33✔
1645
                _ => () // path has not yet reached desired length, continue
33✔
1646
            }
1647

1648
            // since current node is not target node, add length to path
1649
            let len = self.get_node(current_node_id).len() - (K::k() - 1);
33✔
1650
            path_length += len;
33✔
1651

1652
            // get current node
1653
            let current_node = self.get_node(current_node_id);
33✔
1654

1655
            // low path nodes should have only one incoming and one outgoing edge
1656
            let in_edges = current_node.l_edges();
33✔
1657
            if in_edges.len() != 1 { return None; }
33✔
1658

1659
            let out_edges = current_node.r_edges();
32✔
1660
            if out_edges.len() != 1 { return None; }
32✔
1661

1662
            // if edge cov avalable, check for similarity
1663
            let (out_ext, out_node_id, _, _) = out_edges[0];
32✔
1664
            if let Some(em) = current_node.data().edge_mults() {
32✔
1665
                let coverage = em.edge_mult(out_ext, Dir::Right) as f32;
32✔
1666
                if coverage < current_cov - current_cov * COV_MARGIN - COV_ADD_MARGIN // extra two for small values
32✔
1667
                    || coverage > current_cov + current_cov * COV_MARGIN + COV_ADD_MARGIN {
32✔
NEW
1668
                    return None;
×
1669
                } else {
32✔
1670
                    current_cov = coverage;
32✔
1671
                }
32✔
NEW
1672
            }
×
1673

1674
            // try to check if edge is "correct", add extra length of current node to it to account for compression
1675
            if self.check_edge_truth(current_node_id, out_node_id) {
32✔
NEW
1676
                n_correct_edges += len;
×
1677
            }
32✔
1678

1679
            // set current node id to next node to be visited
1680
            current_node_id = out_node_id;
32✔
1681
            sum_path_cov += current_cov;
32✔
1682
            coverage_counter += 1;
32✔
1683
            // add current node to path
1684
            path.push(current_node_id);
32✔
1685
        }
1686
    }
3✔
1687

1688
    /// follow a path of the length 2*k - 1 by choosing the edges with the hightest coverage
1689
    /// requres the graph to have edge mults
1690
    fn follow_ladder_path_high<DI>(&self, start_node_id: usize, start_ext: u8, start_cov: u32) -> Option<(usize, f32, usize)> 
4✔
1691
    where SD: SummaryData<DI>
4✔
1692
    {
1693

1694
        let target_length = 2 * K::k() - 1;
4✔
1695
        let mut path_length = K::k() - 1;
4✔
1696
        let mut sum_path_cov = start_cov; 
4✔
1697
        let mut coverage_counter = 1;
4✔
1698
        let mut n_correct_edges = 1; 
4✔
1699

1700
        // get fist next node
1701
        let sequence = self.base.sequences.get(start_node_id);
4✔
1702
        let term_kmer: K = sequence.term_kmer(Dir::Right);
4✔
1703
        let next_kmer = term_kmer.extend(start_ext, Dir::Right);
4✔
1704
        let (mut current_node_id, _, _) = self.find_link(next_kmer, Dir::Right).expect("link should exist"); 
4✔
1705

1706
        if self.check_edge_truth(start_node_id, current_node_id) {
4✔
1707
            n_correct_edges += 1;
4✔
1708
        }
4✔
1709

1710
        loop {
1711
            // check if current node is "target node", i.e., the path has reached the desired length
1712
            match path_length {
39✔
1713
                pl if pl == target_length => return Some((current_node_id, (sum_path_cov as f32 / coverage_counter as f32), n_correct_edges)), // path has target length
39✔
1714
                pl if pl > target_length => return None, // path has surpassed desired length => invalid
36✔
1715
                _ => () // path has not yet reached desired length, continue
35✔
1716
            }
1717

1718
            // since current node is not target node, add length to path
1719
            let len = self.get_node(current_node_id).len() - (K::k() - 1);
35✔
1720
            path_length += len;
35✔
1721

1722
            // get current node
1723
            let current_node = self.get_node(current_node_id);
35✔
1724
        
1725
            // find outgoing edge with highest coverage
1726
            let em = current_node.data().edge_mults().expect("must have edge mults").right();
35✔
1727
            let (max_cov_base, &max_cov) = em.iter().rev().enumerate().filter(|&(_, &c)| c > 0).max_by(|&(_b1, &c1), &(_b2, c2)| c1.cmp(c2))?;
140✔
1728

1729
            // get next node id
1730
            let sequence = self.base.sequences.get(current_node_id);
35✔
1731
            let term_kmer: K = sequence.term_kmer(Dir::Right);
35✔
1732
            let next_kmer = term_kmer.extend(max_cov_base as u8, Dir::Right);
35✔
1733
            let (out_node_id, _, _) = self.find_link(next_kmer, Dir::Right).expect("link should exist"); 
35✔
1734

1735
            // try to check if edge is "correct", add extra length of current node to it to account for compression
1736
            if self.check_edge_truth(current_node_id, out_node_id) {
35✔
1737
                n_correct_edges += len;
35✔
1738
            }
35✔
1739

1740
            // set current node id to next node to be visited
1741
            current_node_id = out_node_id;
35✔
1742
            sum_path_cov += max_cov;
35✔
1743
            coverage_counter += 1;
35✔
1744
        }
1745
    }
4✔
1746

1747
    /// remove tips from the graph, reqires edge mults and stranded
1748
    pub fn remove_tips<DI, P>(&mut self, min_diff_factor: u32, out_path: Option<P>) -> Result<(), String> 
2✔
1749
    where 
2✔
1750
        SD: SummaryData<DI>,
2✔
1751
        P: AsRef<Path>
2✔
1752
    {
1753
        // interrupt if we dont have edge mults
1754
        if self.get_node(0).data().edge_mults().is_none() { return Err(String::from("no edge mults available")) };
2✔
1755
        if !self.base.stranded { return Err(String::from("must be stranded")) };
2✔
1756

1757
        let mut writer = out_path.map(|path| BufWriter::new(File::create(path).expect("error creating ladder stats file")));
2✔
1758
        if let Some(wtr) = writer.as_mut() {
2✔
1759
            writeln!(wtr, "high cov,avg low cov,max true,tip truth ratio,tip len").unwrap();
2✔
1760
        }
2✔
1761

1762
        let max_len = K::k();
2✔
1763

1764
        // iterate over nodes
1765
        for node_id in 0..self.len() {
60✔
1766
            let current_node = self.get_node(node_id);
60✔
1767
            // check if node has outgoing edge(s) with both high and low coverage
1768
            let outs = current_node.data().edge_mults().expect("should have em").right();
60✔
1769
            let Some((out_max_base, &out_max_cov)) = outs.iter().rev().enumerate().filter(|&(_, &c)| c  > 0).max_by(|&(_b1, &c1), &(_b2, c2)| c1.cmp(c2)) else { continue };
240✔
1770
            let smaller_outs = outs.iter().copied().rev().enumerate().filter(|&(_b, c)| (c > 0) & (out_max_cov > c)).collect::<Vec<_>>();
224✔
1771
            
1772
            if smaller_outs.is_empty() { continue; }
56✔
1773

1774
            // try and check if max cov edge is correct
1775
            let max_connection_correct = {
2✔
1776
                let sequence = self.base.sequences.get(node_id);
2✔
1777
                let term_kmer: K = sequence.term_kmer(Dir::Right);
2✔
1778
                let next_kmer = term_kmer.extend(out_max_base as u8, Dir::Right);
2✔
1779
                let (next_node_id, _, _) = self.find_link(next_kmer, Dir::Right).expect("missing link");
2✔
1780

1781
                self.check_edge_truth(node_id, next_node_id)
2✔
1782
            }; // if current node is incorrect, connection is incorrect in any case
1783
            
1784
            // check all small outs
1785
            for (s_base, s_cov) in smaller_outs {
4✔
1786
                // path has to be short + unambiguous + consistently low coverage
1787
                let Some((tip_path, avg_tip_coverage, truth_ratio, tip_len)) = self.follow_tip_path(node_id, s_base as u8, s_cov) else { continue; };
2✔
1788

1789
                // remove path if coverage below threshold
1790
                if (s_cov * min_diff_factor <= out_max_cov) & (tip_len <= max_len ) {
2✔
1791
                    self.remove_path(tip_path);
2✔
1792
                }
2✔
1793
                    
1794
                if let Some(wtr) = writer.as_mut() {
2✔
1795
                    writeln!(wtr, "{},{},{},{},{}", out_max_cov, avg_tip_coverage, max_connection_correct, truth_ratio, tip_len).unwrap();
2✔
1796
                }
2✔
1797
            }
1798
        }
1799

1800
        Ok(())
2✔
1801
    }
2✔
1802

1803
    /// follow a tip path, returns path, average coverage and ration of correct to incorrect connections
1804
    fn follow_tip_path<DI>(&self, start_node_id: usize, start_ext: u8, start_cov: u32) -> Option<(Vec<usize>, f32, f32, usize)> 
2✔
1805
    where 
2✔
1806
        SD: SummaryData<DI>
2✔
1807
    {
1808
        const COV_MARGIN: f32 = 0.3;
1809
        const COV_ADD_MARGIN: f32 = 4.;
1810

1811
        // path, including start and target node
1812
        let mut path = Vec::new();
2✔
1813
        path.push(start_node_id);
2✔
1814

1815
        let mut path_length = 0;
2✔
1816
        let mut n_correct_edges = 0;
2✔
1817

1818
        // get fist next node
1819
        let sequence = self.base.sequences.get(start_node_id);
2✔
1820
        let term_kmer: K = sequence.term_kmer(Dir::Right);
2✔
1821
        let next_kmer = term_kmer.extend(start_ext, Dir::Right);
2✔
1822
        let (mut current_node_id, _, _) = self.find_link(next_kmer, Dir::Right).expect("link should exist"); 
2✔
1823
        path.push(current_node_id);
2✔
1824

1825
        if self.check_edge_truth(start_node_id, current_node_id) {
2✔
NEW
1826
            n_correct_edges += 1;
×
1827
        }
2✔
1828

1829
        let mut current_cov = start_cov as f32;
2✔
1830
        let mut sum_path_cov = start_cov as f32;
2✔
1831
        let mut coverage_counter = 1;
2✔
1832

1833
        loop {
1834
            println!("path: {:?}", path);
8✔
1835
        
1836
            // get current node
1837
            let current_node = self.get_node(current_node_id);
8✔
1838

1839
            // add length to path
1840
            let len = current_node.len() - (K::k() - 1);
8✔
1841
            path_length += len;
8✔
1842

1843
            // low path nodes should have only one incoming and one outgoing edge
1844
            let in_edges = current_node.l_edges();
8✔
1845
            if in_edges.len() != 1 { return None; }
8✔
1846

1847
            let out_edges = current_node.r_edges();
8✔
1848
            match out_edges.len() {
8✔
1849
                oe if oe > 1 => return None,
8✔
1850
                0 => return Some((path, (sum_path_cov / coverage_counter as f32), (n_correct_edges as f32 / path_length as f32), path_length)),
2✔
1851
                _ => ()
6✔
1852
            }
1853

1854
            // if edge cov avalable, check for similarity
1855
            let (out_ext, out_node_id, _, _) = out_edges[0];
6✔
1856
            if let Some(em) = current_node.data().edge_mults() {
6✔
1857
                let coverage = em.edge_mult(out_ext, Dir::Right) as f32;
6✔
1858
                if coverage < current_cov - current_cov * COV_MARGIN - COV_ADD_MARGIN // extra two for small values
6✔
1859
                    || coverage > current_cov + current_cov * COV_MARGIN + COV_ADD_MARGIN {
6✔
NEW
1860
                    return None;
×
1861
                } else {
6✔
1862
                    current_cov = coverage;
6✔
1863
                }
6✔
NEW
1864
            }
×
1865

1866
            // try to check if edge is "correct", add extra length of current node to it to account for compression
1867
            if self.check_edge_truth(current_node_id, out_node_id) {
6✔
NEW
1868
                n_correct_edges += len;
×
1869
            }
6✔
1870

1871
            // set current node id to next node to be visited
1872
            current_node_id = out_node_id;
6✔
1873
            sum_path_cov += current_cov;
6✔
1874
            coverage_counter += 1;
6✔
1875
            // add current node to path
1876
            path.push(current_node_id);
6✔
1877

1878
        }
1879
    }
2✔
1880

1881
    /// remove the edges (not the nodes) of a path in the graph
1882
    fn remove_path(&mut self, path: Vec<usize>) {
4✔
1883
        let mut path_iter = path.into_iter();
4✔
1884
        let Some(mut current_node_id) = path_iter.next() else { return; };
4✔
1885

1886
        loop {
1887
            // get next node id
1888
            let Some(next_node_id) = path_iter.next() else { return }; // whole path has been covered
31✔
1889
            // remove ext to the right of current node
1890
            let (out_base, _, _, _) = *self.get_node(current_node_id).r_edges().iter().find(|&(_, id, _, _)| *id == next_node_id).expect("incorrect path");
27✔
1891
            self.base.exts[current_node_id] = self.base.exts[current_node_id].remove(Dir::Right, out_base);
27✔
1892

1893
            // remove ext to the left of the next node
1894
            let (in_base, _, _, _) = *self.get_node(next_node_id).l_edges().iter().find(|&(_, id, _, _)| *id == current_node_id).expect("incorrect path");
27✔
1895
            self.base.exts[next_node_id] = self.base.exts[next_node_id].remove(Dir::Left, in_base); 
27✔
1896

1897
            current_node_id = next_node_id;
27✔
1898
        }
1899
    }
4✔
1900

1901
    /// use mapped ids to check if the nodes of an edge were mapped to the same id
1902
    /// returns false if mapped ids are not available
1903
    pub fn check_edge_truth<DI>(&self, node_id_1: usize, node_id_2: usize) -> bool
84✔
1904
    where 
84✔
1905
        SD: SummaryData<DI>
84✔
1906
    {
1907
        let mapped_ids_1 = self.get_node(node_id_1).data().mapped_ids();
84✔
1908
        let mapped_ids_2 = self.get_node(node_id_2).data().mapped_ids();
84✔
1909

1910
        if let (Some(mids1), Some(mids2)) = (mapped_ids_1, mapped_ids_2) {
84✔
1911
            let hashed_mi2 = mids2.iter().copied().collect::<HashSet<_>>();
84✔
1912
            for mid in mids1 {
89✔
1913
                if hashed_mi2.contains(mid) {
46✔
1914
                    return true;
41✔
1915
                }
5✔
1916
            }
NEW
1917
        }
×
1918

1919
        false
43✔
1920
    }
84✔
1921
}
1922

1923

1924
#[derive(Debug, Eq, PartialEq)]
1925
enum Status {
1926
    Active,
1927
    End,
1928
    Cycle,
1929
}
1930

1931
#[derive(Debug)]
1932
struct State {
1933
    path: SmallVec8<(u32, Dir)>,
1934
    score: f32,
1935
    status: Status,
1936
}
1937

1938
impl State {}
1939

1940
/// Iterator over nodes in a `DeBruijnGraph`
1941
pub struct NodeIter<'a, K: Kmer + 'a, D: Debug + 'a> {
1942
    graph: &'a DebruijnGraph<K, D>,
1943
    node_id: usize,
1944
}
1945

1946
impl<'a, K: Kmer + 'a, D: Debug + 'a> Iterator for NodeIter<'a, K, D> {
1947
    type Item = Node<'a, K, D>;
1948

1949
    fn next(&mut self) -> Option<Node<'a, K, D>> {
539,479✔
1950
        if self.node_id < self.graph.len() {
539,479✔
1951
            let node = self.graph.get_node(self.node_id);
539,350✔
1952
            self.node_id += 1;
539,350✔
1953
            Some(node)
539,350✔
1954
        } else {
1955
            None
129✔
1956
        }
1957
    }
539,479✔
1958
}
1959

1960
impl<'a, K: Kmer + 'a, D: Debug + 'a> IntoIterator for &'a DebruijnGraph<K, D> {
1961
    type Item = NodeKmer<'a, K, D>;
1962
    type IntoIter = NodeIntoIter<'a, K, D>;
1963

1964
    fn into_iter(self) -> Self::IntoIter {
2,232✔
1965
        NodeIntoIter {
2,232✔
1966
            graph: self,
2,232✔
1967
            node_id: 0,
2,232✔
1968
        }
2,232✔
1969
    }
2,232✔
1970
}
1971

1972
/// Iterator over nodes in a `DeBruijnGraph`
1973
pub struct NodeIntoIter<'a, K: Kmer + 'a, D: Debug + 'a> {
1974
    graph: &'a DebruijnGraph<K, D>,
1975
    node_id: usize,
1976
}
1977

1978
impl<'a, K: Kmer + 'a, D: Debug + 'a> Iterator for NodeIntoIter<'a, K, D> {
1979
    type Item = NodeKmer<'a, K, D>;
1980

1981
    fn next(&mut self) -> Option<Self::Item> {
230,178✔
1982
        if self.node_id < self.graph.len() {
230,178✔
1983
            let node_id = self.node_id;
230,178✔
1984
            let node = self.graph.get_node(node_id);
230,178✔
1985
            let node_seq = node.sequence();
230,178✔
1986

1987
            self.node_id += 1;
230,178✔
1988
            Some(NodeKmer {
230,178✔
1989
                node_id,
230,178✔
1990
                node_seq_slice: node_seq,
230,178✔
1991
                phantom_d: PhantomData,
230,178✔
1992
                phantom_k: PhantomData,
230,178✔
1993
            })
230,178✔
1994
        } else {
1995
            None
×
1996
        }
1997
    }
230,178✔
1998
}
1999

2000
/// A `DebruijnGraph` node with a reference to the sequence of the node.
2001
#[derive(Clone)]
2002
pub struct NodeKmer<'a, K: Kmer + 'a, D: Debug + 'a> {
2003
    pub node_id: usize,
2004
    node_seq_slice: DnaStringSlice<'a>,
2005
    phantom_k: PhantomData<K>,
2006
    phantom_d: PhantomData<D>,
2007
}
2008

2009
/// An iterator over the kmers in a `DeBruijn graph node`
2010
pub struct NodeKmerIter<'a, K: Kmer + 'a, D: Debug + 'a> {
2011
    kmer_id: usize,
2012
    kmer: K,
2013
    num_kmers: usize,
2014
    node_seq_slice: DnaStringSlice<'a>,
2015
    phantom_k: PhantomData<K>,
2016
    phantom_d: PhantomData<D>,
2017
}
2018

2019
impl<'a, K: Kmer + 'a, D: Debug + 'a> IntoIterator for NodeKmer<'a, K, D> {
2020
    type Item = K;
2021
    type IntoIter = NodeKmerIter<'a, K, D>;
2022

2023
    fn into_iter(self) -> Self::IntoIter {
460,356✔
2024
        let num_kmers = self.node_seq_slice.len() - K::k() + 1;
460,356✔
2025

2026
        let kmer = if num_kmers > 0 {
460,356✔
2027
            self.node_seq_slice.get_kmer::<K>(0)
460,356✔
2028
        } else {
2029
            K::empty()
×
2030
        };
2031

2032
        NodeKmerIter {
460,356✔
2033
            kmer_id: 0,
460,356✔
2034
            kmer,
460,356✔
2035
            num_kmers,
460,356✔
2036
            node_seq_slice: self.node_seq_slice,
460,356✔
2037
            phantom_k: PhantomData,
460,356✔
2038
            phantom_d: PhantomData,
460,356✔
2039
        }
460,356✔
2040
    }
460,356✔
2041
}
2042

2043
impl<'a, K: Kmer + 'a, D: Debug + 'a> Iterator for NodeKmerIter<'a, K, D> {
2044
    type Item = K;
2045

2046
    fn next(&mut self) -> Option<Self::Item> {
3,595,956✔
2047
        if self.num_kmers == self.kmer_id {
3,595,956✔
2048
            None
×
2049
        } else {
2050
            let current_kmer = self.kmer;
3,595,956✔
2051

2052
            self.kmer_id += 1;
3,595,956✔
2053
            if self.kmer_id < self.num_kmers {
3,595,956✔
2054
                let next_base = self.node_seq_slice.get(self.kmer_id + K::k() - 1);
3,536,348✔
2055
                let new_kmer = self.kmer.extend_right(next_base);
3,536,348✔
2056
                self.kmer = new_kmer;
3,536,348✔
2057
            }
3,536,348✔
2058

2059
            Some(current_kmer)
3,595,956✔
2060
        }
2061
    }
3,595,956✔
2062

2063
    fn size_hint(&self) -> (usize, Option<usize>) {
230,178✔
2064
        (self.num_kmers, Some(self.num_kmers))
230,178✔
2065
    }
230,178✔
2066

2067
    /// Provide a 'fast-forward' capability for this iterator
2068
    /// MPHF will use this to reduce the number of kmers that
2069
    /// need to be produced.
2070
    fn nth(&mut self, n: usize) -> Option<Self::Item> {
2,591,804✔
2071
        if n <= 4 {
2,591,804✔
2072
            // for small skips forward, shift one base at a time
2073
            for _ in 0..n {
2,328,110✔
2074
                self.next();
1,004,152✔
2075
            }
1,004,152✔
2076
        } else {
263,694✔
2077
            self.kmer_id += n;
263,694✔
2078
            self.kmer = self.node_seq_slice.get_kmer::<K>(self.kmer_id);
263,694✔
2079
        }
263,694✔
2080

2081
        self.next()
2,591,804✔
2082
    }
2,591,804✔
2083
}
2084

2085
/// Marker signifying that NodeKmerIter has a known size.
2086
impl<'a, K: Kmer + 'a, D: Debug + 'a> ExactSizeIterator for NodeKmerIter<'a, K, D> {}
2087

2088
/// Unbranched sequence in the DeBruijn graph
2089
pub struct Node<'a, K: Kmer + 'a, D: 'a> {
2090
    pub node_id: usize,
2091
    pub graph: &'a DebruijnGraph<K, D>,
2092
}
2093

2094
impl<'a, K: Kmer, D: Debug> Node<'a, K, D> {
2095
    /// Length of the sequence of this node
2096
    pub fn len(&self) -> usize {
1,673,910✔
2097
        self.graph.base.sequences.get(self.node_id).len()
1,673,910✔
2098
    }
1,673,910✔
2099

2100
    pub fn is_empty(&self) -> bool {
×
2101
        self.graph.base.sequences.get(self.node_id).is_empty()
×
2102
    }
×
2103

2104
    /// Sequence of the node
2105
    pub fn sequence(&self) -> DnaStringSlice<'a> {
3,423,503✔
2106
        self.graph.base.sequences.get(self.node_id)
3,423,503✔
2107
    }
3,423,503✔
2108

2109
    /// Reference to auxiliarly data associated with the node
2110
    pub fn data(&self) -> &'a D {
4,011,140✔
2111
        &self.graph.base.data[self.node_id]
4,011,140✔
2112
    }
4,011,140✔
2113

2114
    /// Extension bases from this node
2115
    pub fn exts(&self) -> Exts {
2,434,089✔
2116
        self.graph.base.exts[self.node_id]
2,434,089✔
2117
    }
2,434,089✔
2118

2119
    /// Edges leaving the left side of the node in the format
2120
    /// (base, target_node id, incoming side of target node, whether target node is flipped)
2121
    pub fn l_edges(&self) -> SmallVec4<(u8, usize, Dir, bool)> {
294,437✔
2122
        self.graph.find_edges(self.node_id, Dir::Left)
294,437✔
2123
    }
294,437✔
2124

2125
    /// Edges leaving the right side of the node in the format
2126
    /// (base, target_node id, incoming side of target node, whether target node is flipped)
2127
    pub fn r_edges(&self) -> SmallVec4<(u8, usize, Dir, bool)> {
294,436✔
2128
        self.graph.find_edges(self.node_id, Dir::Right)
294,436✔
2129
    }
294,436✔
2130

2131
    /// Edges leaving the 'dir' side of the node in the format
2132
    /// (base, target_node id, incoming side of target node, whether target node is flipped)
2133
    pub fn edges(&self, dir: Dir) -> SmallVec4<(u8, usize, Dir, bool)> {
433,641✔
2134
        self.graph.find_edges(self.node_id, dir)
433,641✔
2135
    }
433,641✔
2136

2137
    fn to_json<F: Fn(&D) -> Value>(&self, func: &F, f: &mut dyn Write) {
×
2138
        write!(
×
2139
            f,
×
2140
            "{{\"id\":\"{}\",\"L\":{},\"D\":{},\"Se\":\"{:?}\"}}",
×
2141
            self.node_id,
2142
            self.sequence().len(),
×
2143
            (func)(self.data()),
×
2144
            self.sequence(),
×
2145
        )
2146
        .unwrap();
×
2147
    }
×
2148

2149
    fn edges_to_json(&self, f: &mut dyn Write) -> bool {
×
2150
        let mut wrote = false;
×
2151
        let edges = self.r_edges();
×
2152
        for (idx, &(_, id, incoming_dir, _)) in edges.iter().enumerate() {
×
2153
            write!(
×
2154
                f,
×
2155
                "{{\"source\":\"{}\",\"target\":\"{}\",\"D\":\"{}\"}}",
×
2156
                self.node_id,
2157
                id,
2158
                match incoming_dir {
×
2159
                    Dir::Left => "L",
×
2160
                    Dir::Right => "R",
×
2161
                }
2162
            )
2163
            .unwrap();
×
2164

2165
            if idx < edges.len() - 1 {
×
2166
                write!(f, ",").unwrap();
×
2167
            }
×
2168

2169
            wrote = true;
×
2170
        }
2171
        wrote
×
2172
    }
×
2173
}
2174

2175
// TODO make generic instead of u8 (u8 is sufficient for dbg)
2176
impl<K: Kmer, SD: Debug> Node<'_, K, SD>  {
2177
    /// get default format for dot edges based on node data
2178
    pub fn edge_dot_default<DI>(&self, colors: &Colors<SD, DI>, base: u8, incoming_dir: Dir, flipped: bool) -> String 
194,761✔
2179
    where SD: SummaryData<DI>
194,761✔
2180
    {
2181
        // set color based on dir
2182
        let color = match incoming_dir {
194,761✔
2183
            Dir::Left => "blue",
97,593✔
2184
            Dir::Right => "red"
97,168✔
2185
        };
2186
        
2187
        if let Some(em) = self.data().edge_mults() {
194,761✔
2188
            
2189
            let dir = if flipped { 
194,761✔
2190
                incoming_dir 
77,306✔
2191
            } else {
2192
                incoming_dir.flip()
117,455✔
2193
            };
2194

2195
            // set penwidth based on count
2196
            let count = em.edge_mult(base, dir);
194,761✔
2197
            let penwidth = colors.edge_width(count);
194,761✔
2198

2199
            format!("[color={color}, penwidth={penwidth}, label=\"{}: {count}\"]", bits_to_base(base))
194,761✔
2200
        } else {
NEW
2201
            format!("[color={color}, penwidth={}]", colors.edge_width(1)) // since there should be no edge mults, this will return default value
×
2202
        }
2203
    }
194,761✔
2204

2205
    /// get default format for dot nodes, based on node data
2206
    pub fn node_dot_default<DI>(&self, colors: &Colors<SD, DI>, config: &SummaryConfig, translator: &Translator, outline: bool, translate_id_groups: bool) -> String
97,994✔
2207
    where SD: SummaryData<DI>
97,994✔
2208
    {
2209
        // set color based on labels/fold change/p-value
2210
        let color = colors.node_color(self.data(), config, outline);
97,994✔
2211
        let translate_id_groups = if translate_id_groups { colors.id_group_ids() } else { None };
97,994✔
2212

2213
        let data_info = self.data().print(translator, config, translate_id_groups);
97,994✔
2214
        const MIN_TEXT_WIDTH: usize = 40;
2215
        let wrap = if self.len() > MIN_TEXT_WIDTH { self.len() } else { MIN_TEXT_WIDTH };
97,994✔
2216

2217
        let label = textwrap::fill(&format!("id: {}, len: {}, exts: {:?}, seq: {}\n{}", 
97,994✔
2218
            self.node_id,
97,994✔
2219
            self.len(),
97,994✔
2220
            self.exts(),
97,994✔
2221
            self.sequence(),
97,994✔
2222
            data_info
97,994✔
2223
        ), wrap);
97,994✔
2224

2225
        format!("[style=filled, {color}, label=\"{label}\"]")
97,994✔
2226
    }
97,994✔
2227
}
2228

2229
impl<K: Kmer, D> fmt::Debug for Node<'_, K, D>
2230
where
2231
    D: Debug,
2232
{
2233
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
141✔
2234
        write!(
141✔
2235
            f,
141✔
2236
            "Node {{ id:{}, Exts: {:?}, L:{:?} R:{:?}, Seq: {:?}, Data: {:?} }}",
141✔
2237
            self.node_id,
2238
            self.exts(),
141✔
2239
            self.l_edges(),
141✔
2240
            self.r_edges(),
141✔
2241
            self.sequence().len(),
141✔
2242
            self.data()
141✔
2243
        )
2244
    }
141✔
2245
}
2246

2247
pub struct IterComponents<'a, K: Kmer, D> {
2248
    graph: &'a DebruijnGraph<K, D>,
2249
    visited: Vec<bool>,
2250
    pos: usize,
2251
}
2252

2253
impl<K: Kmer, D: Debug> Iterator for IterComponents<'_, K, D> {
2254
    type Item = Vec<usize>;
2255
    fn next(&mut self) -> Option<Self::Item> {
246✔
2256
        while self.pos < self.graph.len() {
33,219✔
2257
            if !self.visited[self.pos] {
33,214✔
2258
                let comp = self.graph.component_i(&mut self.visited, self.pos);
241✔
2259
                self.pos += 1;
241✔
2260
                return Some(comp)
241✔
2261
            } else {
32,973✔
2262
                self.pos += 1;
32,973✔
2263
            }
32,973✔
2264
        }
2265
        assert!(self.visited.iter().map(|x| *x as usize).sum::<usize>() == self.graph.len());
33,214✔
2266
        None
5✔
2267
    }
246✔
2268
    
2269
}
2270

2271
pub struct PathCompIter<'a, K: Kmer, D: Debug, F, F2> 
2272
where 
2273
F: Fn(&D) -> f32,
2274
F2: Fn(&D) -> bool
2275
{
2276
    graph: &'a DebruijnGraph<K, D>,
2277
    component_iterator: IterComponents<'a, K, D>,
2278
    graph_pos: usize,
2279
    score: F,
2280
    solid_path: F2,
2281
}
2282

2283
/// returns the component and the "best" path in the component
2284
impl<K: Kmer, D: Debug, F, F2> Iterator for PathCompIter<'_, K, D, F, F2> 
2285
where 
2286
F: Fn(&D) -> f32,
2287
F2: Fn(&D) -> bool
2288
{
2289
    type Item = (Vec<usize>, VecDeque<(usize, Dir)>,);
2290
    fn next(&mut self) -> Option<Self::Item> {
240✔
2291
        match self.component_iterator.next() {
240✔
2292
            Some(component) => {
237✔
2293
                let current_comp = component;
237✔
2294
                
2295
    
2296
                let mut best_node = current_comp[0];
237✔
2297
                let mut best_score = f32::MIN;
237✔
2298
                for c in current_comp.iter() {
32,936✔
2299
                    let node = self.graph.get_node(*c);
32,936✔
2300
                    let node_score = (self.score)(node.data());
32,936✔
2301
    
2302
                    if node_score > best_score {
32,936✔
2303
                        best_node = *c;
281✔
2304
                        best_score = node_score;
281✔
2305
                    }
32,655✔
2306
                }
2307
    
2308
                let oscore = |state| match state {
60,170✔
2309
                    None => 0.0,
30,003✔
2310
                    Some((id, _)) => (self.score)(self.graph.get_node(id).data()),
30,167✔
2311
                };
60,170✔
2312
    
2313
                let osolid_path = |state| match state {
30,085✔
2314
                    None => false,
×
2315
                    Some((id, _)) => (self.solid_path)(self.graph.get_node(id).data()),
30,085✔
2316
                };
30,085✔
2317
    
2318
                // Now expand in each direction, greedily taking the best path. Stop if we hit a node we've
2319
                // already put into the path
2320
                let mut used_nodes = HashSet::new();
237✔
2321
                let mut path = VecDeque::new();
237✔
2322
    
2323
                // Start w/ initial state
2324
                used_nodes.insert(best_node);
237✔
2325
                path.push_front((best_node, Dir::Left));
237✔
2326
    
2327
                for init in [(best_node, Dir::Left, false), (best_node, Dir::Right, true)].iter() {
474✔
2328
                    let &(start_node, dir, do_flip) = init;
474✔
2329
                    let mut current = (start_node, dir);
474✔
2330
    
2331
                    loop {
2332
                        let mut next = None;
30,474✔
2333
                        let (cur_id, incoming_dir) = current;
30,474✔
2334
                        let node = self.graph.get_node(cur_id);
30,474✔
2335
                        let edges = node.edges(incoming_dir.flip());
30,474✔
2336
    
2337
                        let mut solid_paths = 0;
30,474✔
2338
                        for (_, id, dir, _) in edges {
60,559✔
2339
                            let cand = Some((id, dir));
30,085✔
2340
                            if osolid_path(cand) {
30,085✔
2341
                                solid_paths += 1;
30,085✔
2342

2343
                                // second if clause is outside of first in original code (see max_path) 
2344
                                // but would basically ignore path validity.
2345
                                if oscore(cand) > oscore(next) {
30,085✔
2346
                                    next = cand;
30,041✔
2347
                                }
30,041✔
2348
                            }
×
2349
                        }
2350
                        
2351
                        // break if multiple solid paths are available
2352
                        /* if solid_paths > 1 {
2353
                            break;
2354
                        } */
2355
    
2356
                        match next {
30,003✔
2357
                            Some((next_id, next_incoming)) if !used_nodes.contains(&next_id) => {
30,003✔
2358
                                if do_flip {
30,000✔
2359
                                    path.push_front((next_id, next_incoming.flip()));
15,041✔
2360
                                } else {
15,041✔
2361
                                    path.push_back((next_id, next_incoming));
14,959✔
2362
                                }
14,959✔
2363
    
2364
                                used_nodes.insert(next_id);
30,000✔
2365
                                current = (next_id, next_incoming);
30,000✔
2366
                            }
2367
                            _ => break,
474✔
2368
                        }
2369
                    }
2370
                }
2371
                
2372
                
2373
                Some((current_comp, path))
237✔
2374
            }, 
2375
            None => {
2376
                // should technically not need graph_pos after this 
2377
                self.graph_pos += 1;
3✔
2378
                None
3✔
2379
            }
2380
        }
2381
    }
240✔
2382
}
2383

2384

2385
/// iterator over the edges of the de bruijn graph
2386
pub struct EdgeIter<'a, K: Kmer, D: Debug> {
2387
    graph: &'a DebruijnGraph<K, D>,
2388
    visited_edges: HashSet<(usize, usize)>,
2389
    current_node: usize,
2390
    current_dir: Dir,
2391
    node_edge_iter: smallvec::IntoIter<[(u8, usize, Dir, bool); 4]>
2392
}
2393

2394
impl<K: Kmer, D: Debug> EdgeIter<'_, K, D> {
2395
    pub fn new(graph: &DebruijnGraph<K, D>) -> EdgeIter<'_, K, D>{
8✔
2396
        let node_edge_iter = graph.get_node(0).l_edges().into_iter();
8✔
2397

2398
        EdgeIter { 
8✔
2399
            graph, 
8✔
2400
            visited_edges: HashSet::new(), 
8✔
2401
            current_node: 0, 
8✔
2402
            current_dir: Dir::Left, 
8✔
2403
            node_edge_iter
8✔
2404
        }
8✔
2405
    }
8✔
2406
}
2407

2408
impl<K: Kmer, D: Debug> Iterator for EdgeIter<'_, K, D> {
2409
    type Item = (usize, Dir, u8, usize); // node, direction leaving node, base, target node
2410

2411
    fn next(&mut self) -> Option<Self::Item> {
275✔
2412
        loop {
2413
            if let Some((base, nb_node_id, _, _)) = self.node_edge_iter.next() {
1,122✔
2414
                let edge = if self.current_node > nb_node_id { (nb_node_id, self.current_node) } else { (self.current_node, nb_node_id) };
534✔
2415

2416
                if self.visited_edges.insert(edge) { return Some((self.current_node, self.current_dir, base, nb_node_id)); } // else simply skip and move on
534✔
2417

2418
            } else {
2419
                match self.current_dir {
588✔
2420
                Dir::Left => {
294✔
2421
                    // no left edges, switch to right edges
294✔
2422
                    self.current_dir = Dir::Right;
294✔
2423
                    self.node_edge_iter = self.graph.get_node(self.current_node).r_edges().into_iter();
294✔
2424
                    
294✔
2425
                }
294✔
2426
                Dir::Right => {
2427
                    // no right edges, switch to next node left edges
2428
                    self.current_node += 1;
294✔
2429

2430
                    // quit if end of graph is reached
2431
                    if self.current_node == self.graph.len() { return None }
294✔
2432

2433
                    self.current_dir = Dir::Left;
286✔
2434
                    self.node_edge_iter = self.graph.get_node(self.current_node).l_edges().into_iter();
286✔
2435
                }
2436
            }
2437
            }
2438
            
2439
        }
2440
    }
275✔
2441
}
2442

2443
#[cfg(test)]
2444
mod test {
2445
    use std::{fs::File, io::BufReader};
2446

2447
    use crate::{colors::Colors, compression::{compress_kmers_with_hash, uncompressed_graph, CheckCompress}, filter::filter_kmers, kmer::{Kmer16, Kmer22}, reads::{Reads, ReadsPaired}, serde::SerKmers, summarizer::{IDMapEMData, IDTag, SampleInfo, SummaryConfig, Tag, TagsCountsEMData, TagsCountsSumData, Translator, ID}, Exts};
2448

2449
    use super::DebruijnGraph;
2450
    use crate::{summarizer::SummaryData, Dir, BUF};
2451

2452

2453
    #[test]
2454
    #[cfg(not(feature = "sample128"))]
2455
    fn test_components() {
2456

2457
        let path = "test_data/400.graph.dbg";
2458
        let file = BufReader::with_capacity(BUF, File::open(path).unwrap());
2459

2460
        let (graph, _, _): (DebruijnGraph<Kmer16, TagsCountsSumData>, Vec<String>, crate::summarizer::SummaryConfig) = 
2461
            bincode::deserialize_from(file).expect("error deserializing graph");
2462

2463
        let components = graph.iter_components();
2464

2465
        let check_components = [
2466
            vec![3, 67, 130, 133, 59, 119, 97, 110, 68, 137, 29, 84, 131, 43, 30, 91, 14, 70, 79, 142, 136, 105, 103, 62, 
2467
                141, 104, 134, 88, 38, 81, 108, 92, 135, 96, 116, 121, 63, 124, 106, 129, 132, 126, 93, 109, 83, 112, 118, 
2468
                123, 125, 78, 122, 115, 75, 128, 140, 111, 26, 143, 113],
2469
            vec![41, 138, 100, 139, 86],
2470
            vec![53, 117, 127],
2471
            vec![69, 144, 77, 120, 114, 107, 101],
2472
        ];
2473

2474
        let mut counter = 0;
2475

2476
        for component in components {
2477
            if component.len() > 1 { 
2478
                println!("component: {:?}", component);
2479
                assert_eq!(component, check_components[counter]);
2480
                counter += 1;
2481
            }
2482
        }
2483
        assert_eq!(vec![(139, Dir::Left)], graph.max_path(|data| data.sum().unwrap_or(1) as f32, |_| true));
2484
    }
2485

2486
    #[test]
2487
    #[cfg(not(feature = "sample128"))]
2488
    fn test_iter_edges() {
2489
        let path = "test_data/400.graph.dbg";
2490
        let file = BufReader::with_capacity(BUF, File::open(path).unwrap());
2491

2492
        let (graph, _, _): (DebruijnGraph<Kmer16, TagsCountsSumData>, Vec<String>, crate::summarizer::SummaryConfig) = 
2493
            bincode::deserialize_from(file).expect("error deserializing graph");
2494

2495
        let check_edges = vec![(3, Dir::Left, 2, 134), (3, Dir::Right, 2, 67), (14, Dir::Left, 0, 91), 
2496
            (14, Dir::Right, 0, 70), (26, Dir::Left, 2, 111), (29, Dir::Left, 1, 131), (29, Dir::Left, 3, 84), (29, Dir::Right, 1, 137), 
2497
            (30, Dir::Left, 3, 43), (30, Dir::Right, 2, 91), (38, Dir::Left, 3, 81), (38, Dir::Right, 1, 88), (41, Dir::Left, 0, 138), 
2498
            (43, Dir::Left, 0, 131), (53, Dir::Left, 0, 127), (53, Dir::Right, 1, 117), (59, Dir::Left, 3, 133), (59, Dir::Right, 0, 119), 
2499
            (62, Dir::Left, 3, 119), (62, Dir::Right, 3, 103), (63, Dir::Left, 1, 121), (63, Dir::Right, 0, 124), (67, Dir::Left, 3, 130), 
2500
            (68, Dir::Left, 0, 137), (68, Dir::Right, 3, 110), (69, Dir::Left, 1, 114), (69, Dir::Left, 3, 77), (69, Dir::Right, 1, 144), 
2501
            (70, Dir::Right, 0, 79), (75, Dir::Left, 2, 121), (75, Dir::Right, 2, 128), (77, Dir::Left, 1, 120), (78, Dir::Left, 2, 122), 
2502
            (78, Dir::Right, 2, 125), (79, Dir::Right, 2, 142), (81, Dir::Left, 0, 113), (81, Dir::Right, 0, 143), (81, Dir::Right, 3, 108), 
2503
            (83, Dir::Right, 3, 109), (86, Dir::Left, 1, 139), (88, Dir::Left, 0, 113), (88, Dir::Right, 3, 134), (91, Dir::Left, 3, 141), 
2504
            (92, Dir::Left, 3, 135), (92, Dir::Right, 0, 108), (93, Dir::Left, 0, 109), (93, Dir::Right, 0, 126), (96, Dir::Left, 0, 116), 
2505
            (96, Dir::Right, 0, 135), (97, Dir::Left, 1, 119), (97, Dir::Right, 0, 110), (100, Dir::Left, 2, 139), (100, Dir::Right, 0, 138), 
2506
            (101, Dir::Right, 0, 120), (103, Dir::Right, 3, 105), (104, Dir::Left, 3, 110), (104, Dir::Right, 2, 119), (105, Dir::Right, 0, 136), 
2507
            (106, Dir::Left, 3, 124), (106, Dir::Right, 3, 129), (107, Dir::Right, 0, 120), (111, Dir::Right, 2, 140), (112, Dir::Left, 2, 115), 
2508
            (112, Dir::Left, 3, 118), (112, Dir::Right, 0, 124), (114, Dir::Left, 1, 120), (115, Dir::Right, 0, 123), (116, Dir::Left, 2, 121), 
2509
            (118, Dir::Right, 0, 123), (121, Dir::Right, 2, 132), (123, Dir::Right, 0, 125), (126, Dir::Right, 0, 132), (128, Dir::Left, 1, 140), 
2510
            (129, Dir::Right, 0, 132), (130, Dir::Left, 0, 133), (136, Dir::Right, 3, 142), (138, Dir::Left, 0, 138), (139, Dir::Left, 1, 139)];
2511

2512
        let edges = graph.iter_edges().collect::<Vec<_>>();
2513

2514
        assert_eq!(check_edges, edges);
2515
    }
2516

2517
    // dbg -c ../marbel_datasets/sim_reads_100.csv -s sum --stranded -o ../rust-debruijn/test_data/marbel_100_sum --checkpoint -k 22
2518
    #[cfg(not(feature = "sample128"))]
2519
    const TEST_GRAPH: &str = "test_data/marbel_100_sum.kmers.dbg";
2520

2521
    // cargo run --features sample128 -- -c ../marbel_datasets/sim_reads_100.csv -s sum --stranded -o ../rust-debruijn/test_data/marbel_100_sum_128 --checkpoint -k 22
2522
    #[cfg(feature = "sample128")]
2523
    const TEST_GRAPH: &str = "test_data/marbel_100_sum_128.kmers.dbg";
2524

2525
    #[test]
2526
    fn test_map_transcripts() {
1✔
2527
        // dbg -c ../marbel_datasets/sim_reads_100.csv -s sum --stranded -o ../rust-debruijn/test_data/marbel_100_sum --checkpoint -k 22
2528
        let graph_path = TEST_GRAPH;
1✔
2529
        let t_ref_path = "test_data/marbel_100_tr_ref.fasta";
1✔
2530
        let (kmers, mut translator, _) = SerKmers::<Kmer22, u32>::deserialize_from(graph_path).dissolve();
1✔
2531

2532
        let unc_graph = uncompressed_graph(&kmers, true).finish();
1✔
2533

2534
        let t_map = unc_graph.map_transcripts(t_ref_path, &mut translator).unwrap();
1✔
2535
        assert_eq!(t_map.len(), unc_graph.len());
1✔
2536
        assert_eq!(t_map.iter().filter(|&ids| !ids.is_empty()).collect::<Vec<_>>().len(), 10720);
21,993✔
2537
    }
1✔
2538

2539
    #[test]
2540
    fn test_remove_ladders() {
1✔
2541
        let   correct = "ACGATCGATCGCGATCGTAGCTGACTGCTGACGTCTGACTACTGACTGATGCTAGCTATCGTGAC".as_bytes();
1✔
2542
        let incorrect = "ACGATCGATCGCGATCGTAGCTGACTGCTGACGGCTGACTACTGACTGATGCTAGCTATCGTGAC".as_bytes();
1✔
2543
        let insertion = "ACGATCGATCGCGATCGTAAGCTGACTGCTGACGTCTGACTACTGACTGATGCTAGCTATCGTGAC".as_bytes();
1✔
2544

2545
        let mut reads = Reads::new(crate::reads::Strandedness::Forward);
1✔
2546
        for _i in 0..1000 {
1,001✔
2547
            reads.add_from_bytes(correct, Exts::empty(), IDTag::new(0, 0));
1,000✔
2548
        }
1,000✔
2549

2550
        for _i in 0..10 {
11✔
2551
            reads.add_from_bytes(incorrect, Exts::empty(), IDTag::new(1, 1)); // should be removed
10✔
2552
        }
10✔
2553

2554
        for _i in 0..20 {
21✔
2555
            reads.add_from_bytes(insertion, Exts::empty(), IDTag::new(1, 2)); // should not be removed
20✔
2556
        }
20✔
2557

2558
        let seqs = ReadsPaired::Unpaired { reads };
1✔
2559
        let sample_info = SampleInfo::new(1, 6, 1, 2, vec![1000, 10, 20]);
1✔
2560
        let summary_config = SummaryConfig::new(1, None, crate::summarizer::GroupFrac::None, 0.03, sample_info, None, crate::summarizer::StatTest::WelchsTTest);
1✔
2561
        let (kmers, _) = filter_kmers::<IDMapEMData, Kmer16, IDTag>(&seqs, &summary_config, false, 1, false);
1✔
2562

2563

2564
        // test with uncompressed graph
2565
        let mut unc_graph = uncompressed_graph(&kmers, true).finish();
1✔
2566
        // add ids to graph
2567
        for i in 0..unc_graph.len() {
81✔
2568
            let data = unc_graph.mut_data(i);
81✔
2569
            if let Some(ids) = data.ids() {
81✔
2570
                if ids.contains(&0) {
81✔
2571
                    data.set_mapped_ids(vec![0].into());
50✔
2572
                }
50✔
NEW
2573
            }
×
2574
        }
2575
        //let colors = Colors::new(&unc_graph, &summary_config, crate::colors::ColorMode::IDS { n_ids: 3 });
2576
        //unc_graph.to_dot("uncompressed_bf.dot", &|node| node.node_dot_default(&colors, &summary_config, &Translator::empty(), false, false), &|node, base, dir, flip| node.edge_dot_default(&colors, base, dir, flip));
2577
        let n_edges = unc_graph.iter_edges().count();
1✔
2578
        unc_graph.remove_ladders(10, Some("uc.csv")).unwrap();
1✔
2579
        //unc_graph.to_dot("uncompressed_af.dot", &|node| node.node_dot_default(&colors, &summary_config, &Translator::empty(), false, false), &|node, base, dir, flip| node.edge_dot_default(&colors, base, dir, flip));
2580
        assert_eq!(n_edges - 17, unc_graph.iter_edges().count());
1✔
2581

2582
        // test with compressed graph
2583
        let spec = CheckCompress::new(|d: IDMapEMData, _| d, |d, d1| d.join_test(d1));
79✔
2584
        let mut c_graph = compress_kmers_with_hash(true, &spec, &kmers, false, false).finish();
1✔
2585
        // add ids to graph
2586
        for i in 0..c_graph.len() {
6✔
2587
            let data = c_graph.mut_data(i);
6✔
2588
            if let Some(ids) = data.ids() {
6✔
2589
                if ids.contains(&0) {
6✔
2590
                    data.set_mapped_ids(vec![0].into());
4✔
2591
                }
4✔
NEW
2592
            }
×
2593
        }
2594
        //let colors = Colors::new(&c_graph, &summary_config, crate::colors::ColorMode::SampleGroups);
2595
        //c_graph.to_dot("compressed_bf.dot", &|node| node.node_dot_default(&colors, &summary_config, &Translator::empty(), false, false), &|node, base, dir, flip| node.edge_dot_default(&colors, base, dir, flip));
2596
        let n_edges = c_graph.iter_edges().count();
1✔
2597
        c_graph.remove_ladders(10, Some("c.csv")).unwrap();
1✔
2598
        //c_graph.to_dot("compressed_af.dot", &|node| node.node_dot_default(&colors, &summary_config, &Translator::empty(), false, false), &|node, base, dir, flip| node.edge_dot_default(&colors, base, dir, flip));
2599
        assert_eq!(n_edges - 2, c_graph.iter_edges().count());
1✔
2600
    }
1✔
2601

2602
    #[test]
2603
    fn test_remove_tips() {
1✔
2604
        let   correct = "ACGATCGATCGCGATCGTAGCTGACTGCTGACGTCTGACTACTGACTGATGCTAGCTATCGTGAC".as_bytes();
1✔
2605
        let incorrect = "ACGATCGATCGCGATCGTAGCTGACTGCTGACGTCTGACTACTGACTGATGCTAGCTAACGTGAC".as_bytes();
1✔
2606

2607

2608
        let mut reads = Reads::new(crate::reads::Strandedness::Forward);
1✔
2609
        for _i in 0..1000 {
1,001✔
2610
            reads.add_from_bytes(correct, Exts::empty(), IDTag::new(0, 0));
1,000✔
2611
        }
1,000✔
2612

2613
        for _i in 0..10 {
11✔
2614
            reads.add_from_bytes(incorrect, Exts::empty(), IDTag::new(1, 1)); // should be removed
10✔
2615
        }
10✔
2616

2617
        let seqs = ReadsPaired::Unpaired { reads };
1✔
2618
        let sample_info = SampleInfo::new(1, 6, 1, 2, vec![1000, 10, 20]);
1✔
2619
        let summary_config = SummaryConfig::new(1, None, crate::summarizer::GroupFrac::None, 0.03, sample_info, None, crate::summarizer::StatTest::WelchsTTest);
1✔
2620
        let (kmers, _) = filter_kmers::<IDMapEMData, Kmer16, IDTag>(&seqs, &summary_config, false, 1, false);
1✔
2621

2622

2623
        // test with uncompressed graph
2624
        let mut unc_graph = uncompressed_graph(&kmers, true).finish();
1✔
2625
        // add ids to graph
2626
        for i in 0..unc_graph.len() {
57✔
2627
            let data = unc_graph.mut_data(i);
57✔
2628
            if let Some(ids) = data.ids() {
57✔
2629
                if ids.contains(&0) {
57✔
2630
                    data.set_mapped_ids(vec![0].into());
50✔
2631
                }
50✔
NEW
2632
            }
×
2633
        }
2634

2635
        //let colors = Colors::new(&unc_graph, &summary_config, crate::colors::ColorMode::IDS { n_ids: 3 });
2636
        //unc_graph.to_dot("uncompressed_bf.dot", &|node| node.node_dot_default(&colors, &summary_config, &Translator::empty(), false, false), &|node, base, dir, flip| node.edge_dot_default(&colors, base, dir, flip));
2637
        let n_edges = unc_graph.iter_edges().count();
1✔
2638
        
2639
        unc_graph.remove_tips(10, Some("uc.csv")).unwrap();
1✔
2640
        //unc_graph.to_dot("uncompressed_af.dot", &|node| node.node_dot_default(&colors, &summary_config, &Translator::empty(), false, false), &|node, base, dir, flip| node.edge_dot_default(&colors, base, dir, flip));
2641
        assert_eq!(n_edges - 7, unc_graph.iter_edges().count());
1✔
2642

2643
        // test with compressed graph
2644
        let spec = CheckCompress::new(|d: IDMapEMData, _| d, |d, d1| d.join_test(d1));
54✔
2645
        let mut c_graph = compress_kmers_with_hash(true, &spec, &kmers, false, false).finish();
1✔
2646
        // add ids to graph
2647
        for i in 0..c_graph.len() {
3✔
2648
            let data = c_graph.mut_data(i);
3✔
2649
            if let Some(ids) = data.ids() {
3✔
2650
                if ids.contains(&0) {
3✔
2651
                    data.set_mapped_ids(vec![0].into());
2✔
2652
                }
2✔
NEW
2653
            }
×
2654
        }
2655

2656
        //let colors = Colors::new(&c_graph, &summary_config, crate::colors::ColorMode::SampleGroups);
2657
        //c_graph.to_dot("compressed_bf.dot", &|node| node.node_dot_default(&colors, &summary_config, &Translator::empty(), false, false), &|node, base, dir, flip| node.edge_dot_default(&colors, base, dir, flip));
2658
        let n_edges = c_graph.iter_edges().count();
1✔
2659
        
2660
        c_graph.remove_tips(10, Some("c.csv")).unwrap();
1✔
2661
        //c_graph.to_dot("compressed_af.dot", &|node| node.node_dot_default(&colors, &summary_config, &Translator::empty(), false, false), &|node, base, dir, flip| node.edge_dot_default(&colors, base, dir, flip));
2662
        assert_eq!(n_edges - 1, c_graph.iter_edges().count());
1✔
2663
    }
1✔
2664
}
2665

STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2026 Coveralls, Inc