• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

jlab / rust-debruijn / 17039801205

18 Aug 2025 11:53AM UTC coverage: 86.958% (-0.3%) from 87.287%
17039801205

Pull #28

github

web-flow
Merge dc6996ea4 into f0284da73
Pull Request #28: New kmer sizes, edge filter

338 of 363 new or added lines in 5 files covered. (93.11%)

7 existing lines in 2 files now uncovered.

6881 of 7913 relevant lines covered (86.96%)

4548833.85 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

75.1
/src/graph.rs
1
// Copyright 2017 10x Genomics
2

3
//! Containers for path-compressed De Bruijn graphs
4

5
use bit_set::BitSet;
6
use indicatif::ProgressBar;
7
use indicatif::ProgressIterator;
8
use indicatif::ProgressStyle;
9
use itertools::enumerate;
10
use log::{debug, trace};
11
use rayon::prelude::*;
12
use rayon::current_num_threads;
13
use serde_derive::{Deserialize, Serialize};
14
use smallvec::SmallVec;
15
use std::borrow::Borrow;
16

17
use std::collections::HashMap;
18
use std::collections::HashSet;
19
use std::collections::VecDeque;
20
use std::f32;
21
use std::fmt::{self, Debug, Display};
22
use std::fs::{remove_file, File};
23
use std::hash::Hash;
24
use std::io::BufWriter;
25
use std::io::{BufReader, Error, Read};
26
use std::io::Write;
27
use std::iter::FromIterator;
28
use std::marker::PhantomData;
29
use std::path::Path;
30

31
use boomphf::hashmap::BoomHashMap;
32

33
use serde_json;
34
use serde_json::Value;
35

36
type SmallVec4<T> = SmallVec<[T; 4]>;
37
type SmallVec8<T> = SmallVec<[T; 8]>;
38

39
use crate::bits_to_base;
40
use crate::colors::ColorMode;
41
use crate::colors::Colors;
42
use crate::compression::CompressionSpec;
43
use crate::dna_string::{DnaString, DnaStringSlice, PackedDnaStringSet};
44
use crate::graph;
45
use crate::summarizer::SummaryConfig;
46
use crate::summarizer::SummaryData;
47
use crate::summarizer::Translator;
48
use crate::BUF;
49
use crate::PROGRESS_STYLE;
50
use crate::{Dir, Exts, Kmer, Mer, Vmer};
51

52
/// A compressed DeBruijn graph carrying auxiliary data on each node of type `D`.
53
/// This type does not carry the sorted index arrays the allow the graph
54
/// to be walked efficiently. The `DeBruijnGraph` type wraps this type and add those
55
/// vectors.
56
#[derive(Serialize, Deserialize, Clone, Debug)]
57
pub struct BaseGraph<K, D> {
58
    pub sequences: PackedDnaStringSet,
59
    pub exts: Vec<Exts>,
60
    pub data: Vec<D>,
61
    pub stranded: bool,
62
    phantom: PhantomData<K>,
63
}
64

65
impl<K, D> BaseGraph<K, D> {
66
    pub fn new(stranded: bool) -> Self {
2,305✔
67
        BaseGraph {
2,305✔
68
            sequences: PackedDnaStringSet::new(),
2,305✔
69
            exts: Vec::new(),
2,305✔
70
            data: Vec::new(),
2,305✔
71
            phantom: PhantomData,
2,305✔
72
            stranded,
2,305✔
73
        }
2,305✔
74
    }
2,305✔
75

76
    pub fn len(&self) -> usize {
784,619✔
77
        self.sequences.len()
784,619✔
78
    }
784,619✔
79

80
    pub fn is_empty(&self) -> bool {
1✔
81
        self.sequences.is_empty()
1✔
82
    }
1✔
83

84
    pub fn combine<I: Iterator<Item = BaseGraph<K, D>>>(graphs: I) -> Self {
12✔
85
        let mut sequences = PackedDnaStringSet::new();
12✔
86
        let mut exts = Vec::new();
12✔
87
        let mut data = Vec::new();
12✔
88
        let mut stranded = Vec::new();
12✔
89

90
        for g in graphs {
1,624✔
91
            for s in 0..g.sequences.len() {
5,614✔
92
                sequences.add(&g.sequences.get(s));
5,614✔
93
            }
5,614✔
94

95
            exts.extend(g.exts);
1,612✔
96
            data.extend(g.data);
1,612✔
97
            stranded.push(g.stranded);
1,612✔
98
        }
99

100
        let out_stranded = stranded.iter().all(|x| *x);
12✔
101

102
        if !out_stranded && !stranded.iter().all(|x| !*x) {
1,612✔
103
            panic!("attempted to combine stranded and unstranded graphs");
×
104
        }
12✔
105

106
        BaseGraph {
12✔
107
            sequences,
12✔
108
            stranded: out_stranded,
12✔
109
            exts,
12✔
110
            data,
12✔
111
            phantom: PhantomData,
12✔
112
        }
12✔
113
    }
12✔
114
}
115

116
impl<K: Kmer, D> BaseGraph<K, D> {
117
    pub fn add<R: Borrow<u8>, S: IntoIterator<Item = R>>(
1,494,285✔
118
        &mut self,
1,494,285✔
119
        sequence: S,
1,494,285✔
120
        exts: Exts,
1,494,285✔
121
        data: D,
1,494,285✔
122
    ) {
1,494,285✔
123
        self.sequences.add(sequence);
1,494,285✔
124
        self.exts.push(exts);
1,494,285✔
125
        self.data.push(data);
1,494,285✔
126
    }
1,494,285✔
127
}
128

129
impl<K: Kmer + Send + Sync, D> BaseGraph<K, D> {
130
    pub fn finish(self) -> DebruijnGraph<K, D> {
582✔
131
        let indices: Vec<u32> = (0..self.len() as u32).collect();
582✔
132

133
        let left_order = {
582✔
134
            let mut kmers: Vec<K> = Vec::with_capacity(self.len());
582✔
135
            for idx in &indices {
780,767✔
136
                kmers.push(self.sequences.get(*idx as usize).first_kmer());
780,185✔
137
                
780,185✔
138
            }
780,185✔
139
            
140
            BoomHashMap::new_parallel(kmers, indices.clone())
582✔
141
        };
142

143
        let right_order = {
582✔
144
            let mut kmers: Vec<K> = Vec::with_capacity(self.len());
582✔
145
            for idx in &indices {
780,767✔
146
                kmers.push(self.sequences.get(*idx as usize).last_kmer());
780,185✔
147
            }
780,185✔
148
            
149
            BoomHashMap::new_parallel(kmers, indices)
582✔
150
        };
151
        debug!("finish graph loops: 2x {}", self.len());
582✔
152

153
        DebruijnGraph {
582✔
154
            base: self,
582✔
155
            left_order,
582✔
156
            right_order,
582✔
157
        }
582✔
158
    }
582✔
159
}
160

161
impl<K: Kmer, D> BaseGraph<K, D> {
162
    pub fn finish_serial(self) -> DebruijnGraph<K, D> {
111✔
163
        let indices: Vec<u32> = (0..self.len() as u32).collect();
111✔
164

165
        let left_order = {
111✔
166
            let mut kmers: Vec<K> = Vec::with_capacity(self.len());
111✔
167
            let mut sequences: Vec<String> = Vec::new();
111✔
168
            for idx in &indices {
712,109✔
169
                kmers.push(self.sequences.get(*idx as usize).first_kmer());
711,998✔
170
                sequences.push(self.sequences.get(*idx as usize).to_dna_string());
711,998✔
171
            }
711,998✔
172
            println!("left kmers: {:?}", kmers);
111✔
173
            println!("left seqs: {:?}", sequences);
111✔
174
            BoomHashMap::new(kmers, indices.clone())
111✔
175
        };
176

177
        let right_order = {
111✔
178
            let mut kmers: Vec<K> = Vec::with_capacity(self.len());
111✔
179
            let mut sequences: Vec<String> = Vec::new();
111✔
180
            for idx in &indices {
712,109✔
181
                kmers.push(self.sequences.get(*idx as usize).last_kmer());
711,998✔
182
                sequences.push(self.sequences.get(*idx as usize).to_dna_string());
711,998✔
183
            }
711,998✔
184
            println!("right kmers: {:?}", kmers);
111✔
185
            println!("right seqs: {:?}", sequences);
111✔
186
            BoomHashMap::new(kmers, indices)
111✔
187
        };
188

189
        DebruijnGraph {
111✔
190
            base: self,
111✔
191
            left_order,
111✔
192
            right_order,
111✔
193
        }
111✔
194
    }
111✔
195
}
196

197
/// A compressed DeBruijn graph carrying auxiliary data on each node of type `D`.
198
/// The struct carries sorted index arrays the allow the graph
199
/// to be walked efficiently.
200
#[derive(Serialize, Deserialize, Debug)]
201
pub struct DebruijnGraph<K: Hash, D> {
202
    pub base: BaseGraph<K, D>,
203
    left_order: BoomHashMap<K, u32>,
204
    right_order: BoomHashMap<K, u32>,
205
}
206

207
impl<K: Kmer, D: Debug> DebruijnGraph<K, D> {
208
    /// Total number of nodes in the DeBruijn graph
209
    pub fn len(&self) -> usize {
782,516✔
210
        self.base.len()
782,516✔
211
    }
782,516✔
212

213
    pub fn is_empty(&self) -> bool {
1✔
214
        self.base.is_empty()
1✔
215
    }
1✔
216

217
    /// Get a node given it's `node_id`
218
    pub fn get_node(&self, node_id: usize) -> Node<K, D> {
4,958,710✔
219
        Node {
4,958,710✔
220
            node_id,
4,958,710✔
221
            graph: self,
4,958,710✔
222
        }
4,958,710✔
223
    }
4,958,710✔
224

225
    /// Get a node given it's `node_id`
226
    pub fn get_node_kmer(&self, node_id: usize) -> NodeKmer<K, D> {
×
227
        let node = self.get_node(node_id);
×
228
        let node_seq = node.sequence();
×
229

230
        NodeKmer {
×
231
            node_id,
×
232
            node_seq_slice: node_seq,
×
233
            phantom_d: PhantomData,
×
234
            phantom_k: PhantomData,
×
235
        }
×
236
    }
×
237

238
    /// Return an iterator over all nodes in the graph
239
    pub fn iter_nodes(&self) -> NodeIter<K, D> {
129✔
240
        NodeIter {
129✔
241
            graph: self,
129✔
242
            node_id: 0,
129✔
243
        }
129✔
244
    }
129✔
245

246
    /// Find the edges leaving node `node_id` in direction `Dir`. Should generally be
247
    /// accessed via a Node wrapper object
248
    fn find_edges(&self, node_id: usize, dir: Dir) -> SmallVec4<(u8, usize, Dir, bool)> {
1,055,408✔
249
        let exts = self.base.exts[node_id];
1,055,408✔
250
        let sequence = self.base.sequences.get(node_id);
1,055,408✔
251
        let kmer: K = sequence.term_kmer(dir);
1,055,408✔
252
        let mut edges = SmallVec4::new();
1,055,408✔
253

254
        for i in 0..4 {
5,277,040✔
255
            if exts.has_ext(dir, i) {
4,221,632✔
256
                let link = self.find_link(kmer.extend(i, dir), dir).expect("missing link");
1,400,144✔
257
                edges.push((i, link.0, link.1, link.2));
1,400,144✔
258
            }
2,821,488✔
259
        }
260

261
        edges
1,055,408✔
262
    }
1,055,408✔
263

264
    /// Find the edges leaving node `node_id` in direction `Dir`. Should generally be
265
    /// accessed via a Node wrapper object
266
    /// 
267
    /// allows missing links
268
    fn _find_edges_sharded(&self, node_id: usize, dir: Dir) -> SmallVec4<(u8, usize, Dir, bool)> {
×
269
        let exts = self.base.exts[node_id];
×
270
        let sequence = self.base.sequences.get(node_id);
×
271
        let kmer: K = sequence.term_kmer(dir);
×
272
        let mut edges = SmallVec4::new();
×
273

274
        for i in 0..4 {
×
275
            if exts.has_ext(dir, i) {
×
276
                let link = self.find_link(kmer.extend(i, dir), dir); //.expect("missing link");
×
277
                if let Some(l) = link {
×
278
                    edges.push((i, l.0, l.1, l.2));
×
279
                }
×
280
                // Otherwise, this edge doesn't exist within this shard, so ignore it.
281
                // NOTE: this should be allowed in a 'complete' DBG
282
            }
×
283
        }
284

285
        edges
×
286
    }
×
287

288
    /// Seach for the kmer `kmer`, appearing at the given `side` of a node sequence.
289
    fn search_kmer(&self, kmer: K, side: Dir) -> Option<usize> {
5,175,043✔
290
        match side {
5,175,043✔
291
            Dir::Left => self.left_order.get(&kmer).map(|pos| *pos as usize),
2,585,825✔
292
            Dir::Right => self.right_order.get(&kmer).map(|pos| *pos as usize),
2,589,218✔
293
        }
294
    }
5,175,043✔
295

296
    /// Find a link in the graph, possibly handling a RC switch.
297
    pub fn find_link(&self, kmer: K, dir: Dir) -> Option<(usize, Dir, bool)> {
3,621,682✔
298
        // Only test self-consistent paths through
299
        // the edges
300
        // Avoids problems due to single kmer edges
301
        // (dir, next_side_incoming, rc)
302
        // (Dir::Left, Dir::Right, false) => true,
303
        // (Dir::Left, Dir::Left,  true) => true,
304
        // (Dir::Right, Dir::Left, false) => true,
305
        // (Dir::Right, Dir::Right, true) => true,
306

307
        let rc = kmer.rc();
3,621,682✔
308

309
        match dir {
3,621,682✔
310
            Dir::Left => {
311
                if let Some(idx) = self.search_kmer(kmer, Dir::Right) {
1,816,670✔
312
                    return Some((idx, Dir::Right, false));
1,035,857✔
313
                }
780,813✔
314

315
                if !self.base.stranded {
780,813✔
316
                    if let Some(idx) = self.search_kmer(rc, Dir::Left) {
780,813✔
317
                        return Some((idx, Dir::Left, true));
780,813✔
318
                    }
×
319
                }
×
320
            }
321

322
            Dir::Right => {
323
                if let Some(idx) = self.search_kmer(kmer, Dir::Left) {
1,805,012✔
324
                    return Some((idx, Dir::Left, false));
1,032,464✔
325
                }
772,548✔
326

327
                if !self.base.stranded {
772,548✔
328
                    if let Some(idx) = self.search_kmer(rc, Dir::Right) {
772,548✔
329
                        return Some((idx, Dir::Right, true));
772,548✔
330
                    }
×
331
                }
×
332
            }
333
        }
334

335
        None
×
336
    }
3,621,682✔
337

338
    /// Check whether the graph is fully compressed. Return `None` if it's compressed,
339
    /// otherwise return `Some(node1, node2)` representing a pair of node that could
340
    /// be collapsed. Probably only useful for testing.
341
    pub fn is_compressed<S: CompressionSpec<D>>(&self, spec: &S) -> Option<(usize, usize)> {
789✔
342
        for i in 0..self.len() {
108,123✔
343
            let n = self.get_node(i);
108,123✔
344

345
            for dir in &[Dir::Left, Dir::Right] {
324,369✔
346
                let dir_edges = n.edges(*dir);
216,246✔
347
                if dir_edges.len() == 1 {
216,246✔
348
                    let (_, next_id, return_dir, _) = dir_edges[0];
153,396✔
349
                    let next = self.get_node(next_id);
153,396✔
350

351
                    let ret_edges = next.edges(return_dir);
153,396✔
352
                    if ret_edges.len() == 1 {
153,396✔
353
                        // Test for us being a palindrome: this makes it OK
354
                        if n.len() == K::k() && n.sequence().first_kmer::<K>().is_palindrome() {
24✔
355
                            continue;
16✔
356
                        }
8✔
357

358
                        // Test for a neighbor being a palindrome: this makes it OK
359
                        if next.len() == K::k() && next.sequence().first_kmer::<K>().is_palindrome()
8✔
360
                        {
361
                            continue;
8✔
362
                        }
×
363

364
                        // Test for this edge representing a smooth circle (biting it's own tail):
365
                        if n.node_id == next_id {
×
366
                            continue;
×
367
                        }
×
368

369
                        if spec.join_test(n.data(), next.data()) {
×
370
                            // Found a unbranched edge that should have been eliminated
371
                            return Some((i, next_id));
×
372
                        }
×
373
                    }
153,372✔
374
                }
62,850✔
375
            }
376
        }
377

378
        None
789✔
379
    }
789✔
380

381
    /// Remove non-existent extensions that may be created due to filtered kmers
382
    /// 
383
    /// if `valid_nodes` if `None`, all nodes are valid
384
    pub fn fix_exts(&mut self, valid_nodes: Option<&BitSet>) {
247✔
385
        for i in 0..self.len() {
734,427✔
386
            let valid_exts = self.get_valid_exts(i, valid_nodes);
734,427✔
387
            self.base.exts[i] = valid_exts;
734,427✔
388
        }
734,427✔
389
    }
247✔
390

391
    pub fn get_valid_exts(&self, node_id: usize, valid_nodes: Option<&BitSet>) -> Exts {
734,427✔
392
        let mut new_exts = Exts::empty();
734,427✔
393
        let node = self.get_node(node_id);
734,427✔
394
        let exts = node.exts();
734,427✔
395
        let l_kmer: K = node.sequence().first_kmer();
734,427✔
396
        let r_kmer: K = node.sequence().last_kmer();
734,427✔
397

398
        let check_node = |id| match valid_nodes {
1,496,946✔
399
            Some(bs) => bs.contains(id),
1,449,272✔
400
            None => true,
47,674✔
401
        };
1,496,946✔
402

403
        for i in 0..4 {
3,672,135✔
404
            if exts.has_ext(Dir::Left, i) {
2,937,708✔
405
                match self.find_link(l_kmer.extend_left(i), Dir::Left) {
748,689✔
406
                    Some((target, _, _)) if check_node(target) => {
748,689✔
407
                        new_exts = new_exts.set(Dir::Left, i)
748,689✔
408
                    }
409
                    _ => (),
×
410
                }
411
            }
2,189,019✔
412

413
            if exts.has_ext(Dir::Right, i) {
2,937,708✔
414
                match self.find_link(r_kmer.extend_right(i), Dir::Right) {
748,257✔
415
                    Some((target, _, _)) if check_node(target) => {
748,257✔
416
                        new_exts = new_exts.set(Dir::Right, i)
748,257✔
417
                    }
418
                    _ => (),
×
419
                }
420
            }
2,189,451✔
421
        }
422

423
        new_exts
734,427✔
424
    }
734,427✔
425

426
    /// Find the highest-scoring, unambiguous path in the graph. Each node get a score
427
    /// given by `score`. Any node where `solid_path(node) == True` are valid paths -
428
    /// paths will be terminated if there are multiple valid paths emanating from a node.
429
    pub fn max_path<F, F2>(&self, score: F, solid_path: F2) -> Vec<(usize, Dir)>
×
430
    where
×
431
        F: Fn(&D) -> f32,
×
432
        F2: Fn(&D) -> bool,
×
433
    {
434
        if self.is_empty() {
×
435
            return Vec::default();
×
436
        }
×
437

438
        let mut best_node = 0;
×
439
        let mut best_score = f32::MIN;
×
440
        for i in 0..self.len() {
×
441
            let node = self.get_node(i);
×
442
            let node_score = score(node.data());
×
443

444
            if node_score > best_score {
×
445
                best_node = i;
×
446
                best_score = node_score;
×
447
            }
×
448
        }
449

450
        let oscore = |state| match state {
×
451
            None => 0.0,
×
452
            Some((id, _)) => score(self.get_node(id).data()),
×
453
        };
×
454

455
        let osolid_path = |state| match state {
×
456
            None => false,
×
457
            Some((id, _)) => solid_path(self.get_node(id).data()),
×
458
        };
×
459

460
        // Now expand in each direction, greedily taking the best path. Stop if we hit a node we've
461
        // already put into the path
462
        let mut used_nodes = HashSet::new();
×
463
        let mut path = VecDeque::new();
×
464

465
        // Start w/ initial state
466
        used_nodes.insert(best_node);
×
467
        path.push_front((best_node, Dir::Left));
×
468

469
        for init in [(best_node, Dir::Left, false), (best_node, Dir::Right, true)].iter() {
×
470
            let &(start_node, dir, do_flip) = init;
×
471
            let mut current = (start_node, dir);
×
472

473
            loop {
474
                let mut next = None;
×
475
                let (cur_id, incoming_dir) = current;
×
476
                let node = self.get_node(cur_id);
×
477
                let edges = node.edges(incoming_dir.flip());
×
478

479
                let mut solid_paths = 0;
×
480
                for (_, id, dir, _) in edges {
×
481
                    let cand = Some((id, dir));
×
482
                    if osolid_path(cand) {
×
483
                        solid_paths += 1;
×
484
                    }
×
485

486
                    if oscore(cand) > oscore(next) {
×
487
                        next = cand;
×
488
                    }
×
489
                }
490

491
                if solid_paths > 1 {
×
492
                    break;
×
493
                }
×
494

495
                match next {
×
496
                    Some((next_id, next_incoming)) if !used_nodes.contains(&next_id) => {
×
497
                        if do_flip {
×
498
                            path.push_front((next_id, next_incoming.flip()));
×
499
                        } else {
×
500
                            path.push_back((next_id, next_incoming));
×
501
                        }
×
502

503
                        used_nodes.insert(next_id);
×
504
                        current = (next_id, next_incoming);
×
505
                    }
506
                    _ => break,
×
507
                }
508
            }
509
        }
510

511
        Vec::from_iter(path)
×
512
    }
×
513

514

515
    /// Find the highest-scoring, unambiguous path in the graph. Each node get a score
516
    /// given by `score`. Any node where `solid_path(node) == True` are valid paths -
517
    /// paths will be terminated if there are multiple valid paths emanating from a node.
518
    /// Returns vec with path for each component
519
    pub fn max_path_comp<F, F2>(&self, score: F, solid_path: F2) -> Vec<VecDeque<(usize, Dir)>>
1✔
520
    where
1✔
521
        F: Fn(&D) -> f32,
1✔
522
        F2: Fn(&D) -> bool,
1✔
523
    {
524
        if self.is_empty() {
1✔
525
            let vec: Vec<VecDeque<(usize, Dir)>> = Vec::new();
×
526
            return vec;
×
527
        }
1✔
528

529
        let components = self.iter_components();
1✔
530
        let mut paths: Vec<VecDeque<(usize, Dir)>> = Vec::new();
1✔
531

532
        for component in components {
3✔
533

534
            let current_comp = &component;
2✔
535
            
536

537
            let mut best_node = current_comp[0];
2✔
538
            let mut best_score = f32::MIN;
2✔
539
            for c in current_comp.iter() {
137✔
540
                let node = self.get_node(*c);
137✔
541
                let node_score = score(node.data());
137✔
542

543
                if node_score > best_score {
137✔
544
                    best_node = *c;
2✔
545
                    best_score = node_score;
2✔
546
                }
135✔
547
            }
548

549
            let oscore = |state| match state {
252✔
550
                None => 0.0,
126✔
551
                Some((id, _)) => score(self.get_node(id).data()),
126✔
552
            };
252✔
553

554
            let osolid_path = |state| match state {
126✔
555
                None => false,
×
556
                Some((id, _)) => solid_path(self.get_node(id).data()),
126✔
557
            };
126✔
558

559
            // Now expand in each direction, greedily taking the best path. Stop if we hit a node we've
560
            // already put into the path
561
            let mut used_nodes = HashSet::new();
2✔
562
            let mut path = VecDeque::new();
2✔
563

564
            // Start w/ initial state
565
            used_nodes.insert(best_node);
2✔
566
            path.push_front((best_node, Dir::Left));
2✔
567

568
            for init in [(best_node, Dir::Left, false), (best_node, Dir::Right, true)].iter() {
4✔
569
                let &(start_node, dir, do_flip) = init;
4✔
570
                let mut current = (start_node, dir);
4✔
571

572
                loop {
573
                    let mut next = None;
130✔
574
                    let (cur_id, incoming_dir) = current;
130✔
575
                    let node = self.get_node(cur_id);
130✔
576
                    let edges = node.edges(incoming_dir.flip());
130✔
577

578
                    let mut solid_paths = 0;
130✔
579
                    for (_, id, dir, _) in edges {
256✔
580
                        let cand = Some((id, dir));
126✔
581
                        if osolid_path(cand) {
126✔
582
                            solid_paths += 1;
126✔
583
                        }
126✔
584

585
                        if oscore(cand) > oscore(next) {
126✔
586
                            next = cand;
126✔
587
                        }
126✔
588
                    }
589

590
                    if solid_paths > 1 {
130✔
UNCOV
591
                        break;
×
592
                    }
130✔
593

594
                    match next {
126✔
595
                        Some((next_id, next_incoming)) if !used_nodes.contains(&next_id) => {
126✔
596
                            if do_flip {
126✔
597
                                path.push_front((next_id, next_incoming.flip()));
82✔
598
                            } else {
82✔
599
                                path.push_back((next_id, next_incoming));
44✔
600
                            }
44✔
601

602
                            used_nodes.insert(next_id);
126✔
603
                            current = (next_id, next_incoming);
126✔
604
                        }
605
                        _ => break,
4✔
606
                    }
607
                }
608
            }
609
            
610
            paths.push(path);
2✔
611
            //paths.push(Vec::from_iter(path));
612
        }
613

614
        paths
1✔
615
    
616
    }
1✔
617

618
    pub fn iter_max_path_comp<F, F2>(&self, score: F, solid_path: F2) -> PathCompIter<K, D, F, F2> 
3✔
619
    where 
3✔
620
    F: Fn(&D) -> f32,
3✔
621
    F2: Fn(&D) -> bool
3✔
622
    {
623
        let component_iterator = self.iter_components();
3✔
624
        PathCompIter { graph: self, component_iterator, graph_pos: 0, score, solid_path }
3✔
625
    }
3✔
626

627
    /// write the paths from `iter_max_path_comp` to a fasta file
628
    pub fn path_to_fasta<F, F2>(&self, f: &mut dyn std::io::Write, path_iter: PathCompIter<K, D, F, F2>, return_lens: bool) -> (Vec<usize>, Vec<usize>)
1✔
629
    where 
1✔
630
    F: Fn(&D) -> f32,
1✔
631
    F2: Fn(&D) -> bool
1✔
632
    {
633
        // width of fasta lines
634
        let columns = 80;
1✔
635

636
        // sizes of components and of paths
637
        let mut comp_sizes = Vec::new();
1✔
638
        let mut path_lens = Vec::new();
1✔
639

640
        for (seq_counter, (component, path)) in path_iter.enumerate() {
2✔
641
            // get dna sequence from path
642
            let seq = self.sequence_of_path(path.iter());
2✔
643

644
            //write header with length & start node
645
            writeln!(f, ">path{} len={} start_node={}", seq_counter, seq.len(), path[0].0).unwrap();
2✔
646

647
            // calculate how sequence has to be split up
648
            let slices = (seq.len() / columns) + 1;
2✔
649
            let mut ranges = Vec::with_capacity(slices);
2✔
650

651
            let mut start = 0;
2✔
652
            while start < seq.len() {
6✔
653
                ranges.push(start..start + columns);
4✔
654
                start += columns;
4✔
655
            }
4✔
656

657
            let last_start = ranges.pop().expect("no kmers in parallel ranges").start;
2✔
658
            ranges.push(last_start..seq.len());
2✔
659

660
            // split up sequence and write to file accordingly
661
            for range in ranges {
6✔
662
                writeln!(f, "{:?}", seq.slice(range.start, range.end)).unwrap();
4✔
663
            }
4✔
664

665
            if return_lens {
2✔
666
                comp_sizes.push(component.len());
×
667
                path_lens.push(path.len());
×
668
            }
2✔
669
        }    
670

671
        (comp_sizes, path_lens)
1✔
672
        
673
    }
1✔
674

675

676
    /// Get the sequence of a path through the graph. The path is given as a sequence of node_id integers
677
    pub fn sequence_of_path<'a, I: 'a + Iterator<Item = &'a (usize, Dir)>>(
16,684✔
678
        &self,
16,684✔
679
        path: I,
16,684✔
680
    ) -> DnaString {
16,684✔
681
        let mut seq = DnaString::new();
16,684✔
682

683
        for (idx, &(node_id, dir)) in path.enumerate() {
717,996✔
684
            let start = if idx == 0 { 0 } else { K::k() - 1 };
717,996✔
685

686
            let node_seq = match dir {
717,996✔
687
                Dir::Left => self.get_node(node_id).sequence(),
370,067✔
688
                Dir::Right => self.get_node(node_id).sequence().rc(),
347,929✔
689
            };
690

691
            for p in start..node_seq.len() {
1,272,192✔
692
                seq.push(node_seq.get(p))
1,272,192✔
693
            }
694
        }
695

696
        seq
16,684✔
697
    }
16,684✔
698

699
    /// write a node to a dot file
700
    /// 
701
    /// ### Arguments: 
702
    /// 
703
    /// * `node`: [`Node<K, D>`] which will be written to a dot file
704
    /// * `node_label`: closure taking [`Node<K, D>`] and returning a string containing commands for dot nodes 
705
    /// * `edge_label`: closure taking [`Node<K, D>`], the base as a [`u8`], the incoming [`Dir`] of the edge 
706
    ///    and if the neighbor is flipped - returns a string containing commands for dot edges, 
707
    /// * `f`: writer
708
    fn node_to_dot<FN: Fn(&Node<K, D>) -> String, FE: Fn(&Node<K, D>, u8, Dir, bool) -> String>(
97,978✔
709
        &self,
97,978✔
710
        node: &Node<'_, K, D>,
97,978✔
711
        node_label: &FN,
97,978✔
712
        edge_label: &FE,
97,978✔
713
        f: &mut dyn Write,
97,978✔
714
    ) {
97,978✔
715
        writeln!(f, "n{} {}", node.node_id, node_label(node)).unwrap();
97,978✔
716
        assert_eq!(node.exts().val.count_ones() as usize, node.l_edges().len() + node.r_edges().len());
97,978✔
717

718
        for (base, id, incoming_dir, flipped) in node.l_edges() {
97,978✔
719
            writeln!(f, "n{} -> n{} {}", id, node.node_id, edge_label(node, base, incoming_dir, flipped)).unwrap();
97,591✔
720
        }
97,591✔
721

722
        for (base, id, incoming_dir, flipped) in node.r_edges() {
97,978✔
723
            writeln!(f, "n{} -> n{} {}", node.node_id, id, edge_label(node, base, incoming_dir, flipped)).unwrap();
97,165✔
724
        }
97,165✔
725
    }
97,978✔
726

727
    /// Write the graph to a dot file
728
    /// 
729
    /// ### Arguments: 
730
    /// 
731
    /// * `path`: path to the output file
732
    /// * `node_label`: closure taking [`Node<K, D>`] and returning a string containing commands for dot nodes, e.g. [`Node::node_dot_default`]
733
    /// * `edge_label`: closure taking [`Node<K, D>`], the base as a [`u8`], the incoming [`Dir`] of the edge, e.g. [`Node::edge_dot_default`]
734
    ///    and if the neighbor is flipped - returns a string containing commands for dot edges, 
735
    pub fn to_dot<P, FN, FE>(&self, path: P, node_label: &FN, edge_label: &FE) 
1✔
736
    where 
1✔
737
    P: AsRef<Path>,
1✔
738
    FN: Fn(&Node<K, D>) -> String,
1✔
739
    FE: Fn(&Node<K, D>, u8, Dir, bool) -> String,
1✔
740
    {
741
        let mut f = BufWriter::with_capacity(BUF, File::create(path).expect("error creating dot file"));
1✔
742

743
        let pb = ProgressBar::new(self.len() as u64);
1✔
744
        pb.set_style(ProgressStyle::with_template(PROGRESS_STYLE).unwrap().progress_chars("#/-"));
1✔
745
        pb.set_message(format!("{:<32}", "writing graph to DOT file"));
1✔
746

747
        writeln!(&mut f, "digraph {{\nrankdir=\"LR\"\nmodel=subset\noverlap=scalexy").unwrap();
1✔
748
        for i in (0..self.len()).progress_with(pb) {
32,658✔
749
            self.node_to_dot(&self.get_node(i), node_label, edge_label, &mut f);
32,658✔
750
        }
32,658✔
751
        writeln!(&mut f, "}}").unwrap();
1✔
752
        
753
        f.flush().unwrap();
1✔
754
        debug!("large to dot loop: {}", self.len());
1✔
755
    }
1✔
756

757
    /// Write the graph to a dot file, highlight the nodes which form the 
758
    /// "best" path, according to [`PathCompIter`], with the number of occurences 
759
    /// as the score and `solid_path` always `true`.
760
    /// The nodes are formatted according to [`Node::node_dot_default`].
761
    /// 
762
    /// ### Arguments: 
763
    /// 
764
    /// * `path`: path to the output file
765
    /// * `edge_label`: closure taking [`Node<K, D>`], the base as a [`u8`], the incoming [`Dir`] of the edge, e.g. [`Node::edge_dot_default`]
766
    ///    and if the neighbor is flipped - returns a string containing commands for dot edges, 
767
    /// * `colors`: a [`Colors`] with the color settings for the graph
768
    /// * `translator`: a [`Translator`] which translates tags or IDs to strings
769
    /// * `config`: a [`SummaryConfig`] which contains settings for the graph
770
    pub fn to_dot_with_path<P, FE, DI>(&self, path: P, edge_label: &FE, colors: &Colors<'_, D, DI>, translator: &Translator, config: &SummaryConfig)
1✔
771
    where 
1✔
772
    P: AsRef<Path>,
1✔
773
    D: SummaryData<DI>,
1✔
774
    FE: Fn(&Node<K, D>, u8, Dir, bool) -> String,
1✔
775
    {
776
        let mut f = BufWriter::with_capacity(BUF, File::create(path).expect("error creating dot file"));
1✔
777

778
        writeln!(&mut f, "digraph {{\nrankdir=\"LR\"\nmodel=subset\noverlap=scalexy").unwrap();
1✔
779

780
        // iterate over components
781
        for (component, path) in self.iter_max_path_comp(|d| d.sum().unwrap_or(1) as f32, |_| true) {
62,565✔
782
            let hashed_path = path.into_iter().map(|(id, _)| id).collect::<HashSet<usize>>();
233✔
783
            for node_id in component {
32,891✔
784
                self.node_to_dot(
32,658✔
785
                    &self.get_node(node_id),
32,658✔
786
                    &|node| node.node_dot_default(colors, config, translator, hashed_path.contains(&node_id)), 
32,658✔
787
                    edge_label, 
32,658✔
788
                    &mut f
32,658✔
789
                );
790
            }
791
        }
792

793
        writeln!(&mut f, "}}").unwrap();
1✔
794
        
795
        f.flush().unwrap();
1✔
796
        debug!("large to dot loop: {}", self.len());
1✔
797
    }
1✔
798

799
    /// Write the graph to a dot file in parallel
800
    /// Will write in to n_threads files simultaniously,
801
    /// then go though the files and add the contents to a larger file, 
802
    /// and delete the small files.
803
    /// 
804
    /// ### Arguments: 
805
    /// 
806
    /// * `path`: path to the output file
807
    /// * `node_label`: closure taking [`Node<K, D>`] and returning a string containing commands for dot nodes 
808
    /// * `edge_label`: closure taking [`Node<K, D>`], the base as a [`u8`], the incoming [`Dir`] of the edge 
809
    ///    and if the neighbor is flipped - returns a string containing commands for dot edges, 
810
    pub fn to_dot_parallel<P, FN, FE>(&self, path: P, node_label: &FN, edge_label: &FE) 
1✔
811
    where 
1✔
812
        D: Sync,
1✔
813
        K: Sync,
1✔
814
        P: AsRef<Path> + Display + Sync,
1✔
815
        FN: Fn(&Node<K, D>) -> String + Sync,
1✔
816
        FE: Fn(&Node<K, D>, u8, Dir, bool) -> String + Sync,
1✔
817
    {        
818
        let slices = current_num_threads();
1✔
819
        let n_nodes = self.len();
1✔
820
        let sz = n_nodes / slices + 1;
1✔
821

822
        debug!("n_nodes: {}", n_nodes);
1✔
823
        debug!("sz: {}", sz);
1✔
824

825
        let mut parallel_ranges = Vec::with_capacity(slices);
1✔
826
        let mut start = 0;
1✔
827
        while start < n_nodes {
5✔
828
            parallel_ranges.push(start..start + sz);
4✔
829
            start += sz;
4✔
830
        }
4✔
831

832
        let last_start = parallel_ranges.pop().expect("no kmers in parallel ranges").start;
1✔
833
        parallel_ranges.push(last_start..n_nodes);
1✔
834
        debug!("parallel ranges: {:?}", parallel_ranges);
1✔
835

836
        let mut files = Vec::with_capacity(current_num_threads());
1✔
837

838
        for i in 0..parallel_ranges.len() {
4✔
839
            files.push(format!("{}-{}.dot", path, i));
4✔
840
        } 
4✔
841

842
        let pb = ProgressBar::new(self.len() as u64);
1✔
843
        pb.set_style(ProgressStyle::with_template(PROGRESS_STYLE).unwrap().progress_chars("#/-"));
1✔
844
        pb.set_message(format!("{:<32}", "writing graph to DOT files"));
1✔
845
    
846
        parallel_ranges.into_par_iter().enumerate().for_each(|(i, range)| {
4✔
847
            let mut f = BufWriter::with_capacity(BUF, File::create(&files[i]).expect("error creating parallel dot file"));
4✔
848

849
            for i in range {
32,662✔
850
                self.node_to_dot(&self.get_node(i), node_label, edge_label, &mut f);
32,658✔
851
                pb.inc(1);
32,658✔
852
            }
32,658✔
853

854
            f.flush().unwrap();
4✔
855
        });
4✔
856
        pb.finish_and_clear();
1✔
857

858
        let mut out_file = BufWriter::with_capacity(BUF, File::create(path).expect("error creating combined dot file"));
1✔
859

860
        writeln!(&mut out_file, "digraph {{\nrankdir=\"LR\"\nmodel=subset\noverlap=scalexy").unwrap();
1✔
861

862
        let pb = ProgressBar::new(files.len() as u64);
1✔
863
        pb.set_style(ProgressStyle::with_template(PROGRESS_STYLE).unwrap().progress_chars("#/-"));
1✔
864
        pb.set_message(format!("{:<32}", "combining files"));
1✔
865

866
        for file in files.iter().progress_with(pb) {
4✔
867
            let open_file = File::open(file).expect("error opening parallel dot file");
4✔
868
            let mut reader = BufReader::new(open_file);
4✔
869
            let mut buffer = [0; BUF];
4✔
870

871
            loop {
872
                let linecount = reader.read(&mut buffer).unwrap();
219✔
873
                if linecount == 0 { break }
219✔
874
                out_file.write_all(&buffer[..linecount]).unwrap();
215✔
875
            }
876

877
            remove_file(file).unwrap();
4✔
878
        }
879

880
        writeln!(&mut out_file, "}}").unwrap();
1✔
881

882
        out_file.flush().unwrap();
1✔
883

884

885
    }
1✔
886

887

888
    /// Write part of the graph to a dot file
889
    /// 
890
    /// ### Arguments: 
891
    /// 
892
    /// * `path`: path to the output file
893
    /// * `node_label`: closure taking [`Node<K, D>`] and returning a string containing commands for dot nodes 
894
    /// * `edge_label`: closure taking [`Node<K, D>`], the base as a [`u8`], the incoming [`Dir`] of the edge 
895
    ///    and if the neighbor is flipped - returns a string containing commands for dot edges, 
896
    /// * `nodes`: [`Vec<usize>`] listing all IDs of nodes which should be included
897
    pub fn to_dot_partial<P, FN, FE>(&self, path: P, node_label: &FN, edge_label: &FE, nodes: Vec<usize>) 
1✔
898
    where 
1✔
899
        P: AsRef<Path>,
1✔
900
        FN: Fn(&Node<K, D>) -> String,
1✔
901
        FE: Fn(&Node<K, D>, u8, Dir, bool) -> String,
1✔
902
    {
903
        let mut f = BufWriter::with_capacity(BUF, File::create(path).expect("error creating dot file"));
1✔
904

905
        let pb = ProgressBar::new(nodes.len() as u64);
1✔
906
        pb.set_style(ProgressStyle::with_template(PROGRESS_STYLE).unwrap().progress_chars("#/-"));
1✔
907
        pb.set_message(format!("{:<32}", "writing graph to DOT file"));
1✔
908

909
        writeln!(&mut f, "# {:?}", nodes).unwrap();
1✔
910
        writeln!(&mut f, "digraph {{\nrankdir=\"LR\"\nmodel=subset\noverlap=scalexy").unwrap();
1✔
911
        for i in nodes.into_iter().progress_with(pb) {
4✔
912
            self.node_to_dot(&self.get_node(i), node_label, edge_label, &mut f);
4✔
913
        }
4✔
914
        writeln!(&mut f, "}}").unwrap();
1✔
915

916
        f.flush().unwrap();
1✔
917

918
        debug!("large to dot loop: {}", self.len());
1✔
919
    }
1✔
920

921
    fn node_to_gfa<F: Fn(&Node<'_, K, D>) -> String>(
97,978✔
922
        &self,
97,978✔
923
        node: &Node<'_, K, D>,
97,978✔
924
        w: &mut dyn Write,
97,978✔
925
        tag_func: Option<&F>,
97,978✔
926
    ) -> Result<(), Error> {
97,978✔
927
        match tag_func {
97,978✔
928
            Some(f) => {
65,320✔
929
                let tags = (f)(node);
65,320✔
930
                writeln!(
65,320✔
931
                    w,
65,320✔
932
                    "S\t{}\t{}\t{}",
65,320✔
933
                    node.node_id,
934
                    node.sequence().to_dna_string(),
65,320✔
935
                    tags
UNCOV
936
                )?;
×
937
            }
938
            _ => writeln!(
32,658✔
939
                w,
32,658✔
940
                "S\t{}\t{}",
32,658✔
941
                node.node_id,
942
                node.sequence().to_dna_string()
32,658✔
UNCOV
943
            )?,
×
944
        }
945

946
        for (_, target, dir, _) in node.l_edges() {
97,978✔
947
            if target >= node.node_id {
97,591✔
948
                let to_dir = match dir {
48,748✔
949
                    Dir::Left => "+",
19,433✔
950
                    Dir::Right => "-",
29,315✔
951
                };
952
                writeln!(
48,748✔
953
                    w,
48,748✔
954
                    "L\t{}\t-\t{}\t{}\t{}M",
48,748✔
955
                    node.node_id,
956
                    target,
957
                    to_dir,
958
                    K::k() - 1
48,748✔
UNCOV
959
                )?;
×
960
            }
48,843✔
961
        }
962

963
        for (_, target, dir, _) in node.r_edges() {
97,978✔
964
            if target > node.node_id {
97,165✔
965
                let to_dir = match dir {
48,634✔
966
                    Dir::Left => "+",
29,415✔
967
                    Dir::Right => "-",
19,219✔
968
                };
969
                writeln!(
48,634✔
970
                    w,
48,634✔
971
                    "L\t{}\t+\t{}\t{}\t{}M",
48,634✔
972
                    node.node_id,
973
                    target,
974
                    to_dir,
975
                    K::k() - 1
48,634✔
UNCOV
976
                )?;
×
977
            }
48,531✔
978
        }
979

980
        Ok(())
97,978✔
981
    }
97,978✔
982

983
    /// Write the graph to GFA format
984
    pub fn to_gfa<P: AsRef<Path>>(&self, gfa_out: P) -> Result<(), Error> {
1✔
985
        let wtr = BufWriter::with_capacity(BUF, File::create(gfa_out).expect("error creating gfa file"));
1✔
986
        self.write_gfa(&mut std::io::BufWriter::new(wtr))
1✔
987
    }
1✔
988

989
    pub fn write_gfa(&self, wtr: &mut impl Write) -> Result<(), Error> {
1✔
990
        writeln!(wtr, "H\tVN:Z:debruijn-rs")?;
1✔
991

992
        type DummyFn<K, D> = fn(&Node<'_, K, D>) -> String;
993
        let dummy_opt: Option<&DummyFn<K, D>> = None;
1✔
994

995
        let pb = ProgressBar::new(self.len() as u64);
1✔
996
        pb.set_style(ProgressStyle::with_template(PROGRESS_STYLE).unwrap().progress_chars("#/-"));
1✔
997
        pb.set_message(format!("{:<32}", "writing graph to GFA file"));
1✔
998

999
        for i in (0..self.len()).progress_with(pb) {
32,658✔
1000
            let n = self.get_node(i);
32,658✔
1001
            self.node_to_gfa(&n, wtr, dummy_opt)?;
32,658✔
1002
        }
1003

1004
        wtr.flush().unwrap();
1✔
1005

1006
        Ok(())
1✔
1007
    }
1✔
1008

1009
    /// Write the graph to GFA format
1010
    pub fn to_gfa_with_tags<P: AsRef<Path>, F: Fn(&Node<'_, K, D>) -> String>(
1✔
1011
        &self,
1✔
1012
        gfa_out: P,
1✔
1013
        tag_func: F,
1✔
1014
    ) -> Result<(), Error> {
1✔
1015
        let mut wtr = BufWriter::with_capacity(BUF, File::create(gfa_out).expect("error creatinf gfa file"));
1✔
1016
        writeln!(wtr, "H\tVN:Z:debruijn-rs")?;
1✔
1017

1018
        let pb = ProgressBar::new(self.len() as u64);
1✔
1019
        pb.set_style(ProgressStyle::with_template(PROGRESS_STYLE).unwrap().progress_chars("#/-"));
1✔
1020
        pb.set_message(format!("{:<32}", "writing graph to GFA file"));
1✔
1021

1022
        for i in (0..self.len()).progress_with(pb) {
32,658✔
1023
            let n = self.get_node(i);
32,658✔
1024
            self.node_to_gfa(&n, &mut wtr, Some(&tag_func))?;
32,658✔
1025
        }
1026

1027
        wtr.flush().unwrap();
1✔
1028

1029
        Ok(())
1✔
1030
    }
1✔
1031

1032
    /// Write the graph to GFA format, with multithreading, 
1033
    /// pass `tag_func=None` to write without tags
1034
    pub fn to_gfa_otags_parallel<P: AsRef<Path> + Display + Sync, F: Fn(&Node<'_, K, D>) -> String + Sync>(
1✔
1035
        &self,
1✔
1036
        gfa_out: P,
1✔
1037
        tag_func: Option<&F>,
1✔
1038
    ) -> Result<(), Error> 
1✔
1039
    where 
1✔
1040
    K: Sync,
1✔
1041
    D: Sync,
1✔
1042
    {
1043
        // split into ranges according to thread count
1044
        let slices = current_num_threads();
1✔
1045
        let n_nodes = self.len();
1✔
1046
        let sz = n_nodes / slices + 1;
1✔
1047

1048
        debug!("n_nodes: {}", n_nodes);
1✔
1049
        debug!("sz: {}", sz);
1✔
1050

1051
        let mut parallel_ranges = Vec::with_capacity(slices);
1✔
1052
        let mut start = 0;
1✔
1053
        while start < n_nodes {
5✔
1054
            parallel_ranges.push(start..start + sz);
4✔
1055
            start += sz;
4✔
1056
        }
4✔
1057

1058
        let last_start = parallel_ranges.pop().expect("no kmers in parallel ranges").start;
1✔
1059
        parallel_ranges.push(last_start..n_nodes);
1✔
1060
        debug!("parallel ranges: {:?}", parallel_ranges);
1✔
1061

1062
        let mut files = Vec::with_capacity(current_num_threads());
1✔
1063

1064
        for i in 0..parallel_ranges.len() {
4✔
1065
            files.push(format!("{}-{}.gfa", gfa_out, i));
4✔
1066
        } 
4✔
1067

1068
        let pb = ProgressBar::new(self.len() as u64);
1✔
1069
        pb.set_style(ProgressStyle::with_template(PROGRESS_STYLE).unwrap().progress_chars("#/-"));
1✔
1070
        pb.set_message(format!("{:<32}", "writing graph to GFA file"));
1✔
1071
        
1072
        
1073
        parallel_ranges.into_par_iter().enumerate().for_each(|(i, range)| {
4✔
1074
            let mut wtr = BufWriter::with_capacity(BUF, File::create(&files[i]).expect("error creating parallel gfa file"));
4✔
1075

1076
            for i in range {
32,662✔
1077
                let n = self.get_node(i);
32,658✔
1078
                self.node_to_gfa(&n, &mut wtr, tag_func).unwrap();
32,658✔
1079
                pb.inc(1);
32,658✔
1080
            }
32,658✔
1081

1082
            wtr.flush().unwrap();
4✔
1083
        });
4✔
1084

1085
        pb.finish_and_clear();
1✔
1086

1087
        // combine files
1088
        let mut out_file = BufWriter::with_capacity(BUF, File::create(format!("{}.gfa", gfa_out)).expect("error creating combined gfa file"));
1✔
1089
        writeln!(out_file, "H\tVN:Z:debruijn-rs")?;
1✔
1090

1091
        let pb = ProgressBar::new(files.len() as u64);
1✔
1092
        pb.set_style(ProgressStyle::with_template(PROGRESS_STYLE).unwrap().progress_chars("#/-"));
1✔
1093
        pb.set_message(format!("{:<32}", "combining files"));
1✔
1094

1095
        for file in files.iter() {
4✔
1096
            let open_file = File::open(file).expect("error opening parallel gfa file");
4✔
1097
            let mut reader = BufReader::new(open_file);
4✔
1098
            let mut buffer = [0; BUF];
4✔
1099

1100
            loop {
1101
                let linecount = reader.read(&mut buffer).unwrap();
113✔
1102
                if linecount == 0 { break }
113✔
1103
                out_file.write_all(&buffer[..linecount]).unwrap();
109✔
1104
            }
1105

1106
            remove_file(file).unwrap();
4✔
1107
        }
1108

1109
        out_file.flush().unwrap();
1✔
1110

1111
        Ok(())
1✔
1112
    }
1✔
1113

1114
    /// Write the graph to GFA format
1115
    pub fn to_gfa_partial<P: AsRef<Path>, F: Fn(&Node<'_, K, D>) -> String>(&self, gfa_out: P, tag_func: Option<&F>, nodes: Vec<usize>) -> Result<(), Error> {
1✔
1116
        let mut wtr = BufWriter::with_capacity(BUF, File::create(gfa_out).expect("error creating gfa file"));
1✔
1117
        writeln!(wtr, "H\tVN:Z:debruijn-rs")?;
1✔
1118

1119
        let pb = ProgressBar::new(self.len() as u64);
1✔
1120
        pb.set_style(ProgressStyle::with_template(PROGRESS_STYLE).unwrap().progress_chars("#/-"));
1✔
1121
        pb.set_message(format!("{:<32}", "writing graph to GFA file"));
1✔
1122

1123
        for i in nodes.into_iter().progress_with(pb) {
4✔
1124
            let n = self.get_node(i);
4✔
1125
            self.node_to_gfa(&n, &mut wtr, tag_func)?;
4✔
1126
        }
1127

1128
        wtr.flush().unwrap();
1✔
1129

1130
        Ok(())    
1✔
1131
    }
1✔
1132

1133
    pub fn to_json_rest<W: Write, F: Fn(&D) -> Value>(
×
1134
        &self,
×
1135
        fmt_func: F,
×
1136
        mut writer: &mut W,
×
1137
        rest: Option<Value>,
×
1138
    ) {
×
1139
        writeln!(writer, "{{\n\"nodes\": [").unwrap();
×
1140
        for i in 0..self.len() {
×
1141
            let node = self.get_node(i);
×
1142
            node.to_json(&fmt_func, writer);
×
1143
            if i == self.len() - 1 {
×
1144
                writeln!(writer).unwrap();
×
1145
            } else {
×
1146
                writeln!(writer, ",").unwrap();
×
1147
            }
×
1148
        }
1149
        writeln!(writer, "],").unwrap();
×
1150

1151
        writeln!(writer, "\"links\": [").unwrap();
×
1152
        for i in 0..self.len() {
×
1153
            let node = self.get_node(i);
×
1154
            match node.edges_to_json(writer) {
×
1155
                true => {
1156
                    if i == self.len() - 1 {
×
1157
                        writeln!(writer).unwrap();
×
1158
                    } else {
×
1159
                        writeln!(writer, ",").unwrap();
×
1160
                    }
×
1161
                }
1162
                _ => continue,
×
1163
            }
1164
        }
1165
        writeln!(writer, "]").unwrap();
×
1166

1167
        match rest {
×
1168
            Some(Value::Object(v)) => {
×
1169
                for (k, v) in v.iter() {
×
1170
                    writeln!(writer, ",").expect("io error");
×
1171
                    write!(writer, "\"{}\": ", k).expect("io error");
×
1172
                    serde_json::to_writer(&mut writer, v).expect("io error");
×
1173
                    writeln!(writer).expect("io error");
×
1174
                }
×
1175
            }
1176
            _ => {
×
1177
                writeln!(writer).expect("io error");
×
1178
            }
×
1179
        }
1180

1181
        writeln!(writer, "}}").expect("io error");
×
1182
    }
×
1183

1184
    /// Write the graph to JSON
1185
    pub fn to_json<W: Write, F: Fn(&D) -> Value, RF: Fn(&mut W)>(
×
1186
        &self,
×
1187
        fmt_func: F,
×
1188
        writer: &mut W,
×
1189
    ) {
×
1190
        self.to_json_rest(fmt_func, writer, None);
×
1191
    }
×
1192

1193
    /// Print a text representation of the graph.
1194
    pub fn print(&self) {
2✔
1195
        println!("DebruijnGraph {{ len: {}, K: {} }} :", self.len(), K::k());
2✔
1196
        for node in self.iter_nodes() {
139✔
1197
            println!("{:?}", node);
139✔
1198
        }
139✔
1199
    }
2✔
1200

1201
    pub fn print_with_data(&self) {
×
1202
        println!("DebruijnGraph {{ len: {}, K: {} }} :", self.len(), K::k());
×
1203
        for node in self.iter_nodes() {
×
1204
            println!("{:?} ({:?})", node, node.data());
×
1205
        }
×
1206
    }
×
1207

1208
    pub fn max_path_beam<F, F2>(&self, beam: usize, score: F, _solid_path: F2) -> Vec<(usize, Dir)>
×
1209
    where
×
1210
        F: Fn(&D) -> f32,
×
1211
        F2: Fn(&D) -> bool,
×
1212
    {
1213
        if self.is_empty() {
×
1214
            return Vec::default();
×
1215
        }
×
1216

1217
        let mut states = Vec::new();
×
1218

1219
        for i in 0..self.len() {
×
1220
            let node = self.get_node(i);
×
1221

1222
            // Initialize beam search on terminal nodes
1223
            if node.exts().num_exts_l() == 0 || node.exts().num_exts_r() == 0 {
×
1224
                let dir = if node.exts().num_exts_l() > 0 {
×
1225
                    Dir::Right
×
1226
                } else {
1227
                    Dir::Left
×
1228
                };
1229

1230
                let status = if node.exts().num_exts_l() == 0 && node.exts().num_exts_r() == 0 {
×
1231
                    Status::End
×
1232
                } else {
1233
                    Status::Active
×
1234
                };
1235

1236
                let mut path = SmallVec8::new();
×
1237
                path.push((i as u32, dir));
×
1238

1239
                let s = State {
×
1240
                    path,
×
1241
                    status,
×
1242
                    score: score(node.data()),
×
1243
                };
×
1244
                states.push(s);
×
1245
            }
×
1246
        }
1247

1248
        // No end nodes -- just start on the first node
1249
        if states.is_empty() {
×
1250
            // Make a start
×
1251
            let node = self.get_node(0);
×
1252
            let mut path = SmallVec8::new();
×
1253
            path.push((0, Dir::Left));
×
1254
            states.push(State {
×
1255
                path,
×
1256
                status: Status::Active,
×
1257
                score: score(node.data()),
×
1258
            });
×
1259
        }
×
1260

1261
        // Beam search until we can't find any more expansions
1262
        let mut active = true;
×
1263
        while active {
×
1264
            let mut new_states = Vec::with_capacity(states.len());
×
1265
            active = false;
×
1266

1267
            for s in states {
×
1268
                if s.status == Status::Active {
×
1269
                    active = true;
×
1270
                    let expanded = self.expand_state(&s, &score);
×
1271
                    new_states.extend(expanded);
×
1272
                } else {
×
1273
                    new_states.push(s)
×
1274
                }
1275
            }
1276

1277
            // workaround to sort by descending score - will panic if there are NaN scores
1278
            new_states.sort_by(|a, b| (-(a.score)).partial_cmp(&-(b.score)).unwrap());
×
1279
            new_states.truncate(beam);
×
1280
            states = new_states;
×
1281
        }
1282

1283
        for (i, state) in states.iter().take(5).enumerate() {
×
1284
            trace!("i:{}  -- {:?}", i, state);
×
1285
        }
1286

1287
        // convert back to using usize for node_id
1288
        states[0]
×
1289
            .path
×
1290
            .iter()
×
1291
            .map(|&(node, dir)| (node as usize, dir))
×
1292
            .collect()
×
1293
    }
×
1294

1295
    fn expand_state<F>(&self, state: &State, score: &F) -> SmallVec4<State>
×
1296
    where
×
1297
        F: Fn(&D) -> f32,
×
1298
    {
1299
        if state.status != Status::Active {
×
1300
            panic!("only attempt to expand active states")
×
1301
        }
×
1302

1303
        let (node_id, dir) = state.path[state.path.len() - 1];
×
1304
        let node = self.get_node(node_id as usize);
×
1305
        let mut new_states = SmallVec4::new();
×
1306

1307
        for (_, next_node_id, incoming_dir, _) in node.edges(dir.flip()) {
×
1308
            let next_node = self.get_node(next_node_id);
×
1309
            let new_score = state.score + score(next_node.data());
×
1310

1311
            let cycle = state
×
1312
                .path
×
1313
                .iter()
×
1314
                .any(|&(prev_node, _)| prev_node == (next_node_id as u32));
×
1315

1316
            let status = if cycle {
×
1317
                Status::Cycle
×
1318
            } else if next_node.edges(incoming_dir.flip()).is_empty() {
×
1319
                Status::End
×
1320
            } else {
1321
                Status::Active
×
1322
            };
1323

1324
            let mut new_path = state.path.clone();
×
1325
            new_path.push((next_node_id as u32, incoming_dir));
×
1326

1327
            let next_state = State {
×
1328
                path: new_path,
×
1329
                score: new_score,
×
1330
                status,
×
1331
            };
×
1332

1333
            new_states.push(next_state);
×
1334
        }
1335

1336
        new_states
×
1337
    }
×
1338

1339

1340
    pub fn iter_components(&self) -> IterComponents<K, D> {
5✔
1341
        let mut visited: Vec<bool> = Vec::with_capacity(self.len());
5✔
1342
        let pos = 0;
5✔
1343

1344
        for _i in 0..self.len() {
33,206✔
1345
            visited.push(false);
33,206✔
1346
        }
33,206✔
1347

1348
        IterComponents { 
5✔
1349
            graph: self, 
5✔
1350
            visited, 
5✔
1351
            pos }
5✔
1352
    }
5✔
1353

1354

1355
    /// iteratively returns 2D Vec with node_ids grouped according to the connected components they form
1356
    pub fn components_i(&self) -> Vec<Vec<usize>> {
1✔
1357
        let mut components: Vec<Vec<usize>> = Vec::with_capacity(self.len());
1✔
1358
        let mut visited: Vec<bool> = Vec::with_capacity(self.len());
1✔
1359

1360
        for _i in 0..self.len() {
137✔
1361
            visited.push(false);
137✔
1362
        }
137✔
1363

1364
        for i in 0..self.len() {
137✔
1365
            if !visited[i] {
137✔
1366
                let comp = self.component_i(&mut visited, i);
2✔
1367
                components.push(comp);
2✔
1368
            }
135✔
1369
        }
1370

1371
        components
1✔
1372
    }
1✔
1373

1374
    /// recursively detects which nodes form separate graph components
1375
    /// returns 2D vector with node ids per component
1376
    /// (may lead to stack overflow)
1377
    pub fn components_r(&self) -> Vec<Vec<usize>> {
2✔
1378
        let mut components: Vec<Vec<usize>> = Vec::with_capacity(self.len());
2✔
1379
        let mut visited: Vec<bool> = Vec::with_capacity(self.len());
2✔
1380

1381
        for _i in 0..self.len() {
139✔
1382
            visited.push(false);
139✔
1383
        }
139✔
1384

1385
        for i in 0..self.len() {
139✔
1386
            if !visited[i] {
139✔
1387
                let mut comp: Vec<usize> = Vec::new();
4✔
1388
                self.component_r(&mut visited, i, &mut comp);
4✔
1389
                components.push(comp);
4✔
1390
            }
135✔
1391
        }
1392

1393
        components
2✔
1394

1395
    }
2✔
1396

1397
    fn component_r<'a>(&'a self, visited: &'a mut Vec<bool>, i: usize, comp: &'a mut Vec<usize>) {
139✔
1398
        
1399
        visited[i] = true;
139✔
1400
        comp.push(i);
139✔
1401
        let mut edges = self.find_edges(i, Dir::Left);
139✔
1402
        let mut r_edges = self.find_edges(i, Dir::Right);
139✔
1403

1404
        edges.append(&mut r_edges);
139✔
1405

1406
        for (_, edge, _, _) in edges.iter() {
270✔
1407
            if !visited[*edge] {
270✔
1408
                self.component_r(visited, *edge, comp);
135✔
1409
            }
135✔
1410
        }
1411
    }
139✔
1412

1413
    fn component_i<'a>(&'a self, visited: &'a mut [bool], i: usize) -> Vec<usize> {
243✔
1414
        let mut edges: Vec<usize> = Vec::new();
243✔
1415
        let mut comp: Vec<usize> = Vec::new();
243✔
1416

1417
        edges.push(i);
243✔
1418

1419
        while let Some(current_edge) = edges.pop() {
33,617✔
1420
            if !visited[current_edge] { 
33,374✔
1421
                comp.push(current_edge);
33,343✔
1422
                visited[current_edge] = true;
33,343✔
1423

1424
                let mut l_edges = self.find_edges(current_edge, Dir::Left);
33,343✔
1425
                let mut r_edges = self.find_edges(current_edge, Dir::Right);
33,343✔
1426

1427
                l_edges.append(&mut r_edges);
33,343✔
1428

1429
                for (_, new_edge, _, _) in l_edges.into_iter() {
66,266✔
1430
                    if !visited[new_edge] {
66,266✔
1431
                        edges.push(new_edge);
33,131✔
1432
                    }
33,135✔
1433
                }
1434
            }
31✔
1435
        }
1436
        comp
243✔
1437
    }
243✔
1438

1439
    /// iterate over all edges of the graph, item: (node, ext base, ext dir, target node)
1440
    pub fn iter_edges(&self) -> EdgeIter<'_, K, D> {
1✔
1441
        EdgeIter::new(self)
1✔
1442
    }
1✔
1443

1444
    pub fn find_bad_nodes<F: Fn(&Node<'_, K, D>) -> bool>(&self, valid: F) -> Vec<usize> {
×
1445
        let mut bad_nodes = Vec::new();
×
1446

1447
        for (i, node) in enumerate(self.iter_nodes()) {
×
1448
            if !valid(&node) { bad_nodes.push(i); }
×
1449
        }
1450
        
1451
        bad_nodes
×
1452
    }
×
1453
}
1454

1455
impl<K: Kmer, SD: Debug> DebruijnGraph<K, SD> {
1456
    pub fn create_colors<'a, 'b: 'a, DI>(&'a self, config: &SummaryConfig, color_mode: ColorMode<'b>) -> Colors<'b, SD, DI> 
×
1457
    where 
×
1458
    SD: SummaryData<DI>,
×
1459
    {
1460
        Colors::new(self, config, color_mode)
×
1461
    }
×
1462
    
1463
    /// edge mults will contain hanging edges if the nodes were filtered
1464
    pub fn fix_edge_mults<DI>(&mut self) 
2✔
1465
    where 
2✔
1466
        SD: SummaryData<DI>
2✔
1467
    {
1468
        if self.get_node(0).data().edge_mults().is_some() {
2✔
1469
            for i in 0..self.len() {
274✔
1470
                self.base.data[i].fix_edge_mults(self.base.exts[i]);
274✔
1471
            }
274✔
UNCOV
1472
        }
×
1473
    }
2✔
1474

1475
    /// if there are edge mults in the data, prune the graph by removing edges that have a low coverage
1476
    pub fn filter_edges<DI>(&mut self, min: u32) -> Result<(), String>
1✔
1477
    where 
1✔
1478
        SD: SummaryData<DI>
1✔
1479
    {
1480
        // return if there is no edge coverage available
1481
        if self.get_node(0).data().edge_mults().is_none() { return Err(String::from("no edge mults available")) };
1✔
1482

1483
        for i in 0..self.len() {
137✔
1484
            let em = self.get_node(i).data().edge_mults().expect("shold have em").clone();
137✔
1485
            let edges = [(Dir::Left, 0), (Dir::Left, 1), (Dir::Left, 2), (Dir::Left, 3), 
137✔
1486
                (Dir::Right, 0), (Dir::Right, 1), (Dir::Right, 2), (Dir::Right, 3)];
137✔
1487
            
1488
            for (dir, base) in edges {
1,233✔
1489
                if min > em.edge_mult(base, dir) {
1,096✔
1490
                    // remove invalid ext from node
826✔
1491
                    let ext = self.base.exts[i].remove(dir, base);
826✔
1492
                    self.base.exts[i] = ext;
826✔
1493
                }
826✔
1494
            }
1495
        }
1496

1497
        // now that exts are remove, fix hanging edge mults
1498
        self.fix_edge_mults();
1✔
1499

1500
        Ok(())
1✔
1501
    }
1✔
1502
}
1503

1504
#[derive(Debug, Eq, PartialEq)]
1505
enum Status {
1506
    Active,
1507
    End,
1508
    Cycle,
1509
}
1510

1511
#[derive(Debug)]
1512
struct State {
1513
    path: SmallVec8<(u32, Dir)>,
1514
    score: f32,
1515
    status: Status,
1516
}
1517

1518
impl State {}
1519

1520
/// Iterator over nodes in a `DeBruijnGraph`
1521
pub struct NodeIter<'a, K: Kmer + 'a, D: Debug + 'a> {
1522
    graph: &'a DebruijnGraph<K, D>,
1523
    node_id: usize,
1524
}
1525

1526
impl<'a, K: Kmer + 'a, D: Debug + 'a> Iterator for NodeIter<'a, K, D> {
1527
    type Item = Node<'a, K, D>;
1528

1529
    fn next(&mut self) -> Option<Node<'a, K, D>> {
538,035✔
1530
        if self.node_id < self.graph.len() {
538,035✔
1531
            let node = self.graph.get_node(self.node_id);
537,906✔
1532
            self.node_id += 1;
537,906✔
1533
            Some(node)
537,906✔
1534
        } else {
1535
            None
129✔
1536
        }
1537
    }
538,035✔
1538
}
1539

1540
impl<'a, K: Kmer + 'a, D: Debug + 'a> IntoIterator for &'a DebruijnGraph<K, D> {
1541
    type Item = NodeKmer<'a, K, D>;
1542
    type IntoIter = NodeIntoIter<'a, K, D>;
1543

1544
    fn into_iter(self) -> Self::IntoIter {
2,220✔
1545
        NodeIntoIter {
2,220✔
1546
            graph: self,
2,220✔
1547
            node_id: 0,
2,220✔
1548
        }
2,220✔
1549
    }
2,220✔
1550
}
1551

1552
/// Iterator over nodes in a `DeBruijnGraph`
1553
pub struct NodeIntoIter<'a, K: Kmer + 'a, D: Debug + 'a> {
1554
    graph: &'a DebruijnGraph<K, D>,
1555
    node_id: usize,
1556
}
1557

1558
impl<'a, K: Kmer + 'a, D: Debug + 'a> Iterator for NodeIntoIter<'a, K, D> {
1559
    type Item = NodeKmer<'a, K, D>;
1560

1561
    fn next(&mut self) -> Option<Self::Item> {
210,038✔
1562
        if self.node_id < self.graph.len() {
210,038✔
1563
            let node_id = self.node_id;
210,038✔
1564
            let node = self.graph.get_node(node_id);
210,038✔
1565
            let node_seq = node.sequence();
210,038✔
1566

1567
            self.node_id += 1;
210,038✔
1568
            Some(NodeKmer {
210,038✔
1569
                node_id,
210,038✔
1570
                node_seq_slice: node_seq,
210,038✔
1571
                phantom_d: PhantomData,
210,038✔
1572
                phantom_k: PhantomData,
210,038✔
1573
            })
210,038✔
1574
        } else {
1575
            None
×
1576
        }
1577
    }
210,038✔
1578
}
1579

1580
/// A `DebruijnGraph` node with a reference to the sequence of the node.
1581
#[derive(Clone)]
1582
pub struct NodeKmer<'a, K: Kmer + 'a, D: Debug + 'a> {
1583
    pub node_id: usize,
1584
    node_seq_slice: DnaStringSlice<'a>,
1585
    phantom_k: PhantomData<K>,
1586
    phantom_d: PhantomData<D>,
1587
}
1588

1589
/// An iterator over the kmers in a `DeBruijn graph node`
1590
pub struct NodeKmerIter<'a, K: Kmer + 'a, D: Debug + 'a> {
1591
    kmer_id: usize,
1592
    kmer: K,
1593
    num_kmers: usize,
1594
    node_seq_slice: DnaStringSlice<'a>,
1595
    phantom_k: PhantomData<K>,
1596
    phantom_d: PhantomData<D>,
1597
}
1598

1599
impl<'a, K: Kmer + 'a, D: Debug + 'a> IntoIterator for NodeKmer<'a, K, D> {
1600
    type Item = K;
1601
    type IntoIter = NodeKmerIter<'a, K, D>;
1602

1603
    fn into_iter(self) -> Self::IntoIter {
420,076✔
1604
        let num_kmers = self.node_seq_slice.len() - K::k() + 1;
420,076✔
1605

1606
        let kmer = if num_kmers > 0 {
420,076✔
1607
            self.node_seq_slice.get_kmer::<K>(0)
420,076✔
1608
        } else {
1609
            K::empty()
×
1610
        };
1611

1612
        NodeKmerIter {
420,076✔
1613
            kmer_id: 0,
420,076✔
1614
            kmer,
420,076✔
1615
            num_kmers,
420,076✔
1616
            node_seq_slice: self.node_seq_slice,
420,076✔
1617
            phantom_k: PhantomData,
420,076✔
1618
            phantom_d: PhantomData,
420,076✔
1619
        }
420,076✔
1620
    }
420,076✔
1621
}
1622

1623
impl<'a, K: Kmer + 'a, D: Debug + 'a> Iterator for NodeKmerIter<'a, K, D> {
1624
    type Item = K;
1625

1626
    fn next(&mut self) -> Option<Self::Item> {
3,532,886✔
1627
        if self.num_kmers == self.kmer_id {
3,532,886✔
1628
            None
×
1629
        } else {
1630
            let current_kmer = self.kmer;
3,532,886✔
1631

1632
            self.kmer_id += 1;
3,532,886✔
1633
            if self.kmer_id < self.num_kmers {
3,532,886✔
1634
                let next_base = self.node_seq_slice.get(self.kmer_id + K::k() - 1);
3,478,570✔
1635
                let new_kmer = self.kmer.extend_right(next_base);
3,478,570✔
1636
                self.kmer = new_kmer;
3,478,570✔
1637
            }
3,478,570✔
1638

1639
            Some(current_kmer)
3,532,886✔
1640
        }
1641
    }
3,532,886✔
1642

1643
    fn size_hint(&self) -> (usize, Option<usize>) {
210,038✔
1644
        (self.num_kmers, Some(self.num_kmers))
210,038✔
1645
    }
210,038✔
1646

1647
    /// Provide a 'fast-forward' capability for this iterator
1648
    /// MPHF will use this to reduce the number of kmers that
1649
    /// need to be produced.
1650
    fn nth(&mut self, n: usize) -> Option<Self::Item> {
2,547,588✔
1651
        if n <= 4 {
2,547,588✔
1652
            // for small skips forward, shift one base at a time
1653
            for _ in 0..n {
2,287,202✔
1654
                self.next();
985,298✔
1655
            }
985,298✔
1656
        } else {
260,386✔
1657
            self.kmer_id += n;
260,386✔
1658
            self.kmer = self.node_seq_slice.get_kmer::<K>(self.kmer_id);
260,386✔
1659
        }
260,386✔
1660

1661
        self.next()
2,547,588✔
1662
    }
2,547,588✔
1663
}
1664

1665
/// Marker signifying that NodeKmerIter has a known size.
1666
impl<'a, K: Kmer + 'a, D: Debug + 'a> ExactSizeIterator for NodeKmerIter<'a, K, D> {}
1667

1668
/// Unbranched sequence in the DeBruijn graph
1669
pub struct Node<'a, K: Kmer + 'a, D: 'a> {
1670
    pub node_id: usize,
1671
    pub graph: &'a DebruijnGraph<K, D>,
1672
}
1673

1674
impl<'a, K: Kmer, D: Debug> Node<'a, K, D> {
1675
    /// Length of the sequence of this node
1676
    pub fn len(&self) -> usize {
1,645,212✔
1677
        self.graph.base.sequences.get(self.node_id).len()
1,645,212✔
1678
    }
1,645,212✔
1679

1680
    pub fn is_empty(&self) -> bool {
×
1681
        self.graph.base.sequences.get(self.node_id).is_empty()
×
1682
    }
×
1683

1684
    /// Sequence of the node
1685
    pub fn sequence(&self) -> DnaStringSlice<'a> {
3,342,552✔
1686
        self.graph.base.sequences.get(self.node_id)
3,342,552✔
1687
    }
3,342,552✔
1688

1689
    /// Reference to auxiliarly data associated with the node
1690
    pub fn data(&self) -> &'a D {
3,954,711✔
1691
        &self.graph.base.data[self.node_id]
3,954,711✔
1692
    }
3,954,711✔
1693

1694
    /// Extension bases from this node
1695
    pub fn exts(&self) -> Exts {
2,389,420✔
1696
        self.graph.base.exts[self.node_id]
2,389,420✔
1697
    }
2,389,420✔
1698

1699
    /// Edges leaving the left side of the node in the format
1700
    /// (base, target_node id, incoming side of target node, whether target node is flipped)
1701
    pub fn l_edges(&self) -> SmallVec4<(u8, usize, Dir, bool)> {
294,101✔
1702
        self.graph.find_edges(self.node_id, Dir::Left)
294,101✔
1703
    }
294,101✔
1704

1705
    /// Edges leaving the right side of the node in the format
1706
    /// (base, target_node id, incoming side of target node, whether target node is flipped)
1707
    pub fn r_edges(&self) -> SmallVec4<(u8, usize, Dir, bool)> {
294,101✔
1708
        self.graph.find_edges(self.node_id, Dir::Right)
294,101✔
1709
    }
294,101✔
1710

1711
    /// Edges leaving the 'dir' side of the node in the format
1712
    /// (base, target_node id, incoming side of target node, whether target node is flipped)
1713
    pub fn edges(&self, dir: Dir) -> SmallVec4<(u8, usize, Dir, bool)> {
400,242✔
1714
        self.graph.find_edges(self.node_id, dir)
400,242✔
1715
    }
400,242✔
1716

1717
    fn to_json<F: Fn(&D) -> Value>(&self, func: &F, f: &mut dyn Write) {
×
1718
        write!(
×
1719
            f,
×
1720
            "{{\"id\":\"{}\",\"L\":{},\"D\":{},\"Se\":\"{:?}\"}}",
×
1721
            self.node_id,
1722
            self.sequence().len(),
×
1723
            (func)(self.data()),
×
1724
            self.sequence(),
×
1725
        )
1726
        .unwrap();
×
1727
    }
×
1728

1729
    fn edges_to_json(&self, f: &mut dyn Write) -> bool {
×
1730
        let mut wrote = false;
×
1731
        let edges = self.r_edges();
×
1732
        for (idx, &(_, id, incoming_dir, _)) in edges.iter().enumerate() {
×
1733
            write!(
×
1734
                f,
×
1735
                "{{\"source\":\"{}\",\"target\":\"{}\",\"D\":\"{}\"}}",
×
1736
                self.node_id,
1737
                id,
1738
                match incoming_dir {
×
1739
                    Dir::Left => "L",
×
1740
                    Dir::Right => "R",
×
1741
                }
1742
            )
1743
            .unwrap();
×
1744

1745
            if idx < edges.len() - 1 {
×
1746
                write!(f, ",").unwrap();
×
1747
            }
×
1748

1749
            wrote = true;
×
1750
        }
1751
        wrote
×
1752
    }
×
1753
}
1754

1755
// TODO make generic instead of u8 (u8 is sufficient for dbg)
1756
impl<K: Kmer, SD: Debug> Node<'_, K, SD>  {
1757
    /// get default format for dot edges based on node data
1758
    pub fn edge_dot_default<DI>(&self, colors: &Colors<SD, DI>, base: u8, incoming_dir: Dir, flipped: bool) -> String 
194,761✔
1759
    where SD: SummaryData<DI>
194,761✔
1760
    {
1761
        // set color based on dir
1762
        let color = match incoming_dir {
194,761✔
1763
            Dir::Left => "blue",
97,593✔
1764
            Dir::Right => "red"
97,168✔
1765
        };
1766
        
1767
        if let Some(em) = self.data().edge_mults() {
194,761✔
1768
            
1769
            let dir = if flipped { 
194,761✔
1770
                incoming_dir 
77,306✔
1771
            } else {
1772
                incoming_dir.flip()
117,455✔
1773
            };
1774

1775
            // set penwidth based on count
1776
            let count = em.edge_mult(base, dir);
194,761✔
1777
            let penwidth = colors.edge_width(count);
194,761✔
1778

1779
            format!("[color={color}, penwidth={penwidth}, label=\"{}: {count}\"]", bits_to_base(base))
194,761✔
1780
        } else {
NEW
1781
            format!("[color={color}, penwidth={}]", colors.edge_width(1)) // since there should be no edge mults, this will return default value
×
1782
        }
1783
    }
194,761✔
1784

1785
    /// get default format for dot nodes, based on node data
1786
    pub fn node_dot_default<DI>(&self, colors: &Colors<SD, DI>, config: &SummaryConfig, translator: &Translator, outline: bool) -> String
97,994✔
1787
    where SD: SummaryData<DI>
97,994✔
1788
    {
1789
        // set color based on labels/fold change/p-value
1790
        let color = colors.node_color(self.data(), config, outline);
97,994✔
1791

1792
        let data_info = self.data().print(translator, config);
97,994✔
1793
        const MIN_TEXT_WIDTH: usize = 40;
1794
        let wrap = if self.len() > MIN_TEXT_WIDTH { self.len() } else { MIN_TEXT_WIDTH };
97,994✔
1795

1796
        let label = textwrap::fill(&format!("id: {}, len: {}, exts: {:?}, seq: {}\n{}", 
97,994✔
1797
            self.node_id,
97,994✔
1798
            self.len(),
97,994✔
1799
            self.exts(),
97,994✔
1800
            self.sequence(),
97,994✔
1801
            data_info
97,994✔
1802
        ), wrap);
97,994✔
1803

1804
        format!("[style=filled, {color}, label=\"{label}\"]")
97,994✔
1805
    }
97,994✔
1806
}
1807

1808
impl<K: Kmer, D> fmt::Debug for Node<'_, K, D>
1809
where
1810
    D: Debug,
1811
{
1812
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
139✔
1813
        write!(
139✔
1814
            f,
139✔
1815
            "Node {{ id:{}, Exts: {:?}, L:{:?} R:{:?}, Seq: {:?}, Data: {:?} }}",
139✔
1816
            self.node_id,
1817
            self.exts(),
139✔
1818
            self.l_edges(),
139✔
1819
            self.r_edges(),
139✔
1820
            self.sequence().len(),
139✔
1821
            self.data()
139✔
1822
        )
1823
    }
139✔
1824
}
1825

1826
pub struct IterComponents<'a, K: Kmer, D> {
1827
    graph: &'a DebruijnGraph<K, D>,
1828
    visited: Vec<bool>,
1829
    pos: usize,
1830
}
1831

1832
impl<K: Kmer, D: Debug> Iterator for IterComponents<'_, K, D> {
1833
    type Item = Vec<usize>;
1834
    fn next(&mut self) -> Option<Self::Item> {
246✔
1835
        while self.pos < self.graph.len() {
33,211✔
1836
            if !self.visited[self.pos] {
33,206✔
1837
                let comp = self.graph.component_i(&mut self.visited, self.pos);
241✔
1838
                self.pos += 1;
241✔
1839
                return Some(comp)
241✔
1840
            } else {
32,965✔
1841
                self.pos += 1;
32,965✔
1842
            }
32,965✔
1843
        }
1844
        assert!(self.visited.iter().map(|x| *x as usize).sum::<usize>() == self.graph.len());
33,206✔
1845
        None
5✔
1846
    }
246✔
1847
    
1848
}
1849

1850
pub struct PathCompIter<'a, K: Kmer, D: Debug, F, F2> 
1851
where 
1852
F: Fn(&D) -> f32,
1853
F2: Fn(&D) -> bool
1854
{
1855
    graph: &'a DebruijnGraph<K, D>,
1856
    component_iterator: IterComponents<'a, K, D>,
1857
    graph_pos: usize,
1858
    score: F,
1859
    solid_path: F2,
1860
}
1861

1862
/// returns the component and the "best" path in the component
1863
impl<K: Kmer, D: Debug, F, F2> Iterator for PathCompIter<'_, K, D, F, F2> 
1864
where 
1865
F: Fn(&D) -> f32,
1866
F2: Fn(&D) -> bool
1867
{
1868
    type Item = (Vec<usize>, VecDeque<(usize, Dir)>,);
1869
    fn next(&mut self) -> Option<Self::Item> {
240✔
1870
        match self.component_iterator.next() {
240✔
1871
            Some(component) => {
237✔
1872
                let current_comp = component;
237✔
1873
                
1874
    
1875
                let mut best_node = current_comp[0];
237✔
1876
                let mut best_score = f32::MIN;
237✔
1877
                for c in current_comp.iter() {
32,932✔
1878
                    let node = self.graph.get_node(*c);
32,932✔
1879
                    let node_score = (self.score)(node.data());
32,932✔
1880
    
1881
                    if node_score > best_score {
32,932✔
1882
                        best_node = *c;
281✔
1883
                        best_score = node_score;
281✔
1884
                    }
32,651✔
1885
                }
1886
    
1887
                let oscore = |state| match state {
60,158✔
1888
                    None => 0.0,
29,999✔
1889
                    Some((id, _)) => (self.score)(self.graph.get_node(id).data()),
30,159✔
1890
                };
60,158✔
1891
    
1892
                let osolid_path = |state| match state {
30,079✔
1893
                    None => false,
×
1894
                    Some((id, _)) => (self.solid_path)(self.graph.get_node(id).data()),
30,079✔
1895
                };
30,079✔
1896
    
1897
                // Now expand in each direction, greedily taking the best path. Stop if we hit a node we've
1898
                // already put into the path
1899
                let mut used_nodes = HashSet::new();
237✔
1900
                let mut path = VecDeque::new();
237✔
1901
    
1902
                // Start w/ initial state
1903
                used_nodes.insert(best_node);
237✔
1904
                path.push_front((best_node, Dir::Left));
237✔
1905
    
1906
                for init in [(best_node, Dir::Left, false), (best_node, Dir::Right, true)].iter() {
474✔
1907
                    let &(start_node, dir, do_flip) = init;
474✔
1908
                    let mut current = (start_node, dir);
474✔
1909
    
1910
                    loop {
1911
                        let mut next = None;
30,470✔
1912
                        let (cur_id, incoming_dir) = current;
30,470✔
1913
                        let node = self.graph.get_node(cur_id);
30,470✔
1914
                        let edges = node.edges(incoming_dir.flip());
30,470✔
1915
    
1916
                        let mut solid_paths = 0;
30,470✔
1917
                        for (_, id, dir, _) in edges {
60,549✔
1918
                            let cand = Some((id, dir));
30,079✔
1919
                            if osolid_path(cand) {
30,079✔
1920
                                solid_paths += 1;
30,079✔
1921

1922
                                // second if clause is outside of first in original code (see max_path) 
1923
                                // but would basically ignore path validity.
1924
                                if oscore(cand) > oscore(next) {
30,079✔
1925
                                    next = cand;
30,037✔
1926
                                }
30,037✔
1927
                            }
×
1928
                        }
1929
                        
1930
                        // break if multiple solid paths are available
1931
                        /* if solid_paths > 1 {
1932
                            break;
1933
                        } */
1934
    
1935
                        match next {
29,999✔
1936
                            Some((next_id, next_incoming)) if !used_nodes.contains(&next_id) => {
29,999✔
1937
                                if do_flip {
29,996✔
1938
                                    path.push_front((next_id, next_incoming.flip()));
15,087✔
1939
                                } else {
15,087✔
1940
                                    path.push_back((next_id, next_incoming));
14,909✔
1941
                                }
14,909✔
1942
    
1943
                                used_nodes.insert(next_id);
29,996✔
1944
                                current = (next_id, next_incoming);
29,996✔
1945
                            }
1946
                            _ => break,
474✔
1947
                        }
1948
                    }
1949
                }
1950
                
1951
                
1952
                Some((current_comp, path))
237✔
1953
            }, 
1954
            None => {
1955
                // should technically not need graph_pos after this 
1956
                self.graph_pos += 1;
3✔
1957
                None
3✔
1958
            }
1959
        }
1960
    }
240✔
1961
}
1962

1963

1964
/// iterator over the edges of the de bruijn graph
1965
pub struct EdgeIter<'a, K: Kmer, D: Debug> {
1966
    graph: &'a DebruijnGraph<K, D>,
1967
    visited_edges: HashSet<(usize, usize)>,
1968
    current_node: usize,
1969
    current_dir: Dir,
1970
    node_edge_iter: smallvec::IntoIter<[(u8, usize, Dir, bool); 4]>
1971
}
1972

1973
impl<K: Kmer, D: Debug> EdgeIter<'_, K, D> {
1974
    pub fn new(graph: &DebruijnGraph<K, D>) -> EdgeIter<'_, K, D>{
1✔
1975
        let node_edge_iter = graph.get_node(0).l_edges().into_iter();
1✔
1976

1977
        EdgeIter { 
1✔
1978
            graph, 
1✔
1979
            visited_edges: HashSet::new(), 
1✔
1980
            current_node: 0, 
1✔
1981
            current_dir: Dir::Left, 
1✔
1982
            node_edge_iter
1✔
1983
        }
1✔
1984
    }
1✔
1985
}
1986

1987
impl<K: Kmer, D: Debug> Iterator for EdgeIter<'_, K, D> {
1988
    type Item = (usize, Dir, u8, usize); // node, direction leaving node, base, target node
1989

1990
    fn next(&mut self) -> Option<Self::Item> {
27✔
1991
        loop {
1992
            if let Some((base, nb_node_id, _, _)) = self.node_edge_iter.next() {
108✔
1993
                let edge = if self.current_node > nb_node_id { (nb_node_id, self.current_node) } else { (self.current_node, nb_node_id) };
52✔
1994

1995
                if self.visited_edges.insert(edge) { return Some((self.current_node, self.current_dir, base, nb_node_id)); } // else simply skip and move on
52✔
1996

1997
            } else {
1998
                match self.current_dir {
56✔
1999
                Dir::Left => {
28✔
2000
                    // no left edges, switch to right edges
28✔
2001
                    self.current_dir = Dir::Right;
28✔
2002
                    self.node_edge_iter = self.graph.get_node(self.current_node).r_edges().into_iter();
28✔
2003
                    
28✔
2004
                }
28✔
2005
                Dir::Right => {
2006
                    // no right edges, switch to next node left edges
2007
                    self.current_node += 1;
28✔
2008

2009
                    // quit if end of graph is reached
2010
                    if self.current_node == self.graph.len() { return None }
28✔
2011

2012
                    self.current_dir = Dir::Left;
27✔
2013
                    self.node_edge_iter = self.graph.get_node(self.current_node).l_edges().into_iter();
27✔
2014
                }
2015
            }
2016
            }
2017
            
2018
        }
2019
    }
27✔
2020
}
2021

2022
#[cfg(test)]
2023
mod test {
2024
    use std::{fs::File, io::BufReader};
2025

2026
    use crate::{kmer::Kmer16, summarizer::TagsCountsSumData};
2027

2028
    use super::DebruijnGraph;
2029
    use crate::{summarizer::SummaryData, Dir, BUF};
2030

2031

2032
    #[test]
2033
    #[cfg(not(feature = "sample128"))]
2034
    fn test_components() {
2035

2036
        let path = "test_data/400.graph.dbg";
2037
        let file = BufReader::with_capacity(BUF, File::open(path).unwrap());
2038

2039
        let (graph, _, _): (DebruijnGraph<Kmer16, TagsCountsSumData>, Vec<String>, crate::summarizer::SummaryConfig) = 
2040
            bincode::deserialize_from(file).expect("error deserializing graph");
2041

2042
        let components = graph.iter_components();
2043

2044
        let check_components = [
2045
            vec![3, 67, 130, 133, 59, 119, 97, 110, 68, 137, 29, 84, 131, 43, 30, 91, 14, 70, 79, 142, 136, 105, 103, 62, 
2046
                141, 104, 134, 88, 38, 81, 108, 92, 135, 96, 116, 121, 63, 124, 106, 129, 132, 126, 93, 109, 83, 112, 118, 
2047
                123, 125, 78, 122, 115, 75, 128, 140, 111, 26, 143, 113],
2048
            vec![41, 138, 100, 139, 86],
2049
            vec![53, 117, 127],
2050
            vec![69, 144, 77, 120, 114, 107, 101],
2051
        ];
2052

2053
        let mut counter = 0;
2054

2055
        for component in components {
2056
            if component.len() > 1 { 
2057
                println!("component: {:?}", component);
2058
                assert_eq!(component, check_components[counter]);
2059
                counter += 1;
2060
            }
2061
        }
2062
        assert_eq!(vec![(139, Dir::Left)], graph.max_path(|data| data.sum().unwrap_or(1) as f32, |_| true));
2063
    }
2064

2065
    #[test]
2066
    fn test_iter_edges() {
1✔
2067
        use crate::{compression::uncompressed_graph, filter::filter_kmers, reads::{Reads, ReadsPaired}, summarizer::{SampleInfo, SummaryConfig, TagsData}, Exts};
2068

2069
        let read1 = "CAGCATCGATGCGACGAGCGCTCGCATCGA".as_bytes();
1✔
2070
        let read2 = "ACGATCGTACGTAGCTAGCTGACTGAGC".as_bytes();
1✔
2071

2072
        let mut reads = Reads::new(crate::reads::Strandedness::Forward);
1✔
2073
        reads.add_from_bytes(read1, Exts::empty(), 0u8);
1✔
2074
        reads.add_from_bytes(read2, Exts::empty(), 1);
1✔
2075

2076
        let reads_paired = ReadsPaired::Unpaired { reads };
1✔
2077

2078
        let sample_info = SampleInfo::new(0b1, 0b10, 1, 1, vec![12, 12]);
1✔
2079
        let summary_config = SummaryConfig::new(1, None, crate::summarizer::GroupFrac::None, 0.3, sample_info, None, crate::summarizer::StatTest::WelchsTTest);
1✔
2080
        let (kmers, _) = filter_kmers::<TagsData, Kmer16, _>(&reads_paired, &summary_config, false, 1, false);
1✔
2081

2082
        let graph = uncompressed_graph(&kmers).finish();
1✔
2083

2084
        let check_edges: Vec<(usize, Dir, u8, usize)> = vec![(0, Dir::Left, 2, 16), (0, Dir::Right, 1, 2), (1, Dir::Left, 0, 11), 
1✔
2085
        (1, Dir::Right, 0, 16), (3, Dir::Left, 0, 13), (3, Dir::Right, 3, 10), (4, Dir::Left, 2, 21), (4, Dir::Right, 1, 17), (5, Dir::Left, 1, 27), 
1✔
2086
        (5, Dir::Right, 2, 19), (6, Dir::Left, 1, 24), (6, Dir::Right, 2, 12), (7, Dir::Left, 2, 23), (7, Dir::Right, 3, 11), (8, Dir::Left, 0, 12), 
1✔
2087
        (9, Dir::Left, 3, 17), (9, Dir::Right, 3, 24), (10, Dir::Right, 1, 21), (13, Dir::Left, 1, 18), (14, Dir::Right, 0, 26), 
1✔
2088
        (15, Dir::Left, 2, 20), (15, Dir::Right, 3, 22), (18, Dir::Left, 2, 19), (20, Dir::Left, 1, 26), (22, Dir::Right, 2, 25), (23, Dir::Left, 1, 25)];
1✔
2089

2090
        let edges = graph.iter_edges().collect::<Vec<_>>();
1✔
2091

2092
        assert_eq!(check_edges, edges);
1✔
2093
    }
1✔
2094
}
2095

STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2026 Coveralls, Inc