• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

jlab / rust-debruijn / 17153846953

22 Aug 2025 11:17AM UTC coverage: 87.371% (+0.08%) from 87.287%
17153846953

Pull #28

github

web-flow
Merge a7214fb43 into f0284da73
Pull Request #28: New kmer sizes, edge filter

404 of 408 new or added lines in 5 files covered. (99.02%)

7 existing lines in 2 files now uncovered.

6953 of 7958 relevant lines covered (87.37%)

4871061.8 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

75.1
/src/graph.rs
1
// Copyright 2017 10x Genomics
2

3
//! Containers for path-compressed De Bruijn graphs
4

5
use bit_set::BitSet;
6
use indicatif::ProgressBar;
7
use indicatif::ProgressIterator;
8
use indicatif::ProgressStyle;
9
use itertools::enumerate;
10
use log::{debug, trace};
11
use rayon::prelude::*;
12
use rayon::current_num_threads;
13
use serde_derive::{Deserialize, Serialize};
14
use smallvec::SmallVec;
15
use std::borrow::Borrow;
16

17
use std::collections::HashSet;
18
use std::collections::VecDeque;
19
use std::f32;
20
use std::fmt::{self, Debug, Display};
21
use std::fs::{remove_file, File};
22
use std::hash::Hash;
23
use std::io::BufWriter;
24
use std::io::{BufReader, Error, Read};
25
use std::io::Write;
26
use std::iter::FromIterator;
27
use std::marker::PhantomData;
28
use std::path::Path;
29

30
use boomphf::hashmap::BoomHashMap;
31

32
use serde_json;
33
use serde_json::Value;
34

35
type SmallVec4<T> = SmallVec<[T; 4]>;
36
type SmallVec8<T> = SmallVec<[T; 8]>;
37

38
use crate::bits_to_base;
39
use crate::colors::ColorMode;
40
use crate::colors::Colors;
41
use crate::compression::CompressionSpec;
42
use crate::dna_string::{DnaString, DnaStringSlice, PackedDnaStringSet};
43
use crate::summarizer::SummaryConfig;
44
use crate::summarizer::SummaryData;
45
use crate::summarizer::Translator;
46
use crate::BUF;
47
use crate::PROGRESS_STYLE;
48
use crate::{Dir, Exts, Kmer, Mer, Vmer};
49

50
/// A compressed DeBruijn graph carrying auxiliary data on each node of type `D`.
51
/// This type does not carry the sorted index arrays the allow the graph
52
/// to be walked efficiently. The `DeBruijnGraph` type wraps this type and add those
53
/// vectors.
54
#[derive(Serialize, Deserialize, Clone, Debug)]
55
pub struct BaseGraph<K, D> {
56
    pub sequences: PackedDnaStringSet,
57
    pub exts: Vec<Exts>,
58
    pub data: Vec<D>,
59
    pub stranded: bool,
60
    phantom: PhantomData<K>,
61
}
62

63
impl<K, D> BaseGraph<K, D> {
64
    pub fn new(stranded: bool) -> Self {
2,477✔
65
        BaseGraph {
2,477✔
66
            sequences: PackedDnaStringSet::new(),
2,477✔
67
            exts: Vec::new(),
2,477✔
68
            data: Vec::new(),
2,477✔
69
            phantom: PhantomData,
2,477✔
70
            stranded,
2,477✔
71
        }
2,477✔
72
    }
2,477✔
73

74
    pub fn len(&self) -> usize {
842,291✔
75
        self.sequences.len()
842,291✔
76
    }
842,291✔
77

78
    pub fn is_empty(&self) -> bool {
1✔
79
        self.sequences.is_empty()
1✔
80
    }
1✔
81

82
    pub fn combine<I: Iterator<Item = BaseGraph<K, D>>>(graphs: I) -> Self {
12✔
83
        let mut sequences = PackedDnaStringSet::new();
12✔
84
        let mut exts = Vec::new();
12✔
85
        let mut data = Vec::new();
12✔
86
        let mut stranded = Vec::new();
12✔
87

88
        for g in graphs {
1,796✔
89
            for s in 0..g.sequences.len() {
7,939✔
90
                sequences.add(&g.sequences.get(s));
7,939✔
91
            }
7,939✔
92

93
            exts.extend(g.exts);
1,784✔
94
            data.extend(g.data);
1,784✔
95
            stranded.push(g.stranded);
1,784✔
96
        }
97

98
        let out_stranded = stranded.iter().all(|x| *x);
12✔
99

100
        if !out_stranded && !stranded.iter().all(|x| !*x) {
1,784✔
101
            panic!("attempted to combine stranded and unstranded graphs");
×
102
        }
12✔
103

104
        BaseGraph {
12✔
105
            sequences,
12✔
106
            stranded: out_stranded,
12✔
107
            exts,
12✔
108
            data,
12✔
109
            phantom: PhantomData,
12✔
110
        }
12✔
111
    }
12✔
112
}
113

114
impl<K: Kmer, D> BaseGraph<K, D> {
115
    pub fn add<R: Borrow<u8>, S: IntoIterator<Item = R>>(
1,503,561✔
116
        &mut self,
1,503,561✔
117
        sequence: S,
1,503,561✔
118
        exts: Exts,
1,503,561✔
119
        data: D,
1,503,561✔
120
    ) {
1,503,561✔
121
        self.sequences.add(sequence);
1,503,561✔
122
        self.exts.push(exts);
1,503,561✔
123
        self.data.push(data);
1,503,561✔
124
    }
1,503,561✔
125
}
126

127
impl<K: Kmer + Send + Sync, D> BaseGraph<K, D> {
128
    pub fn finish(self) -> DebruijnGraph<K, D> {
582✔
129
        let indices: Vec<u32> = (0..self.len() as u32).collect();
582✔
130

131
        let left_order = {
582✔
132
            let mut kmers: Vec<K> = Vec::with_capacity(self.len());
582✔
133
            for idx in &indices {
795,367✔
134
                kmers.push(self.sequences.get(*idx as usize).first_kmer());
794,785✔
135
                
794,785✔
136
            }
794,785✔
137
            
138
            BoomHashMap::new_parallel(kmers, indices.clone())
582✔
139
        };
140

141
        let right_order = {
582✔
142
            let mut kmers: Vec<K> = Vec::with_capacity(self.len());
582✔
143
            for idx in &indices {
795,367✔
144
                kmers.push(self.sequences.get(*idx as usize).last_kmer());
794,785✔
145
            }
794,785✔
146
            
147
            BoomHashMap::new_parallel(kmers, indices)
582✔
148
        };
149
        debug!("finish graph loops: 2x {}", self.len());
582✔
150

151
        DebruijnGraph {
582✔
152
            base: self,
582✔
153
            left_order,
582✔
154
            right_order,
582✔
155
        }
582✔
156
    }
582✔
157
}
158

159
impl<K: Kmer, D> BaseGraph<K, D> {
160
    pub fn finish_serial(self) -> DebruijnGraph<K, D> {
111✔
161
        let indices: Vec<u32> = (0..self.len() as u32).collect();
111✔
162

163
        let left_order = {
111✔
164
            let mut kmers: Vec<K> = Vec::with_capacity(self.len());
111✔
165
            let mut sequences: Vec<String> = Vec::new();
111✔
166
            for idx in &indices {
708,153✔
167
                kmers.push(self.sequences.get(*idx as usize).first_kmer());
708,042✔
168
                sequences.push(self.sequences.get(*idx as usize).to_dna_string());
708,042✔
169
            }
708,042✔
170
            println!("left kmers: {:?}", kmers);
111✔
171
            println!("left seqs: {:?}", sequences);
111✔
172
            BoomHashMap::new(kmers, indices.clone())
111✔
173
        };
174

175
        let right_order = {
111✔
176
            let mut kmers: Vec<K> = Vec::with_capacity(self.len());
111✔
177
            let mut sequences: Vec<String> = Vec::new();
111✔
178
            for idx in &indices {
708,153✔
179
                kmers.push(self.sequences.get(*idx as usize).last_kmer());
708,042✔
180
                sequences.push(self.sequences.get(*idx as usize).to_dna_string());
708,042✔
181
            }
708,042✔
182
            println!("right kmers: {:?}", kmers);
111✔
183
            println!("right seqs: {:?}", sequences);
111✔
184
            BoomHashMap::new(kmers, indices)
111✔
185
        };
186

187
        DebruijnGraph {
111✔
188
            base: self,
111✔
189
            left_order,
111✔
190
            right_order,
111✔
191
        }
111✔
192
    }
111✔
193
}
194

195
/// A compressed DeBruijn graph carrying auxiliary data on each node of type `D`.
196
/// The struct carries sorted index arrays the allow the graph
197
/// to be walked efficiently.
198
#[derive(Serialize, Deserialize, Debug)]
199
pub struct DebruijnGraph<K: Hash, D> {
200
    pub base: BaseGraph<K, D>,
201
    left_order: BoomHashMap<K, u32>,
202
    right_order: BoomHashMap<K, u32>,
203
}
204

205
impl<K: Kmer, D: Debug> DebruijnGraph<K, D> {
206
    /// Total number of nodes in the DeBruijn graph
207
    pub fn len(&self) -> usize {
840,188✔
208
        self.base.len()
840,188✔
209
    }
840,188✔
210

211
    pub fn is_empty(&self) -> bool {
1✔
212
        self.base.is_empty()
1✔
213
    }
1✔
214

215
    /// Get a node given it's `node_id`
216
    pub fn get_node(&self, node_id: usize) -> Node<K, D> {
5,086,412✔
217
        Node {
5,086,412✔
218
            node_id,
5,086,412✔
219
            graph: self,
5,086,412✔
220
        }
5,086,412✔
221
    }
5,086,412✔
222

223
    /// Get a node given it's `node_id`
224
    pub fn get_node_kmer(&self, node_id: usize) -> NodeKmer<K, D> {
×
225
        let node = self.get_node(node_id);
×
226
        let node_seq = node.sequence();
×
227

228
        NodeKmer {
×
229
            node_id,
×
230
            node_seq_slice: node_seq,
×
231
            phantom_d: PhantomData,
×
232
            phantom_k: PhantomData,
×
233
        }
×
234
    }
×
235

236
    /// Return an iterator over all nodes in the graph
237
    pub fn iter_nodes(&self) -> NodeIter<K, D> {
129✔
238
        NodeIter {
129✔
239
            graph: self,
129✔
240
            node_id: 0,
129✔
241
        }
129✔
242
    }
129✔
243

244
    /// Find the edges leaving node `node_id` in direction `Dir`. Should generally be
245
    /// accessed via a Node wrapper object
246
    fn find_edges(&self, node_id: usize, dir: Dir) -> SmallVec4<(u8, usize, Dir, bool)> {
1,149,755✔
247
        let exts = self.base.exts[node_id];
1,149,755✔
248
        let sequence = self.base.sequences.get(node_id);
1,149,755✔
249
        let kmer: K = sequence.term_kmer(dir);
1,149,755✔
250
        let mut edges = SmallVec4::new();
1,149,755✔
251

252
        for i in 0..4 {
5,748,775✔
253
            if exts.has_ext(dir, i) {
4,599,020✔
254
                let link = self.find_link(kmer.extend(i, dir), dir).expect("missing link");
1,577,430✔
255
                edges.push((i, link.0, link.1, link.2));
1,577,430✔
256
            }
3,021,590✔
257
        }
258

259
        edges
1,149,755✔
260
    }
1,149,755✔
261

262
    /// Find the edges leaving node `node_id` in direction `Dir`. Should generally be
263
    /// accessed via a Node wrapper object
264
    /// 
265
    /// allows missing links
266
    fn _find_edges_sharded(&self, node_id: usize, dir: Dir) -> SmallVec4<(u8, usize, Dir, bool)> {
×
267
        let exts = self.base.exts[node_id];
×
268
        let sequence = self.base.sequences.get(node_id);
×
269
        let kmer: K = sequence.term_kmer(dir);
×
270
        let mut edges = SmallVec4::new();
×
271

272
        for i in 0..4 {
×
273
            if exts.has_ext(dir, i) {
×
274
                let link = self.find_link(kmer.extend(i, dir), dir); //.expect("missing link");
×
275
                if let Some(l) = link {
×
276
                    edges.push((i, l.0, l.1, l.2));
×
277
                }
×
278
                // Otherwise, this edge doesn't exist within this shard, so ignore it.
279
                // NOTE: this should be allowed in a 'complete' DBG
280
            }
×
281
        }
282

283
        edges
×
284
    }
×
285

286
    /// Seach for the kmer `kmer`, appearing at the given `side` of a node sequence.
287
    fn search_kmer(&self, kmer: K, side: Dir) -> Option<usize> {
5,460,523✔
288
        match side {
5,460,523✔
289
            Dir::Left => self.left_order.get(&kmer).map(|pos| *pos as usize),
2,728,559✔
290
            Dir::Right => self.right_order.get(&kmer).map(|pos| *pos as usize),
2,731,964✔
291
        }
292
    }
5,460,523✔
293

294
    /// Find a link in the graph, possibly handling a RC switch.
295
    pub fn find_link(&self, kmer: K, dir: Dir) -> Option<(usize, Dir, bool)> {
3,813,518✔
296
        // Only test self-consistent paths through
297
        // the edges
298
        // Avoids problems due to single kmer edges
299
        // (dir, next_side_incoming, rc)
300
        // (Dir::Left, Dir::Right, false) => true,
301
        // (Dir::Left, Dir::Left,  true) => true,
302
        // (Dir::Right, Dir::Left, false) => true,
303
        // (Dir::Right, Dir::Right, true) => true,
304

305
        let rc = kmer.rc();
3,813,518✔
306

307
        match dir {
3,813,518✔
308
            Dir::Left => {
309
                if let Some(idx) = self.search_kmer(kmer, Dir::Right) {
1,909,764✔
310
                    return Some((idx, Dir::Right, false));
1,084,959✔
311
                }
824,805✔
312

313
                if !self.base.stranded {
824,805✔
314
                    if let Some(idx) = self.search_kmer(rc, Dir::Left) {
824,805✔
315
                        return Some((idx, Dir::Left, true));
824,805✔
316
                    }
×
317
                }
×
318
            }
319

320
            Dir::Right => {
321
                if let Some(idx) = self.search_kmer(kmer, Dir::Left) {
1,903,754✔
322
                    return Some((idx, Dir::Left, false));
1,081,554✔
323
                }
822,200✔
324

325
                if !self.base.stranded {
822,200✔
326
                    if let Some(idx) = self.search_kmer(rc, Dir::Right) {
822,200✔
327
                        return Some((idx, Dir::Right, true));
822,200✔
328
                    }
×
329
                }
×
330
            }
331
        }
332

333
        None
×
334
    }
3,813,518✔
335

336
    /// Check whether the graph is fully compressed. Return `None` if it's compressed,
337
    /// otherwise return `Some(node1, node2)` representing a pair of node that could
338
    /// be collapsed. Probably only useful for testing.
339
    pub fn is_compressed<S: CompressionSpec<D>>(&self, spec: &S) -> Option<(usize, usize)> {
789✔
340
        for i in 0..self.len() {
135,798✔
341
            let n = self.get_node(i);
135,798✔
342

343
            for dir in &[Dir::Left, Dir::Right] {
407,394✔
344
                let dir_edges = n.edges(*dir);
271,596✔
345
                if dir_edges.len() == 1 {
271,596✔
346
                    let (_, next_id, return_dir, _) = dir_edges[0];
192,376✔
347
                    let next = self.get_node(next_id);
192,376✔
348

349
                    let ret_edges = next.edges(return_dir);
192,376✔
350
                    if ret_edges.len() == 1 {
192,376✔
351
                        // Test for us being a palindrome: this makes it OK
352
                        if n.len() == K::k() && n.sequence().first_kmer::<K>().is_palindrome() {
24✔
353
                            continue;
16✔
354
                        }
8✔
355

356
                        // Test for a neighbor being a palindrome: this makes it OK
357
                        if next.len() == K::k() && next.sequence().first_kmer::<K>().is_palindrome()
8✔
358
                        {
359
                            continue;
8✔
360
                        }
×
361

362
                        // Test for this edge representing a smooth circle (biting it's own tail):
363
                        if n.node_id == next_id {
×
364
                            continue;
×
365
                        }
×
366

367
                        if spec.join_test(n.data(), next.data()) {
×
368
                            // Found a unbranched edge that should have been eliminated
369
                            return Some((i, next_id));
×
370
                        }
×
371
                    }
192,352✔
372
                }
79,220✔
373
            }
374
        }
375

376
        None
789✔
377
    }
789✔
378

379
    /// Remove non-existent extensions that may be created due to filtered kmers
380
    /// 
381
    /// if `valid_nodes` if `None`, all nodes are valid
382
    pub fn fix_exts(&mut self, valid_nodes: Option<&BitSet>) {
247✔
383
        for i in 0..self.len() {
737,561✔
384
            let valid_exts = self.get_valid_exts(i, valid_nodes);
737,561✔
385
            self.base.exts[i] = valid_exts;
737,561✔
386
        }
737,561✔
387
    }
247✔
388

389
    pub fn get_valid_exts(&self, node_id: usize, valid_nodes: Option<&BitSet>) -> Exts {
737,561✔
390
        let mut new_exts = Exts::empty();
737,561✔
391
        let node = self.get_node(node_id);
737,561✔
392
        let exts = node.exts();
737,561✔
393
        let l_kmer: K = node.sequence().first_kmer();
737,561✔
394
        let r_kmer: K = node.sequence().last_kmer();
737,561✔
395

396
        let check_node = |id| match valid_nodes {
1,511,138✔
397
            Some(bs) => bs.contains(id),
1,449,972✔
398
            None => true,
61,166✔
399
        };
1,511,138✔
400

401
        for i in 0..4 {
3,687,805✔
402
            if exts.has_ext(Dir::Left, i) {
2,950,244✔
403
                match self.find_link(l_kmer.extend_left(i), Dir::Left) {
755,458✔
404
                    Some((target, _, _)) if check_node(target) => {
755,458✔
405
                        new_exts = new_exts.set(Dir::Left, i)
755,458✔
406
                    }
407
                    _ => (),
×
408
                }
409
            }
2,194,786✔
410

411
            if exts.has_ext(Dir::Right, i) {
2,950,244✔
412
                match self.find_link(r_kmer.extend_right(i), Dir::Right) {
755,680✔
413
                    Some((target, _, _)) if check_node(target) => {
755,680✔
414
                        new_exts = new_exts.set(Dir::Right, i)
755,680✔
415
                    }
416
                    _ => (),
×
417
                }
418
            }
2,194,564✔
419
        }
420

421
        new_exts
737,561✔
422
    }
737,561✔
423

424
    /// Find the highest-scoring, unambiguous path in the graph. Each node get a score
425
    /// given by `score`. Any node where `solid_path(node) == True` are valid paths -
426
    /// paths will be terminated if there are multiple valid paths emanating from a node.
427
    pub fn max_path<F, F2>(&self, score: F, solid_path: F2) -> Vec<(usize, Dir)>
×
428
    where
×
429
        F: Fn(&D) -> f32,
×
430
        F2: Fn(&D) -> bool,
×
431
    {
432
        if self.is_empty() {
×
433
            return Vec::default();
×
434
        }
×
435

436
        let mut best_node = 0;
×
437
        let mut best_score = f32::MIN;
×
438
        for i in 0..self.len() {
×
439
            let node = self.get_node(i);
×
440
            let node_score = score(node.data());
×
441

442
            if node_score > best_score {
×
443
                best_node = i;
×
444
                best_score = node_score;
×
445
            }
×
446
        }
447

448
        let oscore = |state| match state {
×
449
            None => 0.0,
×
450
            Some((id, _)) => score(self.get_node(id).data()),
×
451
        };
×
452

453
        let osolid_path = |state| match state {
×
454
            None => false,
×
455
            Some((id, _)) => solid_path(self.get_node(id).data()),
×
456
        };
×
457

458
        // Now expand in each direction, greedily taking the best path. Stop if we hit a node we've
459
        // already put into the path
460
        let mut used_nodes = HashSet::new();
×
461
        let mut path = VecDeque::new();
×
462

463
        // Start w/ initial state
464
        used_nodes.insert(best_node);
×
465
        path.push_front((best_node, Dir::Left));
×
466

467
        for init in [(best_node, Dir::Left, false), (best_node, Dir::Right, true)].iter() {
×
468
            let &(start_node, dir, do_flip) = init;
×
469
            let mut current = (start_node, dir);
×
470

471
            loop {
472
                let mut next = None;
×
473
                let (cur_id, incoming_dir) = current;
×
474
                let node = self.get_node(cur_id);
×
475
                let edges = node.edges(incoming_dir.flip());
×
476

477
                let mut solid_paths = 0;
×
478
                for (_, id, dir, _) in edges {
×
479
                    let cand = Some((id, dir));
×
480
                    if osolid_path(cand) {
×
481
                        solid_paths += 1;
×
482
                    }
×
483

484
                    if oscore(cand) > oscore(next) {
×
485
                        next = cand;
×
486
                    }
×
487
                }
488

489
                if solid_paths > 1 {
×
490
                    break;
×
491
                }
×
492

493
                match next {
×
494
                    Some((next_id, next_incoming)) if !used_nodes.contains(&next_id) => {
×
495
                        if do_flip {
×
496
                            path.push_front((next_id, next_incoming.flip()));
×
497
                        } else {
×
498
                            path.push_back((next_id, next_incoming));
×
499
                        }
×
500

501
                        used_nodes.insert(next_id);
×
502
                        current = (next_id, next_incoming);
×
503
                    }
504
                    _ => break,
×
505
                }
506
            }
507
        }
508

509
        Vec::from_iter(path)
×
510
    }
×
511

512

513
    /// Find the highest-scoring, unambiguous path in the graph. Each node get a score
514
    /// given by `score`. Any node where `solid_path(node) == True` are valid paths -
515
    /// paths will be terminated if there are multiple valid paths emanating from a node.
516
    /// Returns vec with path for each component
517
    pub fn max_path_comp<F, F2>(&self, score: F, solid_path: F2) -> Vec<VecDeque<(usize, Dir)>>
1✔
518
    where
1✔
519
        F: Fn(&D) -> f32,
1✔
520
        F2: Fn(&D) -> bool,
1✔
521
    {
522
        if self.is_empty() {
1✔
523
            let vec: Vec<VecDeque<(usize, Dir)>> = Vec::new();
×
524
            return vec;
×
525
        }
1✔
526

527
        let components = self.iter_components();
1✔
528
        let mut paths: Vec<VecDeque<(usize, Dir)>> = Vec::new();
1✔
529

530
        for component in components {
3✔
531

532
            let current_comp = &component;
2✔
533
            
534

535
            let mut best_node = current_comp[0];
2✔
536
            let mut best_score = f32::MIN;
2✔
537
            for c in current_comp.iter() {
138✔
538
                let node = self.get_node(*c);
138✔
539
                let node_score = score(node.data());
138✔
540

541
                if node_score > best_score {
138✔
542
                    best_node = *c;
2✔
543
                    best_score = node_score;
2✔
544
                }
136✔
545
            }
546

547
            let oscore = |state| match state {
254✔
548
                None => 0.0,
127✔
549
                Some((id, _)) => score(self.get_node(id).data()),
127✔
550
            };
254✔
551

552
            let osolid_path = |state| match state {
127✔
553
                None => false,
×
554
                Some((id, _)) => solid_path(self.get_node(id).data()),
127✔
555
            };
127✔
556

557
            // Now expand in each direction, greedily taking the best path. Stop if we hit a node we've
558
            // already put into the path
559
            let mut used_nodes = HashSet::new();
2✔
560
            let mut path = VecDeque::new();
2✔
561

562
            // Start w/ initial state
563
            used_nodes.insert(best_node);
2✔
564
            path.push_front((best_node, Dir::Left));
2✔
565

566
            for init in [(best_node, Dir::Left, false), (best_node, Dir::Right, true)].iter() {
4✔
567
                let &(start_node, dir, do_flip) = init;
4✔
568
                let mut current = (start_node, dir);
4✔
569

570
                loop {
571
                    let mut next = None;
131✔
572
                    let (cur_id, incoming_dir) = current;
131✔
573
                    let node = self.get_node(cur_id);
131✔
574
                    let edges = node.edges(incoming_dir.flip());
131✔
575

576
                    let mut solid_paths = 0;
131✔
577
                    for (_, id, dir, _) in edges {
258✔
578
                        let cand = Some((id, dir));
127✔
579
                        if osolid_path(cand) {
127✔
580
                            solid_paths += 1;
127✔
581
                        }
127✔
582

583
                        if oscore(cand) > oscore(next) {
127✔
584
                            next = cand;
127✔
585
                        }
127✔
586
                    }
587

588
                    if solid_paths > 1 {
131✔
UNCOV
589
                        break;
×
590
                    }
131✔
591

592
                    match next {
127✔
593
                        Some((next_id, next_incoming)) if !used_nodes.contains(&next_id) => {
127✔
594
                            if do_flip {
127✔
595
                                path.push_front((next_id, next_incoming.flip()));
69✔
596
                            } else {
69✔
597
                                path.push_back((next_id, next_incoming));
58✔
598
                            }
58✔
599

600
                            used_nodes.insert(next_id);
127✔
601
                            current = (next_id, next_incoming);
127✔
602
                        }
603
                        _ => break,
4✔
604
                    }
605
                }
606
            }
607
            
608
            paths.push(path);
2✔
609
            //paths.push(Vec::from_iter(path));
610
        }
611

612
        paths
1✔
613
    
614
    }
1✔
615

616
    pub fn iter_max_path_comp<F, F2>(&self, score: F, solid_path: F2) -> PathCompIter<K, D, F, F2> 
3✔
617
    where 
3✔
618
    F: Fn(&D) -> f32,
3✔
619
    F2: Fn(&D) -> bool
3✔
620
    {
621
        let component_iterator = self.iter_components();
3✔
622
        PathCompIter { graph: self, component_iterator, graph_pos: 0, score, solid_path }
3✔
623
    }
3✔
624

625
    /// write the paths from `iter_max_path_comp` to a fasta file
626
    pub fn path_to_fasta<F, F2>(&self, f: &mut dyn std::io::Write, path_iter: PathCompIter<K, D, F, F2>, return_lens: bool) -> (Vec<usize>, Vec<usize>)
1✔
627
    where 
1✔
628
    F: Fn(&D) -> f32,
1✔
629
    F2: Fn(&D) -> bool
1✔
630
    {
631
        // width of fasta lines
632
        let columns = 80;
1✔
633

634
        // sizes of components and of paths
635
        let mut comp_sizes = Vec::new();
1✔
636
        let mut path_lens = Vec::new();
1✔
637

638
        for (seq_counter, (component, path)) in path_iter.enumerate() {
2✔
639
            // get dna sequence from path
640
            let seq = self.sequence_of_path(path.iter());
2✔
641

642
            //write header with length & start node
643
            writeln!(f, ">path{} len={} start_node={}", seq_counter, seq.len(), path[0].0).unwrap();
2✔
644

645
            // calculate how sequence has to be split up
646
            let slices = (seq.len() / columns) + 1;
2✔
647
            let mut ranges = Vec::with_capacity(slices);
2✔
648

649
            let mut start = 0;
2✔
650
            while start < seq.len() {
6✔
651
                ranges.push(start..start + columns);
4✔
652
                start += columns;
4✔
653
            }
4✔
654

655
            let last_start = ranges.pop().expect("no kmers in parallel ranges").start;
2✔
656
            ranges.push(last_start..seq.len());
2✔
657

658
            // split up sequence and write to file accordingly
659
            for range in ranges {
6✔
660
                writeln!(f, "{:?}", seq.slice(range.start, range.end)).unwrap();
4✔
661
            }
4✔
662

663
            if return_lens {
2✔
664
                comp_sizes.push(component.len());
×
665
                path_lens.push(path.len());
×
666
            }
2✔
667
        }    
668

669
        (comp_sizes, path_lens)
1✔
670
        
671
    }
1✔
672

673

674
    /// Get the sequence of a path through the graph. The path is given as a sequence of node_id integers
675
    pub fn sequence_of_path<'a, I: 'a + Iterator<Item = &'a (usize, Dir)>>(
21,448✔
676
        &self,
21,448✔
677
        path: I,
21,448✔
678
    ) -> DnaString {
21,448✔
679
        let mut seq = DnaString::new();
21,448✔
680

681
        for (idx, &(node_id, dir)) in path.enumerate() {
716,368✔
682
            let start = if idx == 0 { 0 } else { K::k() - 1 };
716,368✔
683

684
            let node_seq = match dir {
716,368✔
685
                Dir::Left => self.get_node(node_id).sequence(),
372,988✔
686
                Dir::Right => self.get_node(node_id).sequence().rc(),
343,380✔
687
            };
688

689
            for p in start..node_seq.len() {
1,433,115✔
690
                seq.push(node_seq.get(p))
1,433,115✔
691
            }
692
        }
693

694
        seq
21,448✔
695
    }
21,448✔
696

697
    /// write a node to a dot file
698
    /// 
699
    /// ### Arguments: 
700
    /// 
701
    /// * `node`: [`Node<K, D>`] which will be written to a dot file
702
    /// * `node_label`: closure taking [`Node<K, D>`] and returning a string containing commands for dot nodes 
703
    /// * `edge_label`: closure taking [`Node<K, D>`], the base as a [`u8`], the incoming [`Dir`] of the edge 
704
    ///    and if the neighbor is flipped - returns a string containing commands for dot edges, 
705
    /// * `f`: writer
706
    fn node_to_dot<FN: Fn(&Node<K, D>) -> String, FE: Fn(&Node<K, D>, u8, Dir, bool) -> String>(
97,978✔
707
        &self,
97,978✔
708
        node: &Node<'_, K, D>,
97,978✔
709
        node_label: &FN,
97,978✔
710
        edge_label: &FE,
97,978✔
711
        f: &mut dyn Write,
97,978✔
712
    ) {
97,978✔
713
        writeln!(f, "n{} {}", node.node_id, node_label(node)).unwrap();
97,978✔
714
        assert_eq!(node.exts().val.count_ones() as usize, node.l_edges().len() + node.r_edges().len());
97,978✔
715

716
        for (base, id, incoming_dir, flipped) in node.l_edges() {
97,978✔
717
            writeln!(f, "n{} -> n{} {}", id, node.node_id, edge_label(node, base, incoming_dir, flipped)).unwrap();
97,591✔
718
        }
97,591✔
719

720
        for (base, id, incoming_dir, flipped) in node.r_edges() {
97,978✔
721
            writeln!(f, "n{} -> n{} {}", node.node_id, id, edge_label(node, base, incoming_dir, flipped)).unwrap();
97,165✔
722
        }
97,165✔
723
    }
97,978✔
724

725
    /// Write the graph to a dot file
726
    /// 
727
    /// ### Arguments: 
728
    /// 
729
    /// * `path`: path to the output file
730
    /// * `node_label`: closure taking [`Node<K, D>`] and returning a string containing commands for dot nodes, e.g. [`Node::node_dot_default`]
731
    /// * `edge_label`: closure taking [`Node<K, D>`], the base as a [`u8`], the incoming [`Dir`] of the edge, e.g. [`Node::edge_dot_default`]
732
    ///    and if the neighbor is flipped - returns a string containing commands for dot edges, 
733
    pub fn to_dot<P, FN, FE>(&self, path: P, node_label: &FN, edge_label: &FE) 
1✔
734
    where 
1✔
735
    P: AsRef<Path>,
1✔
736
    FN: Fn(&Node<K, D>) -> String,
1✔
737
    FE: Fn(&Node<K, D>, u8, Dir, bool) -> String,
1✔
738
    {
739
        let mut f = BufWriter::with_capacity(BUF, File::create(path).expect("error creating dot file"));
1✔
740

741
        let pb = ProgressBar::new(self.len() as u64);
1✔
742
        pb.set_style(ProgressStyle::with_template(PROGRESS_STYLE).unwrap().progress_chars("#/-"));
1✔
743
        pb.set_message(format!("{:<32}", "writing graph to DOT file"));
1✔
744

745
        writeln!(&mut f, "digraph {{\nrankdir=\"LR\"\nmodel=subset\noverlap=scalexy").unwrap();
1✔
746
        for i in (0..self.len()).progress_with(pb) {
32,658✔
747
            self.node_to_dot(&self.get_node(i), node_label, edge_label, &mut f);
32,658✔
748
        }
32,658✔
749
        writeln!(&mut f, "}}").unwrap();
1✔
750
        
751
        f.flush().unwrap();
1✔
752
        debug!("large to dot loop: {}", self.len());
1✔
753
    }
1✔
754

755
    /// Write the graph to a dot file, highlight the nodes which form the 
756
    /// "best" path, according to [`PathCompIter`], with the number of occurences 
757
    /// as the score and `solid_path` always `true`.
758
    /// The nodes are formatted according to [`Node::node_dot_default`].
759
    /// 
760
    /// ### Arguments: 
761
    /// 
762
    /// * `path`: path to the output file
763
    /// * `edge_label`: closure taking [`Node<K, D>`], the base as a [`u8`], the incoming [`Dir`] of the edge, e.g. [`Node::edge_dot_default`]
764
    ///    and if the neighbor is flipped - returns a string containing commands for dot edges, 
765
    /// * `colors`: a [`Colors`] with the color settings for the graph
766
    /// * `translator`: a [`Translator`] which translates tags or IDs to strings
767
    /// * `config`: a [`SummaryConfig`] which contains settings for the graph
768
    pub fn to_dot_with_path<P, FE, DI>(&self, path: P, edge_label: &FE, colors: &Colors<'_, D, DI>, translator: &Translator, config: &SummaryConfig)
1✔
769
    where 
1✔
770
    P: AsRef<Path>,
1✔
771
    D: SummaryData<DI>,
1✔
772
    FE: Fn(&Node<K, D>, u8, Dir, bool) -> String,
1✔
773
    {
774
        let mut f = BufWriter::with_capacity(BUF, File::create(path).expect("error creating dot file"));
1✔
775

776
        writeln!(&mut f, "digraph {{\nrankdir=\"LR\"\nmodel=subset\noverlap=scalexy").unwrap();
1✔
777

778
        // iterate over components
779
        for (component, path) in self.iter_max_path_comp(|d| d.sum().unwrap_or(1) as f32, |_| true) {
62,565✔
780
            let hashed_path = path.into_iter().map(|(id, _)| id).collect::<HashSet<usize>>();
233✔
781
            for node_id in component {
32,891✔
782
                self.node_to_dot(
32,658✔
783
                    &self.get_node(node_id),
32,658✔
784
                    &|node| node.node_dot_default(colors, config, translator, hashed_path.contains(&node_id)), 
32,658✔
785
                    edge_label, 
32,658✔
786
                    &mut f
32,658✔
787
                );
788
            }
789
        }
790

791
        writeln!(&mut f, "}}").unwrap();
1✔
792
        
793
        f.flush().unwrap();
1✔
794
        debug!("large to dot loop: {}", self.len());
1✔
795
    }
1✔
796

797
    /// Write the graph to a dot file in parallel
798
    /// Will write in to n_threads files simultaniously,
799
    /// then go though the files and add the contents to a larger file, 
800
    /// and delete the small files.
801
    /// 
802
    /// ### Arguments: 
803
    /// 
804
    /// * `path`: path to the output file
805
    /// * `node_label`: closure taking [`Node<K, D>`] and returning a string containing commands for dot nodes 
806
    /// * `edge_label`: closure taking [`Node<K, D>`], the base as a [`u8`], the incoming [`Dir`] of the edge 
807
    ///    and if the neighbor is flipped - returns a string containing commands for dot edges, 
808
    pub fn to_dot_parallel<P, FN, FE>(&self, path: P, node_label: &FN, edge_label: &FE) 
1✔
809
    where 
1✔
810
        D: Sync,
1✔
811
        K: Sync,
1✔
812
        P: AsRef<Path> + Display + Sync,
1✔
813
        FN: Fn(&Node<K, D>) -> String + Sync,
1✔
814
        FE: Fn(&Node<K, D>, u8, Dir, bool) -> String + Sync,
1✔
815
    {        
816
        let slices = current_num_threads();
1✔
817
        let n_nodes = self.len();
1✔
818
        let sz = n_nodes / slices + 1;
1✔
819

820
        debug!("n_nodes: {}", n_nodes);
1✔
821
        debug!("sz: {}", sz);
1✔
822

823
        let mut parallel_ranges = Vec::with_capacity(slices);
1✔
824
        let mut start = 0;
1✔
825
        while start < n_nodes {
5✔
826
            parallel_ranges.push(start..start + sz);
4✔
827
            start += sz;
4✔
828
        }
4✔
829

830
        let last_start = parallel_ranges.pop().expect("no kmers in parallel ranges").start;
1✔
831
        parallel_ranges.push(last_start..n_nodes);
1✔
832
        debug!("parallel ranges: {:?}", parallel_ranges);
1✔
833

834
        let mut files = Vec::with_capacity(current_num_threads());
1✔
835

836
        for i in 0..parallel_ranges.len() {
4✔
837
            files.push(format!("{}-{}.dot", path, i));
4✔
838
        } 
4✔
839

840
        let pb = ProgressBar::new(self.len() as u64);
1✔
841
        pb.set_style(ProgressStyle::with_template(PROGRESS_STYLE).unwrap().progress_chars("#/-"));
1✔
842
        pb.set_message(format!("{:<32}", "writing graph to DOT files"));
1✔
843
    
844
        parallel_ranges.into_par_iter().enumerate().for_each(|(i, range)| {
4✔
845
            let mut f = BufWriter::with_capacity(BUF, File::create(&files[i]).expect("error creating parallel dot file"));
4✔
846

847
            for i in range {
32,662✔
848
                self.node_to_dot(&self.get_node(i), node_label, edge_label, &mut f);
32,658✔
849
                pb.inc(1);
32,658✔
850
            }
32,658✔
851

852
            f.flush().unwrap();
4✔
853
        });
4✔
854
        pb.finish_and_clear();
1✔
855

856
        let mut out_file = BufWriter::with_capacity(BUF, File::create(path).expect("error creating combined dot file"));
1✔
857

858
        writeln!(&mut out_file, "digraph {{\nrankdir=\"LR\"\nmodel=subset\noverlap=scalexy").unwrap();
1✔
859

860
        let pb = ProgressBar::new(files.len() as u64);
1✔
861
        pb.set_style(ProgressStyle::with_template(PROGRESS_STYLE).unwrap().progress_chars("#/-"));
1✔
862
        pb.set_message(format!("{:<32}", "combining files"));
1✔
863

864
        for file in files.iter().progress_with(pb) {
4✔
865
            let open_file = File::open(file).expect("error opening parallel dot file");
4✔
866
            let mut reader = BufReader::new(open_file);
4✔
867
            let mut buffer = [0; BUF];
4✔
868

869
            loop {
870
                let linecount = reader.read(&mut buffer).unwrap();
219✔
871
                if linecount == 0 { break }
219✔
872
                out_file.write_all(&buffer[..linecount]).unwrap();
215✔
873
            }
874

875
            remove_file(file).unwrap();
4✔
876
        }
877

878
        writeln!(&mut out_file, "}}").unwrap();
1✔
879

880
        out_file.flush().unwrap();
1✔
881

882

883
    }
1✔
884

885

886
    /// Write part of the graph to a dot file
887
    /// 
888
    /// ### Arguments: 
889
    /// 
890
    /// * `path`: path to the output file
891
    /// * `node_label`: closure taking [`Node<K, D>`] and returning a string containing commands for dot nodes 
892
    /// * `edge_label`: closure taking [`Node<K, D>`], the base as a [`u8`], the incoming [`Dir`] of the edge 
893
    ///    and if the neighbor is flipped - returns a string containing commands for dot edges, 
894
    /// * `nodes`: [`Vec<usize>`] listing all IDs of nodes which should be included
895
    pub fn to_dot_partial<P, FN, FE>(&self, path: P, node_label: &FN, edge_label: &FE, nodes: Vec<usize>) 
1✔
896
    where 
1✔
897
        P: AsRef<Path>,
1✔
898
        FN: Fn(&Node<K, D>) -> String,
1✔
899
        FE: Fn(&Node<K, D>, u8, Dir, bool) -> String,
1✔
900
    {
901
        let mut f = BufWriter::with_capacity(BUF, File::create(path).expect("error creating dot file"));
1✔
902

903
        let pb = ProgressBar::new(nodes.len() as u64);
1✔
904
        pb.set_style(ProgressStyle::with_template(PROGRESS_STYLE).unwrap().progress_chars("#/-"));
1✔
905
        pb.set_message(format!("{:<32}", "writing graph to DOT file"));
1✔
906

907
        writeln!(&mut f, "# {:?}", nodes).unwrap();
1✔
908
        writeln!(&mut f, "digraph {{\nrankdir=\"LR\"\nmodel=subset\noverlap=scalexy").unwrap();
1✔
909
        for i in nodes.into_iter().progress_with(pb) {
4✔
910
            self.node_to_dot(&self.get_node(i), node_label, edge_label, &mut f);
4✔
911
        }
4✔
912
        writeln!(&mut f, "}}").unwrap();
1✔
913

914
        f.flush().unwrap();
1✔
915

916
        debug!("large to dot loop: {}", self.len());
1✔
917
    }
1✔
918

919
    fn node_to_gfa<F: Fn(&Node<'_, K, D>) -> String>(
97,978✔
920
        &self,
97,978✔
921
        node: &Node<'_, K, D>,
97,978✔
922
        w: &mut dyn Write,
97,978✔
923
        tag_func: Option<&F>,
97,978✔
924
    ) -> Result<(), Error> {
97,978✔
925
        match tag_func {
97,978✔
926
            Some(f) => {
65,320✔
927
                let tags = (f)(node);
65,320✔
928
                writeln!(
65,320✔
929
                    w,
65,320✔
930
                    "S\t{}\t{}\t{}",
65,320✔
931
                    node.node_id,
932
                    node.sequence().to_dna_string(),
65,320✔
933
                    tags
UNCOV
934
                )?;
×
935
            }
936
            _ => writeln!(
32,658✔
937
                w,
32,658✔
938
                "S\t{}\t{}",
32,658✔
939
                node.node_id,
940
                node.sequence().to_dna_string()
32,658✔
UNCOV
941
            )?,
×
942
        }
943

944
        for (_, target, dir, _) in node.l_edges() {
97,978✔
945
            if target >= node.node_id {
97,591✔
946
                let to_dir = match dir {
48,748✔
947
                    Dir::Left => "+",
19,433✔
948
                    Dir::Right => "-",
29,315✔
949
                };
950
                writeln!(
48,748✔
951
                    w,
48,748✔
952
                    "L\t{}\t-\t{}\t{}\t{}M",
48,748✔
953
                    node.node_id,
954
                    target,
955
                    to_dir,
956
                    K::k() - 1
48,748✔
UNCOV
957
                )?;
×
958
            }
48,843✔
959
        }
960

961
        for (_, target, dir, _) in node.r_edges() {
97,978✔
962
            if target > node.node_id {
97,165✔
963
                let to_dir = match dir {
48,634✔
964
                    Dir::Left => "+",
29,415✔
965
                    Dir::Right => "-",
19,219✔
966
                };
967
                writeln!(
48,634✔
968
                    w,
48,634✔
969
                    "L\t{}\t+\t{}\t{}\t{}M",
48,634✔
970
                    node.node_id,
971
                    target,
972
                    to_dir,
973
                    K::k() - 1
48,634✔
UNCOV
974
                )?;
×
975
            }
48,531✔
976
        }
977

978
        Ok(())
97,978✔
979
    }
97,978✔
980

981
    /// Write the graph to GFA format
982
    pub fn to_gfa<P: AsRef<Path>>(&self, gfa_out: P) -> Result<(), Error> {
1✔
983
        let wtr = BufWriter::with_capacity(BUF, File::create(gfa_out).expect("error creating gfa file"));
1✔
984
        self.write_gfa(&mut std::io::BufWriter::new(wtr))
1✔
985
    }
1✔
986

987
    pub fn write_gfa(&self, wtr: &mut impl Write) -> Result<(), Error> {
1✔
988
        writeln!(wtr, "H\tVN:Z:debruijn-rs")?;
1✔
989

990
        type DummyFn<K, D> = fn(&Node<'_, K, D>) -> String;
991
        let dummy_opt: Option<&DummyFn<K, D>> = None;
1✔
992

993
        let pb = ProgressBar::new(self.len() as u64);
1✔
994
        pb.set_style(ProgressStyle::with_template(PROGRESS_STYLE).unwrap().progress_chars("#/-"));
1✔
995
        pb.set_message(format!("{:<32}", "writing graph to GFA file"));
1✔
996

997
        for i in (0..self.len()).progress_with(pb) {
32,658✔
998
            let n = self.get_node(i);
32,658✔
999
            self.node_to_gfa(&n, wtr, dummy_opt)?;
32,658✔
1000
        }
1001

1002
        wtr.flush().unwrap();
1✔
1003

1004
        Ok(())
1✔
1005
    }
1✔
1006

1007
    /// Write the graph to GFA format
1008
    pub fn to_gfa_with_tags<P: AsRef<Path>, F: Fn(&Node<'_, K, D>) -> String>(
1✔
1009
        &self,
1✔
1010
        gfa_out: P,
1✔
1011
        tag_func: F,
1✔
1012
    ) -> Result<(), Error> {
1✔
1013
        let mut wtr = BufWriter::with_capacity(BUF, File::create(gfa_out).expect("error creatinf gfa file"));
1✔
1014
        writeln!(wtr, "H\tVN:Z:debruijn-rs")?;
1✔
1015

1016
        let pb = ProgressBar::new(self.len() as u64);
1✔
1017
        pb.set_style(ProgressStyle::with_template(PROGRESS_STYLE).unwrap().progress_chars("#/-"));
1✔
1018
        pb.set_message(format!("{:<32}", "writing graph to GFA file"));
1✔
1019

1020
        for i in (0..self.len()).progress_with(pb) {
32,658✔
1021
            let n = self.get_node(i);
32,658✔
1022
            self.node_to_gfa(&n, &mut wtr, Some(&tag_func))?;
32,658✔
1023
        }
1024

1025
        wtr.flush().unwrap();
1✔
1026

1027
        Ok(())
1✔
1028
    }
1✔
1029

1030
    /// Write the graph to GFA format, with multithreading, 
1031
    /// pass `tag_func=None` to write without tags
1032
    pub fn to_gfa_otags_parallel<P: AsRef<Path> + Display + Sync, F: Fn(&Node<'_, K, D>) -> String + Sync>(
1✔
1033
        &self,
1✔
1034
        gfa_out: P,
1✔
1035
        tag_func: Option<&F>,
1✔
1036
    ) -> Result<(), Error> 
1✔
1037
    where 
1✔
1038
    K: Sync,
1✔
1039
    D: Sync,
1✔
1040
    {
1041
        // split into ranges according to thread count
1042
        let slices = current_num_threads();
1✔
1043
        let n_nodes = self.len();
1✔
1044
        let sz = n_nodes / slices + 1;
1✔
1045

1046
        debug!("n_nodes: {}", n_nodes);
1✔
1047
        debug!("sz: {}", sz);
1✔
1048

1049
        let mut parallel_ranges = Vec::with_capacity(slices);
1✔
1050
        let mut start = 0;
1✔
1051
        while start < n_nodes {
5✔
1052
            parallel_ranges.push(start..start + sz);
4✔
1053
            start += sz;
4✔
1054
        }
4✔
1055

1056
        let last_start = parallel_ranges.pop().expect("no kmers in parallel ranges").start;
1✔
1057
        parallel_ranges.push(last_start..n_nodes);
1✔
1058
        debug!("parallel ranges: {:?}", parallel_ranges);
1✔
1059

1060
        let mut files = Vec::with_capacity(current_num_threads());
1✔
1061

1062
        for i in 0..parallel_ranges.len() {
4✔
1063
            files.push(format!("{}-{}.gfa", gfa_out, i));
4✔
1064
        } 
4✔
1065

1066
        let pb = ProgressBar::new(self.len() as u64);
1✔
1067
        pb.set_style(ProgressStyle::with_template(PROGRESS_STYLE).unwrap().progress_chars("#/-"));
1✔
1068
        pb.set_message(format!("{:<32}", "writing graph to GFA file"));
1✔
1069
        
1070
        
1071
        parallel_ranges.into_par_iter().enumerate().for_each(|(i, range)| {
4✔
1072
            let mut wtr = BufWriter::with_capacity(BUF, File::create(&files[i]).expect("error creating parallel gfa file"));
4✔
1073

1074
            for i in range {
32,662✔
1075
                let n = self.get_node(i);
32,658✔
1076
                self.node_to_gfa(&n, &mut wtr, tag_func).unwrap();
32,658✔
1077
                pb.inc(1);
32,658✔
1078
            }
32,658✔
1079

1080
            wtr.flush().unwrap();
4✔
1081
        });
4✔
1082

1083
        pb.finish_and_clear();
1✔
1084

1085
        // combine files
1086
        let mut out_file = BufWriter::with_capacity(BUF, File::create(format!("{}.gfa", gfa_out)).expect("error creating combined gfa file"));
1✔
1087
        writeln!(out_file, "H\tVN:Z:debruijn-rs")?;
1✔
1088

1089
        let pb = ProgressBar::new(files.len() as u64);
1✔
1090
        pb.set_style(ProgressStyle::with_template(PROGRESS_STYLE).unwrap().progress_chars("#/-"));
1✔
1091
        pb.set_message(format!("{:<32}", "combining files"));
1✔
1092

1093
        for file in files.iter() {
4✔
1094
            let open_file = File::open(file).expect("error opening parallel gfa file");
4✔
1095
            let mut reader = BufReader::new(open_file);
4✔
1096
            let mut buffer = [0; BUF];
4✔
1097

1098
            loop {
1099
                let linecount = reader.read(&mut buffer).unwrap();
113✔
1100
                if linecount == 0 { break }
113✔
1101
                out_file.write_all(&buffer[..linecount]).unwrap();
109✔
1102
            }
1103

1104
            remove_file(file).unwrap();
4✔
1105
        }
1106

1107
        out_file.flush().unwrap();
1✔
1108

1109
        Ok(())
1✔
1110
    }
1✔
1111

1112
    /// Write the graph to GFA format
1113
    pub fn to_gfa_partial<P: AsRef<Path>, F: Fn(&Node<'_, K, D>) -> String>(&self, gfa_out: P, tag_func: Option<&F>, nodes: Vec<usize>) -> Result<(), Error> {
1✔
1114
        let mut wtr = BufWriter::with_capacity(BUF, File::create(gfa_out).expect("error creating gfa file"));
1✔
1115
        writeln!(wtr, "H\tVN:Z:debruijn-rs")?;
1✔
1116

1117
        let pb = ProgressBar::new(self.len() as u64);
1✔
1118
        pb.set_style(ProgressStyle::with_template(PROGRESS_STYLE).unwrap().progress_chars("#/-"));
1✔
1119
        pb.set_message(format!("{:<32}", "writing graph to GFA file"));
1✔
1120

1121
        for i in nodes.into_iter().progress_with(pb) {
4✔
1122
            let n = self.get_node(i);
4✔
1123
            self.node_to_gfa(&n, &mut wtr, tag_func)?;
4✔
1124
        }
1125

1126
        wtr.flush().unwrap();
1✔
1127

1128
        Ok(())    
1✔
1129
    }
1✔
1130

1131
    pub fn to_json_rest<W: Write, F: Fn(&D) -> Value>(
×
1132
        &self,
×
1133
        fmt_func: F,
×
1134
        mut writer: &mut W,
×
1135
        rest: Option<Value>,
×
1136
    ) {
×
1137
        writeln!(writer, "{{\n\"nodes\": [").unwrap();
×
1138
        for i in 0..self.len() {
×
1139
            let node = self.get_node(i);
×
1140
            node.to_json(&fmt_func, writer);
×
1141
            if i == self.len() - 1 {
×
1142
                writeln!(writer).unwrap();
×
1143
            } else {
×
1144
                writeln!(writer, ",").unwrap();
×
1145
            }
×
1146
        }
1147
        writeln!(writer, "],").unwrap();
×
1148

1149
        writeln!(writer, "\"links\": [").unwrap();
×
1150
        for i in 0..self.len() {
×
1151
            let node = self.get_node(i);
×
1152
            match node.edges_to_json(writer) {
×
1153
                true => {
1154
                    if i == self.len() - 1 {
×
1155
                        writeln!(writer).unwrap();
×
1156
                    } else {
×
1157
                        writeln!(writer, ",").unwrap();
×
1158
                    }
×
1159
                }
1160
                _ => continue,
×
1161
            }
1162
        }
1163
        writeln!(writer, "]").unwrap();
×
1164

1165
        match rest {
×
1166
            Some(Value::Object(v)) => {
×
1167
                for (k, v) in v.iter() {
×
1168
                    writeln!(writer, ",").expect("io error");
×
1169
                    write!(writer, "\"{}\": ", k).expect("io error");
×
1170
                    serde_json::to_writer(&mut writer, v).expect("io error");
×
1171
                    writeln!(writer).expect("io error");
×
1172
                }
×
1173
            }
1174
            _ => {
×
1175
                writeln!(writer).expect("io error");
×
1176
            }
×
1177
        }
1178

1179
        writeln!(writer, "}}").expect("io error");
×
1180
    }
×
1181

1182
    /// Write the graph to JSON
1183
    pub fn to_json<W: Write, F: Fn(&D) -> Value, RF: Fn(&mut W)>(
×
1184
        &self,
×
1185
        fmt_func: F,
×
1186
        writer: &mut W,
×
1187
    ) {
×
1188
        self.to_json_rest(fmt_func, writer, None);
×
1189
    }
×
1190

1191
    /// Print a text representation of the graph.
1192
    pub fn print(&self) {
2✔
1193
        println!("DebruijnGraph {{ len: {}, K: {} }} :", self.len(), K::k());
2✔
1194
        for node in self.iter_nodes() {
140✔
1195
            println!("{:?}", node);
140✔
1196
        }
140✔
1197
    }
2✔
1198

1199
    pub fn print_with_data(&self) {
×
1200
        println!("DebruijnGraph {{ len: {}, K: {} }} :", self.len(), K::k());
×
1201
        for node in self.iter_nodes() {
×
1202
            println!("{:?} ({:?})", node, node.data());
×
1203
        }
×
1204
    }
×
1205

1206
    pub fn max_path_beam<F, F2>(&self, beam: usize, score: F, _solid_path: F2) -> Vec<(usize, Dir)>
×
1207
    where
×
1208
        F: Fn(&D) -> f32,
×
1209
        F2: Fn(&D) -> bool,
×
1210
    {
1211
        if self.is_empty() {
×
1212
            return Vec::default();
×
1213
        }
×
1214

1215
        let mut states = Vec::new();
×
1216

1217
        for i in 0..self.len() {
×
1218
            let node = self.get_node(i);
×
1219

1220
            // Initialize beam search on terminal nodes
1221
            if node.exts().num_exts_l() == 0 || node.exts().num_exts_r() == 0 {
×
1222
                let dir = if node.exts().num_exts_l() > 0 {
×
1223
                    Dir::Right
×
1224
                } else {
1225
                    Dir::Left
×
1226
                };
1227

1228
                let status = if node.exts().num_exts_l() == 0 && node.exts().num_exts_r() == 0 {
×
1229
                    Status::End
×
1230
                } else {
1231
                    Status::Active
×
1232
                };
1233

1234
                let mut path = SmallVec8::new();
×
1235
                path.push((i as u32, dir));
×
1236

1237
                let s = State {
×
1238
                    path,
×
1239
                    status,
×
1240
                    score: score(node.data()),
×
1241
                };
×
1242
                states.push(s);
×
1243
            }
×
1244
        }
1245

1246
        // No end nodes -- just start on the first node
1247
        if states.is_empty() {
×
1248
            // Make a start
×
1249
            let node = self.get_node(0);
×
1250
            let mut path = SmallVec8::new();
×
1251
            path.push((0, Dir::Left));
×
1252
            states.push(State {
×
1253
                path,
×
1254
                status: Status::Active,
×
1255
                score: score(node.data()),
×
1256
            });
×
1257
        }
×
1258

1259
        // Beam search until we can't find any more expansions
1260
        let mut active = true;
×
1261
        while active {
×
1262
            let mut new_states = Vec::with_capacity(states.len());
×
1263
            active = false;
×
1264

1265
            for s in states {
×
1266
                if s.status == Status::Active {
×
1267
                    active = true;
×
1268
                    let expanded = self.expand_state(&s, &score);
×
1269
                    new_states.extend(expanded);
×
1270
                } else {
×
1271
                    new_states.push(s)
×
1272
                }
1273
            }
1274

1275
            // workaround to sort by descending score - will panic if there are NaN scores
1276
            new_states.sort_by(|a, b| (-(a.score)).partial_cmp(&-(b.score)).unwrap());
×
1277
            new_states.truncate(beam);
×
1278
            states = new_states;
×
1279
        }
1280

1281
        for (i, state) in states.iter().take(5).enumerate() {
×
1282
            trace!("i:{}  -- {:?}", i, state);
×
1283
        }
1284

1285
        // convert back to using usize for node_id
1286
        states[0]
×
1287
            .path
×
1288
            .iter()
×
1289
            .map(|&(node, dir)| (node as usize, dir))
×
1290
            .collect()
×
1291
    }
×
1292

1293
    fn expand_state<F>(&self, state: &State, score: &F) -> SmallVec4<State>
×
1294
    where
×
1295
        F: Fn(&D) -> f32,
×
1296
    {
1297
        if state.status != Status::Active {
×
1298
            panic!("only attempt to expand active states")
×
1299
        }
×
1300

1301
        let (node_id, dir) = state.path[state.path.len() - 1];
×
1302
        let node = self.get_node(node_id as usize);
×
1303
        let mut new_states = SmallVec4::new();
×
1304

1305
        for (_, next_node_id, incoming_dir, _) in node.edges(dir.flip()) {
×
1306
            let next_node = self.get_node(next_node_id);
×
1307
            let new_score = state.score + score(next_node.data());
×
1308

1309
            let cycle = state
×
1310
                .path
×
1311
                .iter()
×
1312
                .any(|&(prev_node, _)| prev_node == (next_node_id as u32));
×
1313

1314
            let status = if cycle {
×
1315
                Status::Cycle
×
1316
            } else if next_node.edges(incoming_dir.flip()).is_empty() {
×
1317
                Status::End
×
1318
            } else {
1319
                Status::Active
×
1320
            };
1321

1322
            let mut new_path = state.path.clone();
×
1323
            new_path.push((next_node_id as u32, incoming_dir));
×
1324

1325
            let next_state = State {
×
1326
                path: new_path,
×
1327
                score: new_score,
×
1328
                status,
×
1329
            };
×
1330

1331
            new_states.push(next_state);
×
1332
        }
1333

1334
        new_states
×
1335
    }
×
1336

1337

1338
    pub fn iter_components(&self) -> IterComponents<K, D> {
5✔
1339
        let mut visited: Vec<bool> = Vec::with_capacity(self.len());
5✔
1340
        let pos = 0;
5✔
1341

1342
        for _i in 0..self.len() {
33,210✔
1343
            visited.push(false);
33,210✔
1344
        }
33,210✔
1345

1346
        IterComponents { 
5✔
1347
            graph: self, 
5✔
1348
            visited, 
5✔
1349
            pos }
5✔
1350
    }
5✔
1351

1352

1353
    /// iteratively returns 2D Vec with node_ids grouped according to the connected components they form
1354
    pub fn components_i(&self) -> Vec<Vec<usize>> {
1✔
1355
        let mut components: Vec<Vec<usize>> = Vec::with_capacity(self.len());
1✔
1356
        let mut visited: Vec<bool> = Vec::with_capacity(self.len());
1✔
1357

1358
        for _i in 0..self.len() {
138✔
1359
            visited.push(false);
138✔
1360
        }
138✔
1361

1362
        for i in 0..self.len() {
138✔
1363
            if !visited[i] {
138✔
1364
                let comp = self.component_i(&mut visited, i);
2✔
1365
                components.push(comp);
2✔
1366
            }
136✔
1367
        }
1368

1369
        components
1✔
1370
    }
1✔
1371

1372
    /// recursively detects which nodes form separate graph components
1373
    /// returns 2D vector with node ids per component
1374
    /// (may lead to stack overflow)
1375
    pub fn components_r(&self) -> Vec<Vec<usize>> {
2✔
1376
        let mut components: Vec<Vec<usize>> = Vec::with_capacity(self.len());
2✔
1377
        let mut visited: Vec<bool> = Vec::with_capacity(self.len());
2✔
1378

1379
        for _i in 0..self.len() {
140✔
1380
            visited.push(false);
140✔
1381
        }
140✔
1382

1383
        for i in 0..self.len() {
140✔
1384
            if !visited[i] {
140✔
1385
                let mut comp: Vec<usize> = Vec::new();
4✔
1386
                self.component_r(&mut visited, i, &mut comp);
4✔
1387
                components.push(comp);
4✔
1388
            }
136✔
1389
        }
1390

1391
        components
2✔
1392

1393
    }
2✔
1394

1395
    fn component_r<'a>(&'a self, visited: &'a mut Vec<bool>, i: usize, comp: &'a mut Vec<usize>) {
140✔
1396
        
1397
        visited[i] = true;
140✔
1398
        comp.push(i);
140✔
1399
        let mut edges = self.find_edges(i, Dir::Left);
140✔
1400
        let mut r_edges = self.find_edges(i, Dir::Right);
140✔
1401

1402
        edges.append(&mut r_edges);
140✔
1403

1404
        for (_, edge, _, _) in edges.iter() {
272✔
1405
            if !visited[*edge] {
272✔
1406
                self.component_r(visited, *edge, comp);
136✔
1407
            }
136✔
1408
        }
1409
    }
140✔
1410

1411
    fn component_i<'a>(&'a self, visited: &'a mut [bool], i: usize) -> Vec<usize> {
243✔
1412
        let mut edges: Vec<usize> = Vec::new();
243✔
1413
        let mut comp: Vec<usize> = Vec::new();
243✔
1414

1415
        edges.push(i);
243✔
1416

1417
        while let Some(current_edge) = edges.pop() {
33,622✔
1418
            if !visited[current_edge] { 
33,379✔
1419
                comp.push(current_edge);
33,348✔
1420
                visited[current_edge] = true;
33,348✔
1421

1422
                let mut l_edges = self.find_edges(current_edge, Dir::Left);
33,348✔
1423
                let mut r_edges = self.find_edges(current_edge, Dir::Right);
33,348✔
1424

1425
                l_edges.append(&mut r_edges);
33,348✔
1426

1427
                for (_, new_edge, _, _) in l_edges.into_iter() {
66,276✔
1428
                    if !visited[new_edge] {
66,276✔
1429
                        edges.push(new_edge);
33,136✔
1430
                    }
33,140✔
1431
                }
1432
            }
31✔
1433
        }
1434
        comp
243✔
1435
    }
243✔
1436

1437
    /// iterate over all edges of the graph, item: (node, ext base, ext dir, target node)
1438
    pub fn iter_edges(&self) -> EdgeIter<'_, K, D> {
1✔
1439
        EdgeIter::new(self)
1✔
1440
    }
1✔
1441

1442
    pub fn find_bad_nodes<F: Fn(&Node<'_, K, D>) -> bool>(&self, valid: F) -> Vec<usize> {
×
1443
        let mut bad_nodes = Vec::new();
×
1444

1445
        for (i, node) in enumerate(self.iter_nodes()) {
×
1446
            if !valid(&node) { bad_nodes.push(i); }
×
1447
        }
1448
        
1449
        bad_nodes
×
1450
    }
×
1451
}
1452

1453
impl<K: Kmer, SD: Debug> DebruijnGraph<K, SD> {
1454
    pub fn create_colors<'a, 'b: 'a, DI>(&'a self, config: &SummaryConfig, color_mode: ColorMode<'b>) -> Colors<'b, SD, DI> 
×
1455
    where 
×
1456
    SD: SummaryData<DI>,
×
1457
    {
1458
        Colors::new(self, config, color_mode)
×
1459
    }
×
1460
    
1461
    /// [`crate::EdgeMult`] will contain hanging edges if the nodes were filtered
1462
    pub fn fix_edge_mults<DI>(&mut self) 
2✔
1463
    where 
2✔
1464
        SD: SummaryData<DI>
2✔
1465
    {
1466
        if self.get_node(0).data().edge_mults().is_some() {
2✔
1467
            for i in 0..self.len() {
276✔
1468
                self.base.data[i].fix_edge_mults(self.base.exts[i]);
276✔
1469
            }
276✔
UNCOV
1470
        }
×
1471
    }
2✔
1472

1473
    /// if there are [`crate::EdgeMult`]s in the data, prune the graph by removing edges that have a low coverage
1474
    pub fn filter_edges<DI>(&mut self, min: u32) -> Result<(), String>
1✔
1475
    where 
1✔
1476
        SD: SummaryData<DI>
1✔
1477
    {
1478
        // return if there is no edge coverage available
1479
        if self.get_node(0).data().edge_mults().is_none() { return Err(String::from("no edge mults available")) };
1✔
1480

1481
        for i in 0..self.len() {
138✔
1482
            let em = self.get_node(i).data().edge_mults().expect("shold have em").clone();
138✔
1483
            let edges = [(Dir::Left, 0), (Dir::Left, 1), (Dir::Left, 2), (Dir::Left, 3), 
138✔
1484
                (Dir::Right, 0), (Dir::Right, 1), (Dir::Right, 2), (Dir::Right, 3)];
138✔
1485
            
1486
            for (dir, base) in edges {
1,242✔
1487
                if min > em.edge_mult(base, dir) {
1,104✔
1488
                    // remove invalid ext from node
832✔
1489
                    let ext = self.base.exts[i].remove(dir, base);
832✔
1490
                    self.base.exts[i] = ext;
832✔
1491
                }
832✔
1492
            }
1493
        }
1494

1495
        // now that exts are remove, fix hanging edge mults
1496
        self.fix_edge_mults();
1✔
1497

1498
        Ok(())
1✔
1499
    }
1✔
1500
}
1501

1502
#[derive(Debug, Eq, PartialEq)]
1503
enum Status {
1504
    Active,
1505
    End,
1506
    Cycle,
1507
}
1508

1509
#[derive(Debug)]
1510
struct State {
1511
    path: SmallVec8<(u32, Dir)>,
1512
    score: f32,
1513
    status: Status,
1514
}
1515

1516
impl State {}
1517

1518
/// Iterator over nodes in a `DeBruijnGraph`
1519
pub struct NodeIter<'a, K: Kmer + 'a, D: Debug + 'a> {
1520
    graph: &'a DebruijnGraph<K, D>,
1521
    node_id: usize,
1522
}
1523

1524
impl<'a, K: Kmer + 'a, D: Debug + 'a> Iterator for NodeIter<'a, K, D> {
1525
    type Item = Node<'a, K, D>;
1526

1527
    fn next(&mut self) -> Option<Node<'a, K, D>> {
541,851✔
1528
        if self.node_id < self.graph.len() {
541,851✔
1529
            let node = self.graph.get_node(self.node_id);
541,722✔
1530
            self.node_id += 1;
541,722✔
1531
            Some(node)
541,722✔
1532
        } else {
1533
            None
129✔
1534
        }
1535
    }
541,851✔
1536
}
1537

1538
impl<'a, K: Kmer + 'a, D: Debug + 'a> IntoIterator for &'a DebruijnGraph<K, D> {
1539
    type Item = NodeKmer<'a, K, D>;
1540
    type IntoIter = NodeIntoIter<'a, K, D>;
1541

1542
    fn into_iter(self) -> Self::IntoIter {
2,223✔
1543
        NodeIntoIter {
2,223✔
1544
            graph: self,
2,223✔
1545
            node_id: 0,
2,223✔
1546
        }
2,223✔
1547
    }
2,223✔
1548
}
1549

1550
/// Iterator over nodes in a `DeBruijnGraph`
1551
pub struct NodeIntoIter<'a, K: Kmer + 'a, D: Debug + 'a> {
1552
    graph: &'a DebruijnGraph<K, D>,
1553
    node_id: usize,
1554
}
1555

1556
impl<'a, K: Kmer + 'a, D: Debug + 'a> Iterator for NodeIntoIter<'a, K, D> {
1557
    type Item = NodeKmer<'a, K, D>;
1558

1559
    fn next(&mut self) -> Option<Self::Item> {
263,890✔
1560
        if self.node_id < self.graph.len() {
263,890✔
1561
            let node_id = self.node_id;
263,890✔
1562
            let node = self.graph.get_node(node_id);
263,890✔
1563
            let node_seq = node.sequence();
263,890✔
1564

1565
            self.node_id += 1;
263,890✔
1566
            Some(NodeKmer {
263,890✔
1567
                node_id,
263,890✔
1568
                node_seq_slice: node_seq,
263,890✔
1569
                phantom_d: PhantomData,
263,890✔
1570
                phantom_k: PhantomData,
263,890✔
1571
            })
263,890✔
1572
        } else {
1573
            None
×
1574
        }
1575
    }
263,890✔
1576
}
1577

1578
/// A `DebruijnGraph` node with a reference to the sequence of the node.
1579
#[derive(Clone)]
1580
pub struct NodeKmer<'a, K: Kmer + 'a, D: Debug + 'a> {
1581
    pub node_id: usize,
1582
    node_seq_slice: DnaStringSlice<'a>,
1583
    phantom_k: PhantomData<K>,
1584
    phantom_d: PhantomData<D>,
1585
}
1586

1587
/// An iterator over the kmers in a `DeBruijn graph node`
1588
pub struct NodeKmerIter<'a, K: Kmer + 'a, D: Debug + 'a> {
1589
    kmer_id: usize,
1590
    kmer: K,
1591
    num_kmers: usize,
1592
    node_seq_slice: DnaStringSlice<'a>,
1593
    phantom_k: PhantomData<K>,
1594
    phantom_d: PhantomData<D>,
1595
}
1596

1597
impl<'a, K: Kmer + 'a, D: Debug + 'a> IntoIterator for NodeKmer<'a, K, D> {
1598
    type Item = K;
1599
    type IntoIter = NodeKmerIter<'a, K, D>;
1600

1601
    fn into_iter(self) -> Self::IntoIter {
527,780✔
1602
        let num_kmers = self.node_seq_slice.len() - K::k() + 1;
527,780✔
1603

1604
        let kmer = if num_kmers > 0 {
527,780✔
1605
            self.node_seq_slice.get_kmer::<K>(0)
527,780✔
1606
        } else {
1607
            K::empty()
×
1608
        };
1609

1610
        NodeKmerIter {
527,780✔
1611
            kmer_id: 0,
527,780✔
1612
            kmer,
527,780✔
1613
            num_kmers,
527,780✔
1614
            node_seq_slice: self.node_seq_slice,
527,780✔
1615
            phantom_k: PhantomData,
527,780✔
1616
            phantom_d: PhantomData,
527,780✔
1617
        }
527,780✔
1618
    }
527,780✔
1619
}
1620

1621
impl<'a, K: Kmer + 'a, D: Debug + 'a> Iterator for NodeKmerIter<'a, K, D> {
1622
    type Item = K;
1623

1624
    fn next(&mut self) -> Option<Self::Item> {
3,516,506✔
1625
        if self.num_kmers == self.kmer_id {
3,516,506✔
1626
            None
×
1627
        } else {
1628
            let current_kmer = self.kmer;
3,516,506✔
1629

1630
            self.kmer_id += 1;
3,516,506✔
1631
            if self.kmer_id < self.num_kmers {
3,516,506✔
1632
                let next_base = self.node_seq_slice.get(self.kmer_id + K::k() - 1);
3,448,582✔
1633
                let new_kmer = self.kmer.extend_right(next_base);
3,448,582✔
1634
                self.kmer = new_kmer;
3,448,582✔
1635
            }
3,448,582✔
1636

1637
            Some(current_kmer)
3,516,506✔
1638
        }
1639
    }
3,516,506✔
1640

1641
    fn size_hint(&self) -> (usize, Option<usize>) {
263,890✔
1642
        (self.num_kmers, Some(self.num_kmers))
263,890✔
1643
    }
263,890✔
1644

1645
    /// Provide a 'fast-forward' capability for this iterator
1646
    /// MPHF will use this to reduce the number of kmers that
1647
    /// need to be produced.
1648
    fn nth(&mut self, n: usize) -> Option<Self::Item> {
2,534,006✔
1649
        if n <= 4 {
2,534,006✔
1650
            // for small skips forward, shift one base at a time
1651
            for _ in 0..n {
2,279,298✔
1652
                self.next();
982,500✔
1653
            }
982,500✔
1654
        } else {
254,708✔
1655
            self.kmer_id += n;
254,708✔
1656
            self.kmer = self.node_seq_slice.get_kmer::<K>(self.kmer_id);
254,708✔
1657
        }
254,708✔
1658

1659
        self.next()
2,534,006✔
1660
    }
2,534,006✔
1661
}
1662

1663
/// Marker signifying that NodeKmerIter has a known size.
1664
impl<'a, K: Kmer + 'a, D: Debug + 'a> ExactSizeIterator for NodeKmerIter<'a, K, D> {}
1665

1666
/// Unbranched sequence in the DeBruijn graph
1667
pub struct Node<'a, K: Kmer + 'a, D: 'a> {
1668
    pub node_id: usize,
1669
    pub graph: &'a DebruijnGraph<K, D>,
1670
}
1671

1672
impl<'a, K: Kmer, D: Debug> Node<'a, K, D> {
1673
    /// Length of the sequence of this node
1674
    pub fn len(&self) -> usize {
1,645,928✔
1675
        self.graph.base.sequences.get(self.node_id).len()
1,645,928✔
1676
    }
1,645,928✔
1677

1678
    pub fn is_empty(&self) -> bool {
×
1679
        self.graph.base.sequences.get(self.node_id).is_empty()
×
1680
    }
×
1681

1682
    /// Sequence of the node
1683
    pub fn sequence(&self) -> DnaStringSlice<'a> {
3,407,993✔
1684
        self.graph.base.sequences.get(self.node_id)
3,407,993✔
1685
    }
3,407,993✔
1686

1687
    /// Reference to auxiliarly data associated with the node
1688
    pub fn data(&self) -> &'a D {
3,953,764✔
1689
        &self.graph.base.data[self.node_id]
3,953,764✔
1690
    }
3,953,764✔
1691

1692
    /// Extension bases from this node
1693
    pub fn exts(&self) -> Exts {
2,396,046✔
1694
        self.graph.base.exts[self.node_id]
2,396,046✔
1695
    }
2,396,046✔
1696

1697
    /// Edges leaving the left side of the node in the format
1698
    /// (base, target_node id, incoming side of target node, whether target node is flipped)
1699
    pub fn l_edges(&self) -> SmallVec4<(u8, usize, Dir, bool)> {
294,102✔
1700
        self.graph.find_edges(self.node_id, Dir::Left)
294,102✔
1701
    }
294,102✔
1702

1703
    /// Edges leaving the right side of the node in the format
1704
    /// (base, target_node id, incoming side of target node, whether target node is flipped)
1705
    pub fn r_edges(&self) -> SmallVec4<(u8, usize, Dir, bool)> {
294,102✔
1706
        self.graph.find_edges(self.node_id, Dir::Right)
294,102✔
1707
    }
294,102✔
1708

1709
    /// Edges leaving the 'dir' side of the node in the format
1710
    /// (base, target_node id, incoming side of target node, whether target node is flipped)
1711
    pub fn edges(&self, dir: Dir) -> SmallVec4<(u8, usize, Dir, bool)> {
494,575✔
1712
        self.graph.find_edges(self.node_id, dir)
494,575✔
1713
    }
494,575✔
1714

1715
    fn to_json<F: Fn(&D) -> Value>(&self, func: &F, f: &mut dyn Write) {
×
1716
        write!(
×
1717
            f,
×
1718
            "{{\"id\":\"{}\",\"L\":{},\"D\":{},\"Se\":\"{:?}\"}}",
×
1719
            self.node_id,
1720
            self.sequence().len(),
×
1721
            (func)(self.data()),
×
1722
            self.sequence(),
×
1723
        )
1724
        .unwrap();
×
1725
    }
×
1726

1727
    fn edges_to_json(&self, f: &mut dyn Write) -> bool {
×
1728
        let mut wrote = false;
×
1729
        let edges = self.r_edges();
×
1730
        for (idx, &(_, id, incoming_dir, _)) in edges.iter().enumerate() {
×
1731
            write!(
×
1732
                f,
×
1733
                "{{\"source\":\"{}\",\"target\":\"{}\",\"D\":\"{}\"}}",
×
1734
                self.node_id,
1735
                id,
1736
                match incoming_dir {
×
1737
                    Dir::Left => "L",
×
1738
                    Dir::Right => "R",
×
1739
                }
1740
            )
1741
            .unwrap();
×
1742

1743
            if idx < edges.len() - 1 {
×
1744
                write!(f, ",").unwrap();
×
1745
            }
×
1746

1747
            wrote = true;
×
1748
        }
1749
        wrote
×
1750
    }
×
1751
}
1752

1753
// TODO make generic instead of u8 (u8 is sufficient for dbg)
1754
impl<K: Kmer, SD: Debug> Node<'_, K, SD>  {
1755
    /// get default format for dot edges based on node data
1756
    pub fn edge_dot_default<DI>(&self, colors: &Colors<SD, DI>, base: u8, incoming_dir: Dir, flipped: bool) -> String 
194,761✔
1757
    where SD: SummaryData<DI>
194,761✔
1758
    {
1759
        // set color based on dir
1760
        let color = match incoming_dir {
194,761✔
1761
            Dir::Left => "blue",
97,593✔
1762
            Dir::Right => "red"
97,168✔
1763
        };
1764
        
1765
        if let Some(em) = self.data().edge_mults() {
194,761✔
1766
            
1767
            let dir = if flipped { 
194,761✔
1768
                incoming_dir 
77,306✔
1769
            } else {
1770
                incoming_dir.flip()
117,455✔
1771
            };
1772

1773
            // set penwidth based on count
1774
            let count = em.edge_mult(base, dir);
194,761✔
1775
            let penwidth = colors.edge_width(count);
194,761✔
1776

1777
            format!("[color={color}, penwidth={penwidth}, label=\"{}: {count}\"]", bits_to_base(base))
194,761✔
1778
        } else {
NEW
1779
            format!("[color={color}, penwidth={}]", colors.edge_width(1)) // since there should be no edge mults, this will return default value
×
1780
        }
1781
    }
194,761✔
1782

1783
    /// get default format for dot nodes, based on node data
1784
    pub fn node_dot_default<DI>(&self, colors: &Colors<SD, DI>, config: &SummaryConfig, translator: &Translator, outline: bool) -> String
97,994✔
1785
    where SD: SummaryData<DI>
97,994✔
1786
    {
1787
        // set color based on labels/fold change/p-value
1788
        let color = colors.node_color(self.data(), config, outline);
97,994✔
1789

1790
        let data_info = self.data().print(translator, config);
97,994✔
1791
        const MIN_TEXT_WIDTH: usize = 40;
1792
        let wrap = if self.len() > MIN_TEXT_WIDTH { self.len() } else { MIN_TEXT_WIDTH };
97,994✔
1793

1794
        let label = textwrap::fill(&format!("id: {}, len: {}, exts: {:?}, seq: {}\n{}", 
97,994✔
1795
            self.node_id,
97,994✔
1796
            self.len(),
97,994✔
1797
            self.exts(),
97,994✔
1798
            self.sequence(),
97,994✔
1799
            data_info
97,994✔
1800
        ), wrap);
97,994✔
1801

1802
        format!("[style=filled, {color}, label=\"{label}\"]")
97,994✔
1803
    }
97,994✔
1804
}
1805

1806
impl<K: Kmer, D> fmt::Debug for Node<'_, K, D>
1807
where
1808
    D: Debug,
1809
{
1810
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
140✔
1811
        write!(
140✔
1812
            f,
140✔
1813
            "Node {{ id:{}, Exts: {:?}, L:{:?} R:{:?}, Seq: {:?}, Data: {:?} }}",
140✔
1814
            self.node_id,
1815
            self.exts(),
140✔
1816
            self.l_edges(),
140✔
1817
            self.r_edges(),
140✔
1818
            self.sequence().len(),
140✔
1819
            self.data()
140✔
1820
        )
1821
    }
140✔
1822
}
1823

1824
pub struct IterComponents<'a, K: Kmer, D> {
1825
    graph: &'a DebruijnGraph<K, D>,
1826
    visited: Vec<bool>,
1827
    pos: usize,
1828
}
1829

1830
impl<K: Kmer, D: Debug> Iterator for IterComponents<'_, K, D> {
1831
    type Item = Vec<usize>;
1832
    fn next(&mut self) -> Option<Self::Item> {
246✔
1833
        while self.pos < self.graph.len() {
33,215✔
1834
            if !self.visited[self.pos] {
33,210✔
1835
                let comp = self.graph.component_i(&mut self.visited, self.pos);
241✔
1836
                self.pos += 1;
241✔
1837
                return Some(comp)
241✔
1838
            } else {
32,969✔
1839
                self.pos += 1;
32,969✔
1840
            }
32,969✔
1841
        }
1842
        assert!(self.visited.iter().map(|x| *x as usize).sum::<usize>() == self.graph.len());
33,210✔
1843
        None
5✔
1844
    }
246✔
1845
    
1846
}
1847

1848
pub struct PathCompIter<'a, K: Kmer, D: Debug, F, F2> 
1849
where 
1850
F: Fn(&D) -> f32,
1851
F2: Fn(&D) -> bool
1852
{
1853
    graph: &'a DebruijnGraph<K, D>,
1854
    component_iterator: IterComponents<'a, K, D>,
1855
    graph_pos: usize,
1856
    score: F,
1857
    solid_path: F2,
1858
}
1859

1860
/// returns the component and the "best" path in the component
1861
impl<K: Kmer, D: Debug, F, F2> Iterator for PathCompIter<'_, K, D, F, F2> 
1862
where 
1863
F: Fn(&D) -> f32,
1864
F2: Fn(&D) -> bool
1865
{
1866
    type Item = (Vec<usize>, VecDeque<(usize, Dir)>,);
1867
    fn next(&mut self) -> Option<Self::Item> {
240✔
1868
        match self.component_iterator.next() {
240✔
1869
            Some(component) => {
237✔
1870
                let current_comp = component;
237✔
1871
                
1872
    
1873
                let mut best_node = current_comp[0];
237✔
1874
                let mut best_score = f32::MIN;
237✔
1875
                for c in current_comp.iter() {
32,934✔
1876
                    let node = self.graph.get_node(*c);
32,934✔
1877
                    let node_score = (self.score)(node.data());
32,934✔
1878
    
1879
                    if node_score > best_score {
32,934✔
1880
                        best_node = *c;
281✔
1881
                        best_score = node_score;
281✔
1882
                    }
32,653✔
1883
                }
1884
    
1885
                let oscore = |state| match state {
60,162✔
1886
                    None => 0.0,
30,001✔
1887
                    Some((id, _)) => (self.score)(self.graph.get_node(id).data()),
30,161✔
1888
                };
60,162✔
1889
    
1890
                let osolid_path = |state| match state {
30,081✔
1891
                    None => false,
×
1892
                    Some((id, _)) => (self.solid_path)(self.graph.get_node(id).data()),
30,081✔
1893
                };
30,081✔
1894
    
1895
                // Now expand in each direction, greedily taking the best path. Stop if we hit a node we've
1896
                // already put into the path
1897
                let mut used_nodes = HashSet::new();
237✔
1898
                let mut path = VecDeque::new();
237✔
1899
    
1900
                // Start w/ initial state
1901
                used_nodes.insert(best_node);
237✔
1902
                path.push_front((best_node, Dir::Left));
237✔
1903
    
1904
                for init in [(best_node, Dir::Left, false), (best_node, Dir::Right, true)].iter() {
474✔
1905
                    let &(start_node, dir, do_flip) = init;
474✔
1906
                    let mut current = (start_node, dir);
474✔
1907
    
1908
                    loop {
1909
                        let mut next = None;
30,472✔
1910
                        let (cur_id, incoming_dir) = current;
30,472✔
1911
                        let node = self.graph.get_node(cur_id);
30,472✔
1912
                        let edges = node.edges(incoming_dir.flip());
30,472✔
1913
    
1914
                        let mut solid_paths = 0;
30,472✔
1915
                        for (_, id, dir, _) in edges {
60,553✔
1916
                            let cand = Some((id, dir));
30,081✔
1917
                            if osolid_path(cand) {
30,081✔
1918
                                solid_paths += 1;
30,081✔
1919

1920
                                // second if clause is outside of first in original code (see max_path) 
1921
                                // but would basically ignore path validity.
1922
                                if oscore(cand) > oscore(next) {
30,081✔
1923
                                    next = cand;
30,039✔
1924
                                }
30,039✔
1925
                            }
×
1926
                        }
1927
                        
1928
                        // break if multiple solid paths are available
1929
                        /* if solid_paths > 1 {
1930
                            break;
1931
                        } */
1932
    
1933
                        match next {
30,001✔
1934
                            Some((next_id, next_incoming)) if !used_nodes.contains(&next_id) => {
30,001✔
1935
                                if do_flip {
29,998✔
1936
                                    path.push_front((next_id, next_incoming.flip()));
15,061✔
1937
                                } else {
15,061✔
1938
                                    path.push_back((next_id, next_incoming));
14,937✔
1939
                                }
14,937✔
1940
    
1941
                                used_nodes.insert(next_id);
29,998✔
1942
                                current = (next_id, next_incoming);
29,998✔
1943
                            }
1944
                            _ => break,
474✔
1945
                        }
1946
                    }
1947
                }
1948
                
1949
                
1950
                Some((current_comp, path))
237✔
1951
            }, 
1952
            None => {
1953
                // should technically not need graph_pos after this 
1954
                self.graph_pos += 1;
3✔
1955
                None
3✔
1956
            }
1957
        }
1958
    }
240✔
1959
}
1960

1961

1962
/// iterator over the edges of the de bruijn graph
1963
pub struct EdgeIter<'a, K: Kmer, D: Debug> {
1964
    graph: &'a DebruijnGraph<K, D>,
1965
    visited_edges: HashSet<(usize, usize)>,
1966
    current_node: usize,
1967
    current_dir: Dir,
1968
    node_edge_iter: smallvec::IntoIter<[(u8, usize, Dir, bool); 4]>
1969
}
1970

1971
impl<K: Kmer, D: Debug> EdgeIter<'_, K, D> {
1972
    pub fn new(graph: &DebruijnGraph<K, D>) -> EdgeIter<'_, K, D>{
1✔
1973
        let node_edge_iter = graph.get_node(0).l_edges().into_iter();
1✔
1974

1975
        EdgeIter { 
1✔
1976
            graph, 
1✔
1977
            visited_edges: HashSet::new(), 
1✔
1978
            current_node: 0, 
1✔
1979
            current_dir: Dir::Left, 
1✔
1980
            node_edge_iter
1✔
1981
        }
1✔
1982
    }
1✔
1983
}
1984

1985
impl<K: Kmer, D: Debug> Iterator for EdgeIter<'_, K, D> {
1986
    type Item = (usize, Dir, u8, usize); // node, direction leaving node, base, target node
1987

1988
    fn next(&mut self) -> Option<Self::Item> {
27✔
1989
        loop {
1990
            if let Some((base, nb_node_id, _, _)) = self.node_edge_iter.next() {
108✔
1991
                let edge = if self.current_node > nb_node_id { (nb_node_id, self.current_node) } else { (self.current_node, nb_node_id) };
52✔
1992

1993
                if self.visited_edges.insert(edge) { return Some((self.current_node, self.current_dir, base, nb_node_id)); } // else simply skip and move on
52✔
1994

1995
            } else {
1996
                match self.current_dir {
56✔
1997
                Dir::Left => {
28✔
1998
                    // no left edges, switch to right edges
28✔
1999
                    self.current_dir = Dir::Right;
28✔
2000
                    self.node_edge_iter = self.graph.get_node(self.current_node).r_edges().into_iter();
28✔
2001
                    
28✔
2002
                }
28✔
2003
                Dir::Right => {
2004
                    // no right edges, switch to next node left edges
2005
                    self.current_node += 1;
28✔
2006

2007
                    // quit if end of graph is reached
2008
                    if self.current_node == self.graph.len() { return None }
28✔
2009

2010
                    self.current_dir = Dir::Left;
27✔
2011
                    self.node_edge_iter = self.graph.get_node(self.current_node).l_edges().into_iter();
27✔
2012
                }
2013
            }
2014
            }
2015
            
2016
        }
2017
    }
27✔
2018
}
2019

2020
#[cfg(test)]
2021
mod test {
2022
    use std::{fs::File, io::BufReader};
2023

2024
    use crate::{kmer::Kmer16, summarizer::TagsCountsSumData};
2025

2026
    use super::DebruijnGraph;
2027
    use crate::{summarizer::SummaryData, Dir, BUF};
2028

2029

2030
    #[test]
2031
    #[cfg(not(feature = "sample128"))]
2032
    fn test_components() {
2033

2034
        let path = "test_data/400.graph.dbg";
2035
        let file = BufReader::with_capacity(BUF, File::open(path).unwrap());
2036

2037
        let (graph, _, _): (DebruijnGraph<Kmer16, TagsCountsSumData>, Vec<String>, crate::summarizer::SummaryConfig) = 
2038
            bincode::deserialize_from(file).expect("error deserializing graph");
2039

2040
        let components = graph.iter_components();
2041

2042
        let check_components = [
2043
            vec![3, 67, 130, 133, 59, 119, 97, 110, 68, 137, 29, 84, 131, 43, 30, 91, 14, 70, 79, 142, 136, 105, 103, 62, 
2044
                141, 104, 134, 88, 38, 81, 108, 92, 135, 96, 116, 121, 63, 124, 106, 129, 132, 126, 93, 109, 83, 112, 118, 
2045
                123, 125, 78, 122, 115, 75, 128, 140, 111, 26, 143, 113],
2046
            vec![41, 138, 100, 139, 86],
2047
            vec![53, 117, 127],
2048
            vec![69, 144, 77, 120, 114, 107, 101],
2049
        ];
2050

2051
        let mut counter = 0;
2052

2053
        for component in components {
2054
            if component.len() > 1 { 
2055
                println!("component: {:?}", component);
2056
                assert_eq!(component, check_components[counter]);
2057
                counter += 1;
2058
            }
2059
        }
2060
        assert_eq!(vec![(139, Dir::Left)], graph.max_path(|data| data.sum().unwrap_or(1) as f32, |_| true));
2061
    }
2062

2063
    #[test]
2064
    fn test_iter_edges() {
1✔
2065
        use crate::{compression::uncompressed_graph, filter::filter_kmers, reads::{Reads, ReadsPaired}, summarizer::{SampleInfo, SummaryConfig, TagsData}, Exts};
2066

2067
        let read1 = "CAGCATCGATGCGACGAGCGCTCGCATCGA".as_bytes();
1✔
2068
        let read2 = "ACGATCGTACGTAGCTAGCTGACTGAGC".as_bytes();
1✔
2069

2070
        let mut reads = Reads::new(crate::reads::Strandedness::Forward);
1✔
2071
        reads.add_from_bytes(read1, Exts::empty(), 0u8);
1✔
2072
        reads.add_from_bytes(read2, Exts::empty(), 1);
1✔
2073

2074
        let reads_paired = ReadsPaired::Unpaired { reads };
1✔
2075

2076
        let sample_info = SampleInfo::new(0b1, 0b10, 1, 1, vec![12, 12]);
1✔
2077
        let summary_config = SummaryConfig::new(1, None, crate::summarizer::GroupFrac::None, 0.3, sample_info, None, crate::summarizer::StatTest::WelchsTTest);
1✔
2078
        let (kmers, _) = filter_kmers::<TagsData, Kmer16, _>(&reads_paired, &summary_config, false, 1, false);
1✔
2079

2080
        let graph = uncompressed_graph(&kmers).finish();
1✔
2081

2082
        let check_edges: Vec<(usize, Dir, u8, usize)> = vec![(0, Dir::Left, 2, 16), (0, Dir::Right, 1, 2), (1, Dir::Left, 0, 11), 
1✔
2083
        (1, Dir::Right, 0, 16), (3, Dir::Left, 0, 13), (3, Dir::Right, 3, 10), (4, Dir::Left, 2, 21), (4, Dir::Right, 1, 17), (5, Dir::Left, 1, 27), 
1✔
2084
        (5, Dir::Right, 2, 19), (6, Dir::Left, 1, 24), (6, Dir::Right, 2, 12), (7, Dir::Left, 2, 23), (7, Dir::Right, 3, 11), (8, Dir::Left, 0, 12), 
1✔
2085
        (9, Dir::Left, 3, 17), (9, Dir::Right, 3, 24), (10, Dir::Right, 1, 21), (13, Dir::Left, 1, 18), (14, Dir::Right, 0, 26), 
1✔
2086
        (15, Dir::Left, 2, 20), (15, Dir::Right, 3, 22), (18, Dir::Left, 2, 19), (20, Dir::Left, 1, 26), (22, Dir::Right, 2, 25), (23, Dir::Left, 1, 25)];
1✔
2087

2088
        let edges = graph.iter_edges().collect::<Vec<_>>();
1✔
2089

2090
        assert_eq!(check_edges, edges);
1✔
2091
    }
1✔
2092
}
2093

STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2026 Coveralls, Inc