• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

jlab / rust-debruijn / 14644057144

24 Apr 2025 02:22PM UTC coverage: 85.013% (+0.2%) from 84.785%
14644057144

Pull #16

github

web-flow
Merge 6a082e950 into 8f229478f
Pull Request #16: Fix/unreachable kmers

125 of 133 new or added lines in 3 files covered. (93.98%)

2 existing lines in 1 file now uncovered.

6722 of 7907 relevant lines covered (85.01%)

3446407.62 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

68.22
/src/graph.rs
1
// Copyright 2017 10x Genomics
2

3
//! Containers for path-compressed De Bruijn graphs
4

5
use bit_set::BitSet;
6
use indicatif::ProgressBar;
7
use indicatif::ProgressIterator;
8
use indicatif::ProgressStyle;
9
use itertools::enumerate;
10
use log::{debug, trace};
11
use rayon::prelude::*;
12
use rayon::current_num_threads;
13
use serde_derive::{Deserialize, Serialize};
14
use smallvec::SmallVec;
15
use std::borrow::Borrow;
16

17
use std::collections::HashSet;
18
use std::collections::VecDeque;
19
use std::f32;
20
use std::fmt::{self, Debug, Display};
21
use std::fs::{remove_file, File};
22
use std::hash::Hash;
23
use std::io::BufWriter;
24
use std::io::{BufReader, Error, Read};
25
use std::io::Write;
26
use std::iter::FromIterator;
27
use std::marker::PhantomData;
28
use std::path::Path;
29

30
use boomphf::hashmap::BoomHashMap;
31

32
use serde_json;
33
use serde_json::Value;
34

35
type SmallVec4<T> = SmallVec<[T; 4]>;
36
type SmallVec8<T> = SmallVec<[T; 8]>;
37

38
use crate::bits_to_base;
39
use crate::colors::Colors;
40
use crate::compression::CompressionSpec;
41
use crate::dna_string::{DnaString, DnaStringSlice, PackedDnaStringSet};
42
use crate::summarizer::SummaryConfig;
43
use crate::summarizer::SummaryData;
44
use crate::BUF;
45
use crate::PROGRESS_STYLE;
46
use crate::{Dir, Exts, Kmer, Mer, Vmer};
47

48
/// A compressed DeBruijn graph carrying auxiliary data on each node of type `D`.
49
/// This type does not carry the sorted index arrays the allow the graph
50
/// to be walked efficiently. The `DeBruijnGraph` type wraps this type and add those
51
/// vectors.
52
#[derive(Serialize, Deserialize, Clone, Debug)]
53
pub struct BaseGraph<K, D> {
54
    pub sequences: PackedDnaStringSet,
55
    pub exts: Vec<Exts>,
56
    pub data: Vec<D>,
57
    pub stranded: bool,
58
    phantom: PhantomData<K>,
59
}
60

61
impl<K, D> BaseGraph<K, D> {
62
    pub fn new(stranded: bool) -> Self {
1,644✔
63
        BaseGraph {
1,644✔
64
            sequences: PackedDnaStringSet::new(),
1,644✔
65
            exts: Vec::new(),
1,644✔
66
            data: Vec::new(),
1,644✔
67
            phantom: PhantomData,
1,644✔
68
            stranded,
1,644✔
69
        }
1,644✔
70
    }
1,644✔
71

72
    pub fn len(&self) -> usize {
737,201✔
73
        self.sequences.len()
737,201✔
74
    }
737,201✔
75

76
    pub fn is_empty(&self) -> bool {
1✔
77
        self.sequences.is_empty()
1✔
78
    }
1✔
79

80
    pub fn combine<I: Iterator<Item = BaseGraph<K, D>>>(graphs: I) -> Self {
13✔
81
        let mut sequences = PackedDnaStringSet::new();
13✔
82
        let mut exts = Vec::new();
13✔
83
        let mut data = Vec::new();
13✔
84
        let mut stranded = Vec::new();
13✔
85

86
        for g in graphs {
1,298✔
87
            for s in 0..g.sequences.len() {
2,889✔
88
                sequences.add(&g.sequences.get(s));
2,889✔
89
            }
2,889✔
90

91
            exts.extend(g.exts);
1,285✔
92
            data.extend(g.data);
1,285✔
93
            stranded.push(g.stranded);
1,285✔
94
        }
95

96
        let out_stranded = stranded.iter().all(|x| *x);
13✔
97

13✔
98
        if !out_stranded && !stranded.iter().all(|x| !*x) {
1,285✔
99
            panic!("attempted to combine stranded and unstranded graphs");
×
100
        }
13✔
101

13✔
102
        BaseGraph {
13✔
103
            sequences,
13✔
104
            stranded: out_stranded,
13✔
105
            exts,
13✔
106
            data,
13✔
107
            phantom: PhantomData,
13✔
108
        }
13✔
109
    }
13✔
110
}
111

112
impl<K: Kmer, D> BaseGraph<K, D> {
113
    pub fn add<R: Borrow<u8>, S: IntoIterator<Item = R>>(
704,866✔
114
        &mut self,
704,866✔
115
        sequence: S,
704,866✔
116
        exts: Exts,
704,866✔
117
        data: D,
704,866✔
118
    ) {
704,866✔
119
        self.sequences.add(sequence);
704,866✔
120
        self.exts.push(exts);
704,866✔
121
        self.data.push(data);
704,866✔
122
    }
704,866✔
123
}
124

125
impl<K: Kmer + Send + Sync, D> BaseGraph<K, D> {
126
    pub fn finish(self) -> DebruijnGraph<K, D> {
359✔
127
        let indices: Vec<u32> = (0..self.len() as u32).collect();
359✔
128

129
        let left_order = {
359✔
130
            let mut kmers: Vec<K> = Vec::with_capacity(self.len());
359✔
131
            for idx in &indices {
702,890✔
132
                kmers.push(self.sequences.get(*idx as usize).first_kmer());
702,531✔
133
                
702,531✔
134
            }
702,531✔
135
            
136
            BoomHashMap::new_parallel(kmers, indices.clone())
359✔
137
        };
138

139
        let right_order = {
359✔
140
            let mut kmers: Vec<K> = Vec::with_capacity(self.len());
359✔
141
            for idx in &indices {
702,890✔
142
                kmers.push(self.sequences.get(*idx as usize).last_kmer());
702,531✔
143
            }
702,531✔
144
            
145
            BoomHashMap::new_parallel(kmers, indices)
359✔
146
        };
359✔
147
        debug!("finish graph loops: 2x {}", self.len());
359✔
148

149
        DebruijnGraph {
359✔
150
            base: self,
359✔
151
            left_order,
359✔
152
            right_order,
359✔
153
        }
359✔
154
    }
359✔
155
}
156

157
impl<K: Kmer, D> BaseGraph<K, D> {
158
    pub fn finish_serial(self) -> DebruijnGraph<K, D> {
×
159
        let indices: Vec<u32> = (0..self.len() as u32).collect();
×
160

161
        let left_order = {
×
162
            let mut kmers: Vec<K> = Vec::with_capacity(self.len());
×
163
            let mut sequences: Vec<String> = Vec::new();
×
164
            for idx in &indices {
×
165
                kmers.push(self.sequences.get(*idx as usize).first_kmer());
×
166
                sequences.push(self.sequences.get(*idx as usize).to_dna_string());
×
167
            }
×
168
            println!("left kmers: {:?}", kmers);
×
169
            println!("left seqs: {:?}", sequences);
×
170
            BoomHashMap::new(kmers, indices.clone())
×
171
        };
172

173
        let right_order = {
×
174
            let mut kmers: Vec<K> = Vec::with_capacity(self.len());
×
175
            let mut sequences: Vec<String> = Vec::new();
×
176
            for idx in &indices {
×
177
                kmers.push(self.sequences.get(*idx as usize).last_kmer());
×
178
                sequences.push(self.sequences.get(*idx as usize).to_dna_string());
×
179
            }
×
180
            println!("right kmers: {:?}", kmers);
×
181
            println!("right seqs: {:?}", sequences);
×
182
            BoomHashMap::new(kmers, indices)
×
183
        };
×
184

×
185
        DebruijnGraph {
×
186
            base: self,
×
187
            left_order,
×
188
            right_order,
×
189
        }
×
190
    }
×
191

192
    pub fn remove_duplicate_kmers(self) {
×
193
        
×
194
    }
×
195
}
196

197
/// A compressed DeBruijn graph carrying auxiliary data on each node of type `D`.
198
/// The struct carries sorted index arrays the allow the graph
199
/// to be walked efficiently.
200
#[derive(Serialize, Deserialize, Debug)]
201
pub struct DebruijnGraph<K: Hash, D> {
202
    pub base: BaseGraph<K, D>,
203
    left_order: BoomHashMap<K, u32>,
204
    right_order: BoomHashMap<K, u32>,
205
}
206

207
impl<K: Kmer, D: Debug> DebruijnGraph<K, D> {
208
    /// Total number of nodes in the DeBruijn graph
209
    pub fn len(&self) -> usize {
736,100✔
210
        self.base.len()
736,100✔
211
    }
736,100✔
212

213
    pub fn is_empty(&self) -> bool {
1✔
214
        self.base.is_empty()
1✔
215
    }
1✔
216

217
    /// Get a node given it's `node_id`
218
    pub fn get_node(&self, node_id: usize) -> Node<K, D> {
4,376,824✔
219
        Node {
4,376,824✔
220
            node_id,
4,376,824✔
221
            graph: self,
4,376,824✔
222
        }
4,376,824✔
223
    }
4,376,824✔
224

225
    /// Get a node given it's `node_id`
226
    pub fn get_node_kmer(&self, node_id: usize) -> NodeKmer<K, D> {
×
227
        let node = self.get_node(node_id);
×
228
        let node_seq = node.sequence();
×
229

×
230
        NodeKmer {
×
231
            node_id,
×
232
            node_seq_slice: node_seq,
×
233
            phantom_d: PhantomData,
×
234
            phantom_k: PhantomData,
×
235
        }
×
236
    }
×
237

238
    /// Return an iterator over all nodes in the graph
239
    pub fn iter_nodes(&self) -> NodeIter<K, D> {
129✔
240
        NodeIter {
129✔
241
            graph: self,
129✔
242
            node_id: 0,
129✔
243
        }
129✔
244
    }
129✔
245

246
    /// Find the edges leaving node `node_id` in direction `Dir`. Should generally be
247
    /// accessed via a Node wrapper object
248
    fn find_edges(&self, node_id: usize, dir: Dir) -> SmallVec4<(u8, usize, Dir, bool)> {
507,902✔
249
        let exts = self.base.exts[node_id];
507,902✔
250
        let sequence = self.base.sequences.get(node_id);
507,902✔
251
        let kmer: K = sequence.term_kmer(dir);
507,902✔
252
        let mut edges = SmallVec4::new();
507,902✔
253

254
        for i in 0..4 {
2,539,510✔
255
            if exts.has_ext(dir, i) {
2,031,608✔
256
                let link = self.find_link(kmer.extend(i, dir), dir).expect("missing link");
732,811✔
257
                edges.push((i, link.0, link.1, link.2));
732,811✔
258
            }
1,298,797✔
259
        }
260

261
        edges
507,902✔
262
    }
507,902✔
263

264
    /// Find the edges leaving node `node_id` in direction `Dir`. Should generally be
265
    /// accessed via a Node wrapper object
266
    /// 
267
    /// allows missing links
268
    fn _find_edges_sharded(&self, node_id: usize, dir: Dir) -> SmallVec4<(u8, usize, Dir, bool)> {
×
269
        let exts = self.base.exts[node_id];
×
270
        let sequence = self.base.sequences.get(node_id);
×
271
        let kmer: K = sequence.term_kmer(dir);
×
272
        let mut edges = SmallVec4::new();
×
273

274
        for i in 0..4 {
×
275
            if exts.has_ext(dir, i) {
×
276
                let link = self.find_link(kmer.extend(i, dir), dir); //.expect("missing link");
×
277
                if let Some(l) = link {
×
278
                    edges.push((i, l.0, l.1, l.2));
×
279
                }
×
280
                // Otherwise, this edge doesn't exist within this shard, so ignore it.
281
                // NOTE: this should be allowed in a 'complete' DBG
282
            }
×
283
        }
284

285
        edges
×
286
    }
×
287

288
    /// Seach for the kmer `kmer`, appearing at the given `side` of a node sequence.
289
    fn search_kmer(&self, kmer: K, side: Dir) -> Option<usize> {
4,017,144✔
290
        match side {
4,017,144✔
291
            Dir::Left => self.left_order.get(&kmer).map(|pos| *pos as usize),
2,007,793✔
292
            Dir::Right => self.right_order.get(&kmer).map(|pos| *pos as usize),
2,009,351✔
293
        }
294
    }
4,017,144✔
295

296
    /// Find a link in the graph, possibly handling a RC switch.
297
    pub fn find_link(&self, kmer: K, dir: Dir) -> Option<(usize, Dir, bool)> {
2,813,464✔
298
        // Only test self-consistent paths through
2,813,464✔
299
        // the edges
2,813,464✔
300
        // Avoids problems due to single kmer edges
2,813,464✔
301
        // (dir, next_side_incoming, rc)
2,813,464✔
302
        // (Dir::Left, Dir::Right, false) => true,
2,813,464✔
303
        // (Dir::Left, Dir::Left,  true) => true,
2,813,464✔
304
        // (Dir::Right, Dir::Left, false) => true,
2,813,464✔
305
        // (Dir::Right, Dir::Right, true) => true,
2,813,464✔
306

2,813,464✔
307
        let rc = kmer.rc();
2,813,464✔
308

2,813,464✔
309
        match dir {
2,813,464✔
310
            Dir::Left => {
311
                if let Some(idx) = self.search_kmer(kmer, Dir::Right) {
1,408,685✔
312
                    return Some((idx, Dir::Right, false));
805,671✔
313
                }
603,014✔
314

603,014✔
315
                if !self.base.stranded {
603,014✔
316
                    if let Some(idx) = self.search_kmer(rc, Dir::Left) {
603,014✔
317
                        return Some((idx, Dir::Left, true));
603,014✔
318
                    }
×
319
                }
×
320
            }
321

322
            Dir::Right => {
323
                if let Some(idx) = self.search_kmer(kmer, Dir::Left) {
1,404,779✔
324
                    return Some((idx, Dir::Left, false));
804,113✔
325
                }
600,666✔
326

600,666✔
327
                if !self.base.stranded {
600,666✔
328
                    if let Some(idx) = self.search_kmer(rc, Dir::Right) {
600,666✔
329
                        return Some((idx, Dir::Right, true));
600,666✔
330
                    }
×
331
                }
×
332
            }
333
        }
334

335
        None
×
336
    }
2,813,464✔
337

338
    /// Check whether the graph is fully compressed. Return `None` if it's compressed,
339
    /// otherwise return `Some(node1, node2)` representing a pair of node that could
340
    /// be collapsed. Probably only useful for testing.
341
    pub fn is_compressed<S: CompressionSpec<D>>(&self, spec: &S) -> Option<(usize, usize)> {
567✔
342
        for i in 0..self.len() {
72,270✔
343
            let n = self.get_node(i);
72,270✔
344

345
            for dir in &[Dir::Left, Dir::Right] {
216,810✔
346
                let dir_edges = n.edges(*dir);
144,540✔
347
                if dir_edges.len() == 1 {
144,540✔
348
                    let (_, next_id, return_dir, _) = dir_edges[0];
101,948✔
349
                    let next = self.get_node(next_id);
101,948✔
350

101,948✔
351
                    let ret_edges = next.edges(return_dir);
101,948✔
352
                    if ret_edges.len() == 1 {
101,948✔
353
                        // Test for us being a palindrome: this makes it OK
354
                        if n.len() == K::k() && n.sequence().first_kmer::<K>().is_palindrome() {
18✔
355
                            continue;
12✔
356
                        }
6✔
357

6✔
358
                        // Test for a neighbor being a palindrome: this makes it OK
6✔
359
                        if next.len() == K::k() && next.sequence().first_kmer::<K>().is_palindrome()
6✔
360
                        {
361
                            continue;
6✔
362
                        }
×
363

×
364
                        // Test for this edge representing a smooth circle (biting it's own tail):
×
365
                        if n.node_id == next_id {
×
366
                            continue;
×
367
                        }
×
368

×
369
                        if spec.join_test(n.data(), next.data()) {
×
370
                            // Found a unbranched edge that should have been eliminated
371
                            return Some((i, next_id));
×
372
                        }
×
373
                    }
101,930✔
374
                }
42,592✔
375
            }
376
        }
377

378
        None
567✔
379
    }
567✔
380

381
    /// Remove non-existent extensions that may be created due to filtered kmers
382
    /// 
383
    /// if `valid_nodes` if `None`, all nodes are valid
384
    pub fn fix_exts(&mut self, valid_nodes: Option<&BitSet>) {
247✔
385
        for i in 0..self.len() {
688,205✔
386
            let valid_exts = self.get_valid_exts(i, valid_nodes);
688,205✔
387
            self.base.exts[i] = valid_exts;
688,205✔
388
        }
688,205✔
389
    }
247✔
390

391
    pub fn get_valid_exts(&self, node_id: usize, valid_nodes: Option<&BitSet>) -> Exts {
688,205✔
392
        let mut new_exts = Exts::empty();
688,205✔
393
        let node = self.get_node(node_id);
688,205✔
394
        let exts = node.exts();
688,205✔
395
        let l_kmer: K = node.sequence().first_kmer();
688,205✔
396
        let r_kmer: K = node.sequence().last_kmer();
688,205✔
397

688,205✔
398
        let check_node = |id| match valid_nodes {
1,401,282✔
399
            Some(bs) => bs.contains(id),
1,358,888✔
400
            None => true,
42,394✔
401
        };
1,401,282✔
402

403
        for i in 0..4 {
3,441,025✔
404
            if exts.has_ext(Dir::Left, i) {
2,752,820✔
405
                match self.find_link(l_kmer.extend_left(i), Dir::Left) {
700,745✔
406
                    Some((target, _, _)) if check_node(target) => {
700,745✔
407
                        new_exts = new_exts.set(Dir::Left, i)
700,745✔
408
                    }
409
                    _ => (),
×
410
                }
411
            }
2,052,075✔
412

413
            if exts.has_ext(Dir::Right, i) {
2,752,820✔
414
                match self.find_link(r_kmer.extend_right(i), Dir::Right) {
700,537✔
415
                    Some((target, _, _)) if check_node(target) => {
700,537✔
416
                        new_exts = new_exts.set(Dir::Right, i)
700,537✔
417
                    }
418
                    _ => (),
×
419
                }
420
            }
2,052,283✔
421
        }
422

423
        new_exts
688,205✔
424
    }
688,205✔
425

426
    /// Find the highest-scoring, unambiguous path in the graph. Each node get a score
427
    /// given by `score`. Any node where `solid_path(node) == True` are valid paths -
428
    /// paths will be terminated if there are multiple valid paths emanating from a node.
429
    pub fn max_path<F, F2>(&self, score: F, solid_path: F2) -> Vec<(usize, Dir)>
×
430
    where
×
431
        F: Fn(&D) -> f32,
×
432
        F2: Fn(&D) -> bool,
×
433
    {
×
434
        if self.is_empty() {
×
435
            return Vec::default();
×
436
        }
×
437

×
438
        let mut best_node = 0;
×
439
        let mut best_score = f32::MIN;
×
440
        for i in 0..self.len() {
×
441
            let node = self.get_node(i);
×
442
            let node_score = score(node.data());
×
443

×
444
            if node_score > best_score {
×
445
                best_node = i;
×
446
                best_score = node_score;
×
447
            }
×
448
        }
449

450
        let oscore = |state| match state {
×
451
            None => 0.0,
×
452
            Some((id, _)) => score(self.get_node(id).data()),
×
453
        };
×
454

455
        let osolid_path = |state| match state {
×
456
            None => false,
×
457
            Some((id, _)) => solid_path(self.get_node(id).data()),
×
458
        };
×
459

460
        // Now expand in each direction, greedily taking the best path. Stop if we hit a node we've
461
        // already put into the path
462
        let mut used_nodes = HashSet::new();
×
463
        let mut path = VecDeque::new();
×
464

×
465
        // Start w/ initial state
×
466
        used_nodes.insert(best_node);
×
467
        path.push_front((best_node, Dir::Left));
×
468

469
        for init in [(best_node, Dir::Left, false), (best_node, Dir::Right, true)].iter() {
×
470
            let &(start_node, dir, do_flip) = init;
×
471
            let mut current = (start_node, dir);
×
472

473
            loop {
474
                let mut next = None;
×
475
                let (cur_id, incoming_dir) = current;
×
476
                let node = self.get_node(cur_id);
×
477
                let edges = node.edges(incoming_dir.flip());
×
478

×
479
                let mut solid_paths = 0;
×
480
                for (_, id, dir, _) in edges {
×
481
                    let cand = Some((id, dir));
×
482
                    if osolid_path(cand) {
×
483
                        solid_paths += 1;
×
484
                    }
×
485

486
                    if oscore(cand) > oscore(next) {
×
487
                        next = cand;
×
488
                    }
×
489
                }
490

491
                if solid_paths > 1 {
×
492
                    break;
×
493
                }
×
494

495
                match next {
×
496
                    Some((next_id, next_incoming)) if !used_nodes.contains(&next_id) => {
×
497
                        if do_flip {
×
498
                            path.push_front((next_id, next_incoming.flip()));
×
499
                        } else {
×
500
                            path.push_back((next_id, next_incoming));
×
501
                        }
×
502

503
                        used_nodes.insert(next_id);
×
504
                        current = (next_id, next_incoming);
×
505
                    }
506
                    _ => break,
×
507
                }
508
            }
509
        }
510

511
        Vec::from_iter(path)
×
512
    }
×
513

514

515
    /// Find the highest-scoring, unambiguous path in the graph. Each node get a score
516
    /// given by `score`. Any node where `solid_path(node) == True` are valid paths -
517
    /// paths will be terminated if there are multiple valid paths emanating from a node.
518
    /// Returns vec with path for each component
519
    pub fn max_path_comp<F, F2>(&self, score: F, solid_path: F2) -> Vec<VecDeque<(usize, Dir)>>
1✔
520
    where
1✔
521
        F: Fn(&D) -> f32,
1✔
522
        F2: Fn(&D) -> bool,
1✔
523
    {
1✔
524
        if self.is_empty() {
1✔
525
            let vec: Vec<VecDeque<(usize, Dir)>> = Vec::new();
×
526
            return vec;
×
527
        }
1✔
528

1✔
529
        let components = self.iter_components();
1✔
530
        let mut paths: Vec<VecDeque<(usize, Dir)>> = Vec::new();
1✔
531

532
        for component in components {
3✔
533

534
            let current_comp = &component;
2✔
535
            
2✔
536

2✔
537
            let mut best_node = current_comp[0];
2✔
538
            let mut best_score = f32::MIN;
2✔
539
            for c in current_comp.iter() {
6✔
540
                let node = self.get_node(*c);
6✔
541
                let node_score = score(node.data());
6✔
542

6✔
543
                if node_score > best_score {
6✔
544
                    best_node = *c;
2✔
545
                    best_score = node_score;
2✔
546
                }
4✔
547
            }
548

549
            let oscore = |state| match state {
4✔
550
                None => 0.0,
2✔
551
                Some((id, _)) => score(self.get_node(id).data()),
2✔
552
            };
4✔
553

554
            let osolid_path = |state| match state {
2✔
555
                None => false,
×
556
                Some((id, _)) => solid_path(self.get_node(id).data()),
2✔
557
            };
2✔
558

559
            // Now expand in each direction, greedily taking the best path. Stop if we hit a node we've
560
            // already put into the path
561
            let mut used_nodes = HashSet::new();
2✔
562
            let mut path = VecDeque::new();
2✔
563

2✔
564
            // Start w/ initial state
2✔
565
            used_nodes.insert(best_node);
2✔
566
            path.push_front((best_node, Dir::Left));
2✔
567

568
            for init in [(best_node, Dir::Left, false), (best_node, Dir::Right, true)].iter() {
4✔
569
                let &(start_node, dir, do_flip) = init;
4✔
570
                let mut current = (start_node, dir);
4✔
571

572
                loop {
573
                    let mut next = None;
6✔
574
                    let (cur_id, incoming_dir) = current;
6✔
575
                    let node = self.get_node(cur_id);
6✔
576
                    let edges = node.edges(incoming_dir.flip());
6✔
577

6✔
578
                    let mut solid_paths = 0;
6✔
579
                    for (_, id, dir, _) in edges {
8✔
580
                        let cand = Some((id, dir));
2✔
581
                        if osolid_path(cand) {
2✔
582
                            solid_paths += 1;
2✔
583
                        }
2✔
584

585
                        if oscore(cand) > oscore(next) {
2✔
586
                            next = cand;
2✔
587
                        }
2✔
588
                    }
589

590
                    if solid_paths > 1 {
6✔
591
                        break;
×
592
                    }
6✔
593

594
                    match next {
2✔
595
                        Some((next_id, next_incoming)) if !used_nodes.contains(&next_id) => {
2✔
596
                            if do_flip {
2✔
597
                                path.push_front((next_id, next_incoming.flip()));
1✔
598
                            } else {
1✔
599
                                path.push_back((next_id, next_incoming));
1✔
600
                            }
1✔
601

602
                            used_nodes.insert(next_id);
2✔
603
                            current = (next_id, next_incoming);
2✔
604
                        }
605
                        _ => break,
4✔
606
                    }
607
                }
608
            }
609
            
610
            paths.push(path);
2✔
611
            //paths.push(Vec::from_iter(path));
612
        }
613

614
        paths
1✔
615
    
616
    }
1✔
617

618
    pub fn iter_max_path_comp<F, F2>(&self, score: F, solid_path: F2) -> PathCompIter<K, D, F, F2> 
2✔
619
    where 
2✔
620
    F: Fn(&D) -> f32,
2✔
621
    F2: Fn(&D) -> bool
2✔
622
    {
2✔
623
        let component_iterator = self.iter_components();
2✔
624
        PathCompIter { graph: self, component_iterator, graph_pos: 0, score, solid_path }
2✔
625
    }
2✔
626

627
    /// write the paths from `iter_max_path_comp` to a fasta file
628
    pub fn path_to_fasta<F, F2>(&self, f: &mut dyn std::io::Write, path_iter: PathCompIter<K, D, F, F2>, return_lens: bool) -> (Vec<usize>, Vec<usize>)
1✔
629
    where 
1✔
630
    F: Fn(&D) -> f32,
1✔
631
    F2: Fn(&D) -> bool
1✔
632
    {
1✔
633
        // width of fasta lines
1✔
634
        let columns = 80;
1✔
635

1✔
636
        // sizes of components and of paths
1✔
637
        let mut comp_sizes = Vec::new();
1✔
638
        let mut path_lens = Vec::new();
1✔
639

640
        for (seq_counter, (component, path)) in path_iter.enumerate() {
2✔
641
            // get dna sequence from path
642
            let seq = self.sequence_of_path(path.iter());
2✔
643

2✔
644
            //write header with length & start node
2✔
645
            writeln!(f, ">path{} len={} start_node={}", seq_counter, seq.len(), path[0].0).unwrap();
2✔
646

2✔
647
            // calculate how sequence has to be split up
2✔
648
            let slices = (seq.len() / columns) + 1;
2✔
649
            let mut ranges = Vec::with_capacity(slices);
2✔
650

2✔
651
            let mut start = 0;
2✔
652
            while start < seq.len() {
5✔
653
                ranges.push(start..start + columns);
3✔
654
                start += columns;
3✔
655
            }
3✔
656

657
            let last_start = ranges.pop().expect("no kmers in parallel ranges").start;
2✔
658
            ranges.push(last_start..seq.len());
2✔
659

660
            // split up sequence and write to file accordingly
661
            for range in ranges {
5✔
662
                writeln!(f, "{:?}", seq.slice(range.start, range.end)).unwrap();
3✔
663
            }
3✔
664

665
            if return_lens {
2✔
666
                comp_sizes.push(component.len());
×
667
                path_lens.push(path.len());
×
668
            }
2✔
669
        }    
670

671
        (comp_sizes, path_lens)
1✔
672
        
1✔
673
    }
1✔
674

675

676
    /// Get the sequence of a path through the graph. The path is given as a sequence of node_id integers
677
    pub fn sequence_of_path<'a, I: 'a + Iterator<Item = &'a (usize, Dir)>>(
14,980✔
678
        &self,
14,980✔
679
        path: I,
14,980✔
680
    ) -> DnaString {
14,980✔
681
        let mut seq = DnaString::new();
14,980✔
682

683
        for (idx, &(node_id, dir)) in path.enumerate() {
673,237✔
684
            let start = if idx == 0 { 0 } else { K::k() - 1 };
673,237✔
685

686
            let node_seq = match dir {
673,237✔
687
                Dir::Left => self.get_node(node_id).sequence(),
346,710✔
688
                Dir::Right => self.get_node(node_id).sequence().rc(),
326,527✔
689
            };
690

691
            for p in start..node_seq.len() {
1,151,681✔
692
                seq.push(node_seq.get(p))
1,151,681✔
693
            }
694
        }
695

696
        seq
14,980✔
697
    }
14,980✔
698

699
    /// write a node to a dot file
700
    /// 
701
    /// ### Arguments: 
702
    /// 
703
    /// * `node`: [`Node<K, D>`] which will be written to a dot file
704
    /// * `node_label`: closure taking [`Node<K, D>`] and returning a string containing commands for dot nodes 
705
    /// * `edge_label`: closure taking [`Node<K, D>`], the base as a [`u8`], the incoming [`Dir`] of the edge 
706
    ///    and if the neighbor is flipped - returns a string containing commands for dot edges, 
707
    /// * `f`: writer
708
    fn node_to_dot<FN: Fn(&Node<K, D>) -> String, FE: Fn(&Node<K, D>, u8, Dir, bool) -> String>(
65,320✔
709
        &self,
65,320✔
710
        node: &Node<'_, K, D>,
65,320✔
711
        node_label: &FN,
65,320✔
712
        edge_label: &FE,
65,320✔
713
        f: &mut dyn Write,
65,320✔
714
    ) {
65,320✔
715
        writeln!(f, "n{} {}", node.node_id, node_label(node)).unwrap();
65,320✔
716
        assert_eq!(node.exts().val.count_ones() as usize, node.l_edges().len() + node.r_edges().len());
65,320✔
717

718
        for (base, id, incoming_dir, flipped) in node.l_edges() {
65,320✔
719
            writeln!(f, "n{} -> n{} {}", id, node.node_id, edge_label(node, base, incoming_dir, flipped)).unwrap();
65,062✔
720
        }
65,062✔
721

722
        for (base, id, incoming_dir, flipped) in node.r_edges() {
65,320✔
723
            writeln!(f, "n{} -> n{} {}", node.node_id, id, edge_label(node, base, incoming_dir, flipped)).unwrap();
64,778✔
724
        }
64,778✔
725
    }
65,320✔
726

727
    /// Write the graph to a dot file
728
    /// 
729
    /// ### Arguments: 
730
    /// 
731
    /// * `path`: path to the output file
732
    /// * `node_label`: closure taking [`Node<K, D>`] and returning a string containing commands for dot nodes 
733
    /// * `edge_label`: closure taking [`Node<K, D>`], the base as a [`u8`], the incoming [`Dir`] of the edge 
734
    ///    and if the neighbor is flipped - returns a string containing commands for dot edges, 
735
    pub fn to_dot<P, FN, FE>(&self, path: P, node_label: &FN, edge_label: &FE) 
1✔
736
    where 
1✔
737
    P: AsRef<Path>,
1✔
738
    FN: Fn(&Node<K, D>) -> String,
1✔
739
    FE: Fn(&Node<K, D>, u8, Dir, bool) -> String,
1✔
740
    {
1✔
741
        let mut f = BufWriter::with_capacity(BUF, File::create(path).expect("error creating dot file"));
1✔
742

1✔
743
        let pb = ProgressBar::new(self.len() as u64);
1✔
744
        pb.set_style(ProgressStyle::with_template(PROGRESS_STYLE).unwrap().progress_chars("#/-"));
1✔
745
        pb.set_message(format!("{:<32}", "writing graph to DOT file"));
1✔
746

1✔
747
        writeln!(&mut f, "digraph {{\nrankdir=\"LR\"\nmodel=subset\noverlap=scalexy").unwrap();
1✔
748
        for i in (0..self.len()).progress_with(pb) {
32,658✔
749
            self.node_to_dot(&self.get_node(i), node_label, edge_label, &mut f);
32,658✔
750
        }
32,658✔
751
        writeln!(&mut f, "}}").unwrap();
1✔
752
        
1✔
753
        f.flush().unwrap();
1✔
754
        debug!("large to dot loop: {}", self.len());
1✔
755
    }
1✔
756

757
    /// Write the graph to a dot file in parallel
758
    /// Will write in to n_threads files simultaniously,
759
    /// then go though the files and add the contents to a larger file, 
760
    /// and delete the small files.
761
    /// 
762
    /// ### Arguments: 
763
    /// 
764
    /// * `path`: path to the output file
765
    /// * `node_label`: closure taking [`Node<K, D>`] and returning a string containing commands for dot nodes 
766
    /// * `edge_label`: closure taking [`Node<K, D>`], the base as a [`u8`], the incoming [`Dir`] of the edge 
767
    ///    and if the neighbor is flipped - returns a string containing commands for dot edges, 
768
    pub fn to_dot_parallel<P, FN, FE>(&self, path: P, node_label: &FN, edge_label: &FE) 
1✔
769
    where 
1✔
770
        D: Sync,
1✔
771
        K: Sync,
1✔
772
        P: AsRef<Path> + Display + Sync,
1✔
773
        FN: Fn(&Node<K, D>) -> String + Sync,
1✔
774
        FE: Fn(&Node<K, D>, u8, Dir, bool) -> String + Sync,
1✔
775
    {        
1✔
776
        let slices = current_num_threads();
1✔
777
        let n_nodes = self.len();
1✔
778
        let sz = n_nodes / slices + 1;
1✔
779

1✔
780
        debug!("n_nodes: {}", n_nodes);
1✔
781
        debug!("sz: {}", sz);
1✔
782

783
        let mut parallel_ranges = Vec::with_capacity(slices);
1✔
784
        let mut start = 0;
1✔
785
        while start < n_nodes {
5✔
786
            parallel_ranges.push(start..start + sz);
4✔
787
            start += sz;
4✔
788
        }
4✔
789

790
        let last_start = parallel_ranges.pop().expect("no kmers in parallel ranges").start;
1✔
791
        parallel_ranges.push(last_start..n_nodes);
1✔
792
        debug!("parallel ranges: {:?}", parallel_ranges);
1✔
793

794
        let mut files = Vec::with_capacity(current_num_threads());
1✔
795

796
        for i in 0..parallel_ranges.len() {
4✔
797
            files.push(format!("{}-{}.dot", path, i));
4✔
798
        } 
4✔
799

800
        let pb = ProgressBar::new(self.len() as u64);
1✔
801
        pb.set_style(ProgressStyle::with_template(PROGRESS_STYLE).unwrap().progress_chars("#/-"));
1✔
802
        pb.set_message(format!("{:<32}", "writing graph to DOT files"));
1✔
803
    
1✔
804
        parallel_ranges.into_par_iter().enumerate().for_each(|(i, range)| {
4✔
805
            let mut f = BufWriter::with_capacity(BUF, File::create(&files[i]).expect("error creating parallel dot file"));
4✔
806

807
            for i in range {
32,662✔
808
                self.node_to_dot(&self.get_node(i), node_label, edge_label, &mut f);
32,658✔
809
                pb.inc(1);
32,658✔
810
            }
32,658✔
811

812
            f.flush().unwrap();
4✔
813
        });
4✔
814
        pb.finish_and_clear();
1✔
815

1✔
816
        let mut out_file = BufWriter::with_capacity(BUF, File::create(path).expect("error creating combined dot file"));
1✔
817

1✔
818
        writeln!(&mut out_file, "digraph {{\nrankdir=\"LR\"\nmodel=subset\noverlap=scalexy").unwrap();
1✔
819

1✔
820
        let pb = ProgressBar::new(files.len() as u64);
1✔
821
        pb.set_style(ProgressStyle::with_template(PROGRESS_STYLE).unwrap().progress_chars("#/-"));
1✔
822
        pb.set_message(format!("{:<32}", "combining files"));
1✔
823

824
        for file in files.iter().progress_with(pb) {
4✔
825
            let open_file = File::open(file).expect("error opening parallel dot file");
4✔
826
            let mut reader = BufReader::new(open_file);
4✔
827
            let mut buffer = [0; BUF];
4✔
828

829
            loop {
830
                let linecount = reader.read(&mut buffer).unwrap();
228✔
831
                if linecount == 0 { break }
228✔
832
                out_file.write_all(&buffer[..linecount]).unwrap();
224✔
833
            }
834

835
            remove_file(file).unwrap();
4✔
836
        }
837

838
        writeln!(&mut out_file, "}}").unwrap();
1✔
839

1✔
840
        out_file.flush().unwrap();
1✔
841

1✔
842

1✔
843
    }
1✔
844

845
    /// Write part of the graph to a dot file
846
    /// 
847
    /// ### Arguments: 
848
    /// 
849
    /// * `path`: path to the output file
850
    /// * `node_label`: closure taking [`Node<K, D>`] and returning a string containing commands for dot nodes 
851
    /// * `edge_label`: closure taking [`Node<K, D>`], the base as a [`u8`], the incoming [`Dir`] of the edge 
852
    ///    and if the neighbor is flipped - returns a string containing commands for dot edges, 
853
    /// * `nodes`: [`Vec<usize>`] listing all IDs of nodes which should be included
854
    pub fn to_dot_partial<P, FN, FE>(&self, path: P, node_label: &FN, edge_label: &FE, nodes: Vec<usize>) 
1✔
855
    where 
1✔
856
        P: AsRef<Path>,
1✔
857
        FN: Fn(&Node<K, D>) -> String,
1✔
858
        FE: Fn(&Node<K, D>, u8, Dir, bool) -> String,
1✔
859
    {
1✔
860
        let mut f = BufWriter::with_capacity(BUF, File::create(path).expect("error creating dot file"));
1✔
861

1✔
862
        let pb = ProgressBar::new(nodes.len() as u64);
1✔
863
        pb.set_style(ProgressStyle::with_template(PROGRESS_STYLE).unwrap().progress_chars("#/-"));
1✔
864
        pb.set_message(format!("{:<32}", "writing graph to DOT file"));
1✔
865

1✔
866
        writeln!(&mut f, "# {:?}", nodes).unwrap();
1✔
867
        writeln!(&mut f, "digraph {{\nrankdir=\"LR\"\nmodel=subset\noverlap=scalexy").unwrap();
1✔
868
        for i in nodes.into_iter().progress_with(pb) {
4✔
869
            self.node_to_dot(&self.get_node(i), node_label, edge_label, &mut f);
4✔
870
        }
4✔
871
        writeln!(&mut f, "}}").unwrap();
1✔
872

1✔
873
        f.flush().unwrap();
1✔
874

1✔
875
        debug!("large to dot loop: {}", self.len());
1✔
876
    }
1✔
877

878
    fn node_to_gfa<F: Fn(&Node<'_, K, D>) -> String>(
12✔
879
        &self,
12✔
880
        node: &Node<'_, K, D>,
12✔
881
        w: &mut dyn Write,
12✔
882
        tag_func: Option<&F>,
12✔
883
    ) -> Result<(), Error> {
12✔
884
        match tag_func {
12✔
885
            Some(f) => {
12✔
886
                let tags = (f)(node);
12✔
887
                writeln!(
12✔
888
                    w,
12✔
889
                    "S\t{}\t{}\t{}",
12✔
890
                    node.node_id,
12✔
891
                    node.sequence().to_dna_string(),
12✔
892
                    tags
12✔
893
                )?;
12✔
894
            }
895
            _ => writeln!(
×
896
                w,
×
897
                "S\t{}\t{}",
×
898
                node.node_id,
×
899
                node.sequence().to_dna_string()
×
900
            )?,
×
901
        }
902

903
        for (_, target, dir, _) in node.l_edges() {
12✔
904
            if target >= node.node_id {
8✔
905
                let to_dir = match dir {
4✔
906
                    Dir::Left => "+",
4✔
UNCOV
907
                    Dir::Right => "-",
×
908
                };
909
                writeln!(
4✔
910
                    w,
4✔
911
                    "L\t{}\t-\t{}\t{}\t{}M",
4✔
912
                    node.node_id,
4✔
913
                    target,
4✔
914
                    to_dir,
4✔
915
                    K::k() - 1
4✔
916
                )?;
4✔
917
            }
4✔
918
        }
919

920
        for (_, target, dir, _) in node.r_edges() {
12✔
921
            if target > node.node_id {
8✔
922
                let to_dir = match dir {
4✔
UNCOV
923
                    Dir::Left => "+",
×
924
                    Dir::Right => "-",
4✔
925
                };
926
                writeln!(
4✔
927
                    w,
4✔
928
                    "L\t{}\t+\t{}\t{}\t{}M",
4✔
929
                    node.node_id,
4✔
930
                    target,
4✔
931
                    to_dir,
4✔
932
                    K::k() - 1
4✔
933
                )?;
4✔
934
            }
4✔
935
        }
936

937
        Ok(())
12✔
938
    }
12✔
939

940
    /// Write the graph to GFA format
941
    pub fn to_gfa<P: AsRef<Path>>(&self, gfa_out: P) -> Result<(), Error> {
×
942
        let wtr = BufWriter::with_capacity(BUF, File::create(gfa_out).expect("error creating gfa file"));
×
943
        self.write_gfa(&mut std::io::BufWriter::new(wtr))
×
944
    }
×
945

946
    pub fn write_gfa(&self, wtr: &mut impl Write) -> Result<(), Error> {
×
947
        writeln!(wtr, "H\tVN:Z:debruijn-rs")?;
×
948

949
        type DummyFn<K, D> = fn(&Node<'_, K, D>) -> String;
950
        let dummy_opt: Option<&DummyFn<K, D>> = None;
×
951

×
952
        let pb = ProgressBar::new(self.len() as u64);
×
953
        pb.set_style(ProgressStyle::with_template(PROGRESS_STYLE).unwrap().progress_chars("#/-"));
×
954
        pb.set_message(format!("{:<32}", "writing graph to GFA file"));
×
955

956
        for i in (0..self.len()).progress_with(pb) {
×
957
            let n = self.get_node(i);
×
958
            self.node_to_gfa(&n, wtr, dummy_opt)?;
×
959
        }
960

961
        wtr.flush().unwrap();
×
962

×
963
        Ok(())
×
964
    }
×
965

966
    /// Write the graph to GFA format
967
    pub fn to_gfa_with_tags<P: AsRef<Path>, F: Fn(&Node<'_, K, D>) -> String>(
1✔
968
        &self,
1✔
969
        gfa_out: P,
1✔
970
        tag_func: F,
1✔
971
    ) -> Result<(), Error> {
1✔
972
        let mut wtr = BufWriter::with_capacity(BUF, File::create(gfa_out).expect("error creatinf gfa file"));
1✔
973
        writeln!(wtr, "H\tVN:Z:debruijn-rs")?;
1✔
974

975
        let pb = ProgressBar::new(self.len() as u64);
1✔
976
        pb.set_style(ProgressStyle::with_template(PROGRESS_STYLE).unwrap().progress_chars("#/-"));
1✔
977
        pb.set_message(format!("{:<32}", "writing graph to GFA file"));
1✔
978

979
        for i in (0..self.len()).progress_with(pb) {
6✔
980
            let n = self.get_node(i);
6✔
981
            self.node_to_gfa(&n, &mut wtr, Some(&tag_func))?;
6✔
982
        }
983

984
        wtr.flush().unwrap();
1✔
985

1✔
986
        Ok(())
1✔
987
    }
1✔
988

989
    /// Write the graph to GFA format, with multithreading, 
990
    /// pass `tag_func=None` to write without tags
991
    pub fn to_gfa_otags_parallel<P: AsRef<Path> + Display + Sync, F: Fn(&Node<'_, K, D>) -> String + Sync>(
1✔
992
        &self,
1✔
993
        gfa_out: P,
1✔
994
        tag_func: Option<&F>,
1✔
995
    ) -> Result<(), Error> 
1✔
996
    where 
1✔
997
    K: Sync,
1✔
998
    D: Sync,
1✔
999
    {
1✔
1000
        // split into ranges according to thread count
1✔
1001
        let slices = current_num_threads();
1✔
1002
        let n_nodes = self.len();
1✔
1003
        let sz = n_nodes / slices + 1;
1✔
1004

1✔
1005
        debug!("n_nodes: {}", n_nodes);
1✔
1006
        debug!("sz: {}", sz);
1✔
1007

1008
        let mut parallel_ranges = Vec::with_capacity(slices);
1✔
1009
        let mut start = 0;
1✔
1010
        while start < n_nodes {
4✔
1011
            parallel_ranges.push(start..start + sz);
3✔
1012
            start += sz;
3✔
1013
        }
3✔
1014

1015
        let last_start = parallel_ranges.pop().expect("no kmers in parallel ranges").start;
1✔
1016
        parallel_ranges.push(last_start..n_nodes);
1✔
1017
        debug!("parallel ranges: {:?}", parallel_ranges);
1✔
1018

1019
        let mut files = Vec::with_capacity(current_num_threads());
1✔
1020

1021
        for i in 0..parallel_ranges.len() {
3✔
1022
            files.push(format!("{}-{}.gfa", gfa_out, i));
3✔
1023
        } 
3✔
1024

1025
        let pb = ProgressBar::new(self.len() as u64);
1✔
1026
        pb.set_style(ProgressStyle::with_template(PROGRESS_STYLE).unwrap().progress_chars("#/-"));
1✔
1027
        pb.set_message(format!("{:<32}", "writing graph to GFA file"));
1✔
1028
        
1✔
1029
        
1✔
1030
        parallel_ranges.into_par_iter().enumerate().for_each(|(i, range)| {
3✔
1031
            let mut wtr = BufWriter::with_capacity(BUF, File::create(&files[i]).expect("error creating parallel gfa file"));
3✔
1032

1033
            for i in range {
9✔
1034
                let n = self.get_node(i);
6✔
1035
                self.node_to_gfa(&n, &mut wtr, tag_func).unwrap();
6✔
1036
                pb.inc(1);
6✔
1037
            }
6✔
1038

1039
            wtr.flush().unwrap();
3✔
1040
        });
3✔
1041

1✔
1042
        pb.finish_and_clear();
1✔
1043

1✔
1044
        // combine files
1✔
1045
        let mut out_file = BufWriter::with_capacity(BUF, File::create(format!("{}.gfa", gfa_out)).expect("error creating combined gfa file"));
1✔
1046
        writeln!(out_file, "H\tVN:Z:debruijn-rs")?;
1✔
1047

1048
        let pb = ProgressBar::new(files.len() as u64);
1✔
1049
        pb.set_style(ProgressStyle::with_template(PROGRESS_STYLE).unwrap().progress_chars("#/-"));
1✔
1050
        pb.set_message(format!("{:<32}", "combining files"));
1✔
1051

1052
        for file in files.iter() {
3✔
1053
            let open_file = File::open(file).expect("error opening parallel gfa file");
3✔
1054
            let mut reader = BufReader::new(open_file);
3✔
1055
            let mut buffer = [0; BUF];
3✔
1056

1057
            loop {
1058
                let linecount = reader.read(&mut buffer).unwrap();
6✔
1059
                if linecount == 0 { break }
6✔
1060
                out_file.write_all(&buffer[..linecount]).unwrap();
3✔
1061
            }
1062

1063
            remove_file(file).unwrap();
3✔
1064
        }
1065

1066
        out_file.flush().unwrap();
1✔
1067

1✔
1068
        Ok(())
1✔
1069
    }
1✔
1070

1071
    /// Write the graph to GFA format
1072
    pub fn to_gfa_partial<P: AsRef<Path>, F: Fn(&Node<'_, K, D>) -> String>(&self, gfa_out: P, tag_func: Option<&F>, nodes: Vec<usize>) -> Result<(), Error> {
×
1073
        let mut wtr = BufWriter::with_capacity(BUF, File::create(gfa_out).expect("error creating gfa file"));
×
1074
        writeln!(wtr, "H\tVN:Z:debruijn-rs")?;
×
1075

1076
        let pb = ProgressBar::new(self.len() as u64);
×
1077
        pb.set_style(ProgressStyle::with_template(PROGRESS_STYLE).unwrap().progress_chars("#/-"));
×
1078
        pb.set_message(format!("{:<32}", "writing graph to GFA file"));
×
1079

1080
        for i in nodes.into_iter().progress_with(pb) {
×
1081
            let n = self.get_node(i);
×
1082
            self.node_to_gfa(&n, &mut wtr, tag_func)?;
×
1083
        }
1084

1085
        wtr.flush().unwrap();
×
1086

×
1087
        Ok(())    
×
1088
    }
×
1089

1090
    pub fn to_json_rest<W: Write, F: Fn(&D) -> Value>(
×
1091
        &self,
×
1092
        fmt_func: F,
×
1093
        mut writer: &mut W,
×
1094
        rest: Option<Value>,
×
1095
    ) {
×
1096
        writeln!(writer, "{{\n\"nodes\": [").unwrap();
×
1097
        for i in 0..self.len() {
×
1098
            let node = self.get_node(i);
×
1099
            node.to_json(&fmt_func, writer);
×
1100
            if i == self.len() - 1 {
×
1101
                writeln!(writer).unwrap();
×
1102
            } else {
×
1103
                writeln!(writer, ",").unwrap();
×
1104
            }
×
1105
        }
1106
        writeln!(writer, "],").unwrap();
×
1107

×
1108
        writeln!(writer, "\"links\": [").unwrap();
×
1109
        for i in 0..self.len() {
×
1110
            let node = self.get_node(i);
×
1111
            match node.edges_to_json(writer) {
×
1112
                true => {
1113
                    if i == self.len() - 1 {
×
1114
                        writeln!(writer).unwrap();
×
1115
                    } else {
×
1116
                        writeln!(writer, ",").unwrap();
×
1117
                    }
×
1118
                }
1119
                _ => continue,
×
1120
            }
1121
        }
1122
        writeln!(writer, "]").unwrap();
×
1123

1124
        match rest {
×
1125
            Some(Value::Object(v)) => {
×
1126
                for (k, v) in v.iter() {
×
1127
                    writeln!(writer, ",").expect("io error");
×
1128
                    write!(writer, "\"{}\": ", k).expect("io error");
×
1129
                    serde_json::to_writer(&mut writer, v).expect("io error");
×
1130
                    writeln!(writer).expect("io error");
×
1131
                }
×
1132
            }
1133
            _ => {
×
1134
                writeln!(writer).expect("io error");
×
1135
            }
×
1136
        }
1137

1138
        writeln!(writer, "}}").expect("io error");
×
1139
    }
×
1140

1141
    /// Write the graph to JSON
1142
    pub fn to_json<W: Write, F: Fn(&D) -> Value, RF: Fn(&mut W)>(
×
1143
        &self,
×
1144
        fmt_func: F,
×
1145
        writer: &mut W,
×
1146
    ) {
×
1147
        self.to_json_rest(fmt_func, writer, None);
×
1148
    }
×
1149

1150
    /// Print a text representation of the graph.
1151
    pub fn print(&self) {
2✔
1152
        println!("DebruijnGraph {{ len: {}, K: {} }} :", self.len(), K::k());
2✔
1153
        for node in self.iter_nodes() {
8✔
1154
            println!("{:?}", node);
8✔
1155
        }
8✔
1156
    }
2✔
1157

1158
    pub fn print_with_data(&self) {
×
1159
        println!("DebruijnGraph {{ len: {}, K: {} }} :", self.len(), K::k());
×
1160
        for node in self.iter_nodes() {
×
1161
            println!("{:?} ({:?})", node, node.data());
×
1162
        }
×
1163
    }
×
1164

1165
    pub fn max_path_beam<F, F2>(&self, beam: usize, score: F, _solid_path: F2) -> Vec<(usize, Dir)>
×
1166
    where
×
1167
        F: Fn(&D) -> f32,
×
1168
        F2: Fn(&D) -> bool,
×
1169
    {
×
1170
        if self.is_empty() {
×
1171
            return Vec::default();
×
1172
        }
×
1173

×
1174
        let mut states = Vec::new();
×
1175

1176
        for i in 0..self.len() {
×
1177
            let node = self.get_node(i);
×
1178

×
1179
            // Initialize beam search on terminal nodes
×
1180
            if node.exts().num_exts_l() == 0 || node.exts().num_exts_r() == 0 {
×
1181
                let dir = if node.exts().num_exts_l() > 0 {
×
1182
                    Dir::Right
×
1183
                } else {
1184
                    Dir::Left
×
1185
                };
1186

1187
                let status = if node.exts().num_exts_l() == 0 && node.exts().num_exts_r() == 0 {
×
1188
                    Status::End
×
1189
                } else {
1190
                    Status::Active
×
1191
                };
1192

1193
                let mut path = SmallVec8::new();
×
1194
                path.push((i as u32, dir));
×
1195

×
1196
                let s = State {
×
1197
                    path,
×
1198
                    status,
×
1199
                    score: score(node.data()),
×
1200
                };
×
1201
                states.push(s);
×
1202
            }
×
1203
        }
1204

1205
        // No end nodes -- just start on the first node
1206
        if states.is_empty() {
×
1207
            // Make a start
×
1208
            let node = self.get_node(0);
×
1209
            let mut path = SmallVec8::new();
×
1210
            path.push((0, Dir::Left));
×
1211
            states.push(State {
×
1212
                path,
×
1213
                status: Status::Active,
×
1214
                score: score(node.data()),
×
1215
            });
×
1216
        }
×
1217

1218
        // Beam search until we can't find any more expansions
1219
        let mut active = true;
×
1220
        while active {
×
1221
            let mut new_states = Vec::with_capacity(states.len());
×
1222
            active = false;
×
1223

1224
            for s in states {
×
1225
                if s.status == Status::Active {
×
1226
                    active = true;
×
1227
                    let expanded = self.expand_state(&s, &score);
×
1228
                    new_states.extend(expanded);
×
1229
                } else {
×
1230
                    new_states.push(s)
×
1231
                }
1232
            }
1233

1234
            // workaround to sort by descending score - will panic if there are NaN scores
1235
            new_states.sort_by(|a, b| (-(a.score)).partial_cmp(&-(b.score)).unwrap());
×
1236
            new_states.truncate(beam);
×
1237
            states = new_states;
×
1238
        }
×
1239

1240
        for (i, state) in states.iter().take(5).enumerate() {
×
1241
            trace!("i:{}  -- {:?}", i, state);
×
1242
        }
1243

1244
        // convert back to using usize for node_id
1245
        states[0]
×
1246
            .path
×
1247
            .iter()
×
1248
            .map(|&(node, dir)| (node as usize, dir))
×
1249
            .collect()
×
1250
    }
×
1251

1252
    fn expand_state<F>(&self, state: &State, score: &F) -> SmallVec4<State>
×
1253
    where
×
1254
        F: Fn(&D) -> f32,
×
1255
    {
×
1256
        if state.status != Status::Active {
×
1257
            panic!("only attempt to expand active states")
×
1258
        }
×
1259

×
1260
        let (node_id, dir) = state.path[state.path.len() - 1];
×
1261
        let node = self.get_node(node_id as usize);
×
1262
        let mut new_states = SmallVec4::new();
×
1263

1264
        for (_, next_node_id, incoming_dir, _) in node.edges(dir.flip()) {
×
1265
            let next_node = self.get_node(next_node_id);
×
1266
            let new_score = state.score + score(next_node.data());
×
1267

×
1268
            let cycle = state
×
1269
                .path
×
1270
                .iter()
×
1271
                .any(|&(prev_node, _)| prev_node == (next_node_id as u32));
×
1272

1273
            let status = if cycle {
×
1274
                Status::Cycle
×
1275
            } else if next_node.edges(incoming_dir.flip()).is_empty() {
×
1276
                Status::End
×
1277
            } else {
1278
                Status::Active
×
1279
            };
1280

1281
            let mut new_path = state.path.clone();
×
1282
            new_path.push((next_node_id as u32, incoming_dir));
×
1283

×
1284
            let next_state = State {
×
1285
                path: new_path,
×
1286
                score: new_score,
×
1287
                status,
×
1288
            };
×
1289

×
1290
            new_states.push(next_state);
×
1291
        }
1292

1293
        new_states
×
1294
    }
×
1295

1296

1297
    pub fn iter_components(&self) -> IterComponents<K, D> {
4✔
1298
        let mut visited: Vec<bool> = Vec::with_capacity(self.len());
4✔
1299
        let pos = 0;
4✔
1300

1301
        for _i in 0..self.len() {
24✔
1302
            visited.push(false);
24✔
1303
        }
24✔
1304

1305
        IterComponents { 
4✔
1306
            graph: self, 
4✔
1307
            visited, 
4✔
1308
            pos }
4✔
1309
    }
4✔
1310

1311

1312
    /// iteratively returns 2D Vec with node_ids grouped according to the connected components they form
1313
    pub fn components_i(&self) -> Vec<Vec<usize>> {
1✔
1314
        let mut components: Vec<Vec<usize>> = Vec::with_capacity(self.len());
1✔
1315
        let mut visited: Vec<bool> = Vec::with_capacity(self.len());
1✔
1316

1317
        for _i in 0..self.len() {
6✔
1318
            visited.push(false);
6✔
1319
        }
6✔
1320

1321
        for i in 0..self.len() {
6✔
1322
            if !visited[i] {
6✔
1323
                let comp = self.component_i(&mut visited, i);
2✔
1324
                components.push(comp);
2✔
1325
            }
4✔
1326
        }
1327

1328
        components
1✔
1329
    }
1✔
1330

1331
    /// recursively detects which nodes form separate graph components
1332
    /// returns 2D vector with node ids per component
1333
    /// (may lead to stack overflow)
1334
    pub fn components_r(&self) -> Vec<Vec<usize>> {
2✔
1335
        let mut components: Vec<Vec<usize>> = Vec::with_capacity(self.len());
2✔
1336
        let mut visited: Vec<bool> = Vec::with_capacity(self.len());
2✔
1337

1338
        for _i in 0..self.len() {
8✔
1339
            visited.push(false);
8✔
1340
        }
8✔
1341

1342
        for i in 0..self.len() {
8✔
1343
            if !visited[i] {
8✔
1344
                let mut comp: Vec<usize> = Vec::new();
4✔
1345
                self.component_r(&mut visited, i, &mut comp);
4✔
1346
                components.push(comp);
4✔
1347
            }
4✔
1348
        }
1349

1350
        components
2✔
1351

2✔
1352
    }
2✔
1353

1354
    fn component_r<'a>(&'a self, visited: &'a mut Vec<bool>, i: usize, comp: &'a mut Vec<usize>) {
8✔
1355
        
8✔
1356
        visited[i] = true;
8✔
1357
        comp.push(i);
8✔
1358
        let mut edges = self.find_edges(i, Dir::Left);
8✔
1359
        let mut r_edges = self.find_edges(i, Dir::Right);
8✔
1360

8✔
1361
        edges.append(&mut r_edges);
8✔
1362

1363
        for (_, edge, _, _) in edges.iter() {
8✔
1364
            if !visited[*edge] {
8✔
1365
                self.component_r(visited, *edge, comp);
4✔
1366
            }
4✔
1367
        }
1368
    }
8✔
1369

1370
    fn component_i<'a>(&'a self, visited: &'a mut [bool], i: usize) -> Vec<usize> {
10✔
1371
        let mut edges: Vec<usize> = Vec::new();
10✔
1372
        let mut comp: Vec<usize> = Vec::new();
10✔
1373

10✔
1374
        edges.push(i);
10✔
1375

1376
        while let Some(current_edge) = edges.pop() {
40✔
1377
            if !visited[current_edge] { 
30✔
1378
                comp.push(current_edge);
30✔
1379
                visited[current_edge] = true;
30✔
1380

30✔
1381
                let mut l_edges = self.find_edges(current_edge, Dir::Left);
30✔
1382
                let mut r_edges = self.find_edges(current_edge, Dir::Right);
30✔
1383

30✔
1384
                l_edges.append(&mut r_edges);
30✔
1385

1386
                for (_, new_edge, _, _) in l_edges.into_iter() {
40✔
1387
                    if !visited[new_edge] {
40✔
1388
                        edges.push(new_edge);
20✔
1389
                    }
20✔
1390
                }
1391
            }
×
1392
        }
1393
        comp
10✔
1394
    }
10✔
1395

1396
    pub fn find_bad_nodes<F: Fn(&Node<'_, K, D>) -> bool>(&self, valid: F) -> Vec<usize> {
×
1397
        let mut bad_nodes = Vec::new();
×
1398

1399
        for (i, node) in enumerate(self.iter_nodes()) {
×
1400
            if !valid(&node) { bad_nodes.push(i); }
×
1401
        }
1402
        
1403
        bad_nodes
×
1404
    }
×
1405
}
1406

1407
impl<K: Kmer, SD: SummaryData<u8> + Debug> DebruijnGraph<K, SD> {
1408
    pub fn create_colors(&self, config: &SummaryConfig) -> Colors<SD> {
×
1409
        Colors::new(self, config)
×
1410
    }
×
1411
    
1412
    /// edge mults will contain hanging edges if the nodes were filtered
1413
    pub fn fix_edge_mults(&mut self) {
1✔
1414
        if self.get_node(0).data().edge_mults().is_some() {
1✔
1415
            for i in 0..self.len() {
×
1416
                self.base.data[i].fix_edge_mults(self.base.exts[i]);
×
1417
            }
×
1418
        }
1✔
1419
    }
1✔
1420
}
1421

1422
#[derive(Debug, Eq, PartialEq)]
1423
enum Status {
1424
    Active,
1425
    End,
1426
    Cycle,
1427
}
1428

1429
#[derive(Debug)]
1430
struct State {
1431
    path: SmallVec8<(u32, Dir)>,
1432
    score: f32,
1433
    status: Status,
1434
}
1435

1436
impl State {}
1437

1438
/// Iterator over nodes in a `DeBruijnGraph`
1439
pub struct NodeIter<'a, K: Kmer + 'a, D: Debug + 'a> {
1440
    graph: &'a DebruijnGraph<K, D>,
1441
    node_id: usize,
1442
}
1443

1444
impl<'a, K: Kmer + 'a, D: Debug + 'a> Iterator for NodeIter<'a, K, D> {
1445
    type Item = Node<'a, K, D>;
1446

1447
    fn next(&mut self) -> Option<Node<'a, K, D>> {
536,989✔
1448
        if self.node_id < self.graph.len() {
536,989✔
1449
            let node = self.graph.get_node(self.node_id);
536,860✔
1450
            self.node_id += 1;
536,860✔
1451
            Some(node)
536,860✔
1452
        } else {
1453
            None
129✔
1454
        }
1455
    }
536,989✔
1456
}
1457

1458
impl<'a, K: Kmer + 'a, D: Debug + 'a> IntoIterator for &'a DebruijnGraph<K, D> {
1459
    type Item = NodeKmer<'a, K, D>;
1460
    type IntoIter = NodeIntoIter<'a, K, D>;
1461

1462
    fn into_iter(self) -> Self::IntoIter {
2,232✔
1463
        NodeIntoIter {
2,232✔
1464
            graph: self,
2,232✔
1465
            node_id: 0,
2,232✔
1466
        }
2,232✔
1467
    }
2,232✔
1468
}
1469

1470
/// Iterator over nodes in a `DeBruijnGraph`
1471
pub struct NodeIntoIter<'a, K: Kmer + 'a, D: Debug + 'a> {
1472
    graph: &'a DebruijnGraph<K, D>,
1473
    node_id: usize,
1474
}
1475

1476
impl<'a, K: Kmer + 'a, D: Debug + 'a> Iterator for NodeIntoIter<'a, K, D> {
1477
    type Item = NodeKmer<'a, K, D>;
1478

1479
    fn next(&mut self) -> Option<Self::Item> {
198,112✔
1480
        if self.node_id < self.graph.len() {
198,112✔
1481
            let node_id = self.node_id;
198,112✔
1482
            let node = self.graph.get_node(node_id);
198,112✔
1483
            let node_seq = node.sequence();
198,112✔
1484

198,112✔
1485
            self.node_id += 1;
198,112✔
1486
            Some(NodeKmer {
198,112✔
1487
                node_id,
198,112✔
1488
                node_seq_slice: node_seq,
198,112✔
1489
                phantom_d: PhantomData,
198,112✔
1490
                phantom_k: PhantomData,
198,112✔
1491
            })
198,112✔
1492
        } else {
1493
            None
×
1494
        }
1495
    }
198,112✔
1496
}
1497

1498
/// A `DebruijnGraph` node with a reference to the sequence of the node.
1499
#[derive(Clone)]
1500
pub struct NodeKmer<'a, K: Kmer + 'a, D: Debug + 'a> {
1501
    pub node_id: usize,
1502
    node_seq_slice: DnaStringSlice<'a>,
1503
    phantom_k: PhantomData<K>,
1504
    phantom_d: PhantomData<D>,
1505
}
1506

1507
/// An iterator over the kmers in a `DeBruijn graph node`
1508
pub struct NodeKmerIter<'a, K: Kmer + 'a, D: Debug + 'a> {
1509
    kmer_id: usize,
1510
    kmer: K,
1511
    num_kmers: usize,
1512
    node_seq_slice: DnaStringSlice<'a>,
1513
    phantom_k: PhantomData<K>,
1514
    phantom_d: PhantomData<D>,
1515
}
1516

1517
impl<'a, K: Kmer + 'a, D: Debug + 'a> IntoIterator for NodeKmer<'a, K, D> {
1518
    type Item = K;
1519
    type IntoIter = NodeKmerIter<'a, K, D>;
1520

1521
    fn into_iter(self) -> Self::IntoIter {
396,224✔
1522
        let num_kmers = self.node_seq_slice.len() - K::k() + 1;
396,224✔
1523

1524
        let kmer = if num_kmers > 0 {
396,224✔
1525
            self.node_seq_slice.get_kmer::<K>(0)
396,224✔
1526
        } else {
1527
            K::empty()
×
1528
        };
1529

1530
        NodeKmerIter {
396,224✔
1531
            kmer_id: 0,
396,224✔
1532
            kmer,
396,224✔
1533
            num_kmers,
396,224✔
1534
            node_seq_slice: self.node_seq_slice,
396,224✔
1535
            phantom_k: PhantomData,
396,224✔
1536
            phantom_d: PhantomData,
396,224✔
1537
        }
396,224✔
1538
    }
396,224✔
1539
}
1540

1541
impl<'a, K: Kmer + 'a, D: Debug + 'a> Iterator for NodeKmerIter<'a, K, D> {
1542
    type Item = K;
1543

1544
    fn next(&mut self) -> Option<Self::Item> {
3,320,770✔
1545
        if self.num_kmers == self.kmer_id {
3,320,770✔
1546
            None
×
1547
        } else {
1548
            let current_kmer = self.kmer;
3,320,770✔
1549

3,320,770✔
1550
            self.kmer_id += 1;
3,320,770✔
1551
            if self.kmer_id < self.num_kmers {
3,320,770✔
1552
                let next_base = self.node_seq_slice.get(self.kmer_id + K::k() - 1);
3,269,924✔
1553
                let new_kmer = self.kmer.extend_right(next_base);
3,269,924✔
1554
                self.kmer = new_kmer;
3,269,924✔
1555
            }
3,269,924✔
1556

1557
            Some(current_kmer)
3,320,770✔
1558
        }
1559
    }
3,320,770✔
1560

1561
    fn size_hint(&self) -> (usize, Option<usize>) {
198,112✔
1562
        (self.num_kmers, Some(self.num_kmers))
198,112✔
1563
    }
198,112✔
1564

1565
    /// Provide a 'fast-forward' capability for this iterator
1566
    /// MPHF will use this to reduce the number of kmers that
1567
    /// need to be produced.
1568
    fn nth(&mut self, n: usize) -> Option<Self::Item> {
2,395,168✔
1569
        if n <= 4 {
2,395,168✔
1570
            // for small skips forward, shift one base at a time
1571
            for _ in 0..n {
2,149,934✔
1572
                self.next();
925,602✔
1573
            }
925,602✔
1574
        } else {
245,234✔
1575
            self.kmer_id += n;
245,234✔
1576
            self.kmer = self.node_seq_slice.get_kmer::<K>(self.kmer_id);
245,234✔
1577
        }
245,234✔
1578

1579
        self.next()
2,395,168✔
1580
    }
2,395,168✔
1581
}
1582

1583
/// Marker signifying that NodeKmerIter has a known size.
1584
impl<'a, K: Kmer + 'a, D: Debug + 'a> ExactSizeIterator for NodeKmerIter<'a, K, D> {}
1585

1586
/// Unbranched sequence in the DeBruijn graph
1587
pub struct Node<'a, K: Kmer + 'a, D: 'a> {
1588
    pub node_id: usize,
1589
    pub graph: &'a DebruijnGraph<K, D>,
1590
}
1591

1592
impl<'a, K: Kmer, D: Debug> Node<'a, K, D> {
1593
    /// Length of the sequence of this node
1594
    pub fn len(&self) -> usize {
1,489,420✔
1595
        self.graph.base.sequences.get(self.node_id).len()
1,489,420✔
1596
    }
1,489,420✔
1597

1598
    pub fn is_empty(&self) -> bool {
×
1599
        self.graph.base.sequences.get(self.node_id).is_empty()
×
1600
    }
×
1601

1602
    /// Sequence of the node
1603
    pub fn sequence(&self) -> DnaStringSlice<'a> {
3,015,645✔
1604
        self.graph.base.sequences.get(self.node_id)
3,015,645✔
1605
    }
3,015,645✔
1606

1607
    /// Reference to auxiliarly data associated with the node
1608
    pub fn data(&self) -> &'a D {
2,798,486✔
1609
        &self.graph.base.data[self.node_id]
2,798,486✔
1610
    }
2,798,486✔
1611

1612
    /// Extension bases from this node
1613
    pub fn exts(&self) -> Exts {
2,186,428✔
1614
        self.graph.base.exts[self.node_id]
2,186,428✔
1615
    }
2,186,428✔
1616

1617
    /// Edges leaving the left side of the node in the format
1618
    /// (base, target_node id, incoming side of target node, whether target node is flipped)
1619
    pub fn l_edges(&self) -> SmallVec4<(u8, usize, Dir, bool)> {
130,660✔
1620
        self.graph.find_edges(self.node_id, Dir::Left)
130,660✔
1621
    }
130,660✔
1622

1623
    /// Edges leaving the right side of the node in the format
1624
    /// (base, target_node id, incoming side of target node, whether target node is flipped)
1625
    pub fn r_edges(&self) -> SmallVec4<(u8, usize, Dir, bool)> {
130,660✔
1626
        self.graph.find_edges(self.node_id, Dir::Right)
130,660✔
1627
    }
130,660✔
1628

1629
    /// Edges leaving the 'dir' side of the node in the format
1630
    /// (base, target_node id, incoming side of target node, whether target node is flipped)
1631
    pub fn edges(&self, dir: Dir) -> SmallVec4<(u8, usize, Dir, bool)> {
246,506✔
1632
        self.graph.find_edges(self.node_id, dir)
246,506✔
1633
    }
246,506✔
1634

1635
    fn to_json<F: Fn(&D) -> Value>(&self, func: &F, f: &mut dyn Write) {
×
1636
        write!(
×
1637
            f,
×
1638
            "{{\"id\":\"{}\",\"L\":{},\"D\":{},\"Se\":\"{:?}\"}}",
×
1639
            self.node_id,
×
1640
            self.sequence().len(),
×
1641
            (func)(self.data()),
×
1642
            self.sequence(),
×
1643
        )
×
1644
        .unwrap();
×
1645
    }
×
1646

1647
    fn edges_to_json(&self, f: &mut dyn Write) -> bool {
×
1648
        let mut wrote = false;
×
1649
        let edges = self.r_edges();
×
1650
        for (idx, &(_, id, incoming_dir, _)) in edges.iter().enumerate() {
×
1651
            write!(
×
1652
                f,
×
1653
                "{{\"source\":\"{}\",\"target\":\"{}\",\"D\":\"{}\"}}",
×
1654
                self.node_id,
×
1655
                id,
×
1656
                match incoming_dir {
×
1657
                    Dir::Left => "L",
×
1658
                    Dir::Right => "R",
×
1659
                }
1660
            )
1661
            .unwrap();
×
1662

×
1663
            if idx < edges.len() - 1 {
×
1664
                write!(f, ",").unwrap();
×
1665
            }
×
1666

1667
            wrote = true;
×
1668
        }
1669
        wrote
×
1670
    }
×
1671
}
1672

1673
// TODO make generic instead of u8 (u8 is sufficient for dbg)
1674
impl<K: Kmer, SD: SummaryData<u8> + Debug> Node<'_, K, SD>  {
1675
    /// get default format for dot edges based on node data
1676
    pub fn edge_dot_default(&self, colors: &Colors<SD>, base: u8, incoming_dir: Dir, flipped: bool) -> String {
129,845✔
1677
        // set color based on dir
1678
        let color = match incoming_dir {
129,845✔
1679
            Dir::Left => "blue",
65,064✔
1680
            Dir::Right => "red"
64,781✔
1681
        };
1682
        
1683
        if let Some(em) = self.data().edge_mults() {
129,845✔
1684
            
1685
            let dir = if flipped { 
129,845✔
1686
                incoming_dir 
51,540✔
1687
            } else {
1688
                incoming_dir.flip()
78,305✔
1689
            };
1690

1691
            // set penwidth based on count
1692
            let count = em.edge_mult(base, dir);
129,845✔
1693
            let penwidth = colors.edge_width(count);
129,845✔
1694

129,845✔
1695
            format!("[color={color}, penwidth={penwidth}, label=\"{}: {count}\"]", bits_to_base(base))
129,845✔
1696
        } else {
1697
            format!("[color={color}]")
×
1698
        }
1699
    }
129,845✔
1700

1701
    /// get default format for dot nodes, based on node data
1702
    pub fn node_dot_default(&self, colors: &Colors<SD>, config: &SummaryConfig, tag_translator: &bimap::BiHashMap<String, u8> , outline: bool) -> String {
65,325✔
1703
        // set color based on labels/fold change/p-value
65,325✔
1704
        let color = colors.node_color(self.data(), config, outline);
65,325✔
1705

65,325✔
1706
        let data_info = self.data().print(tag_translator, config);
65,325✔
1707
        const MIN_TEXT_WIDTH: usize = 40;
1708
        let wrap = if self.len() > MIN_TEXT_WIDTH { self.len() } else { MIN_TEXT_WIDTH };
65,325✔
1709

1710
        let label = textwrap::fill(&format!("id: {}, len: {}, exts: {:?}, seq: {}\n{}", 
65,325✔
1711
            self.node_id,
65,325✔
1712
            self.len(),
65,325✔
1713
            self.exts(),
65,325✔
1714
            self.sequence(),
65,325✔
1715
            data_info
65,325✔
1716
        ), wrap);
65,325✔
1717

65,325✔
1718
        format!("[style=filled, {color}, label=\"{label}\"]")
65,325✔
1719
    }
65,325✔
1720
}
1721

1722
impl<K: Kmer, D> fmt::Debug for Node<'_, K, D>
1723
where
1724
    D: Debug,
1725
{
1726
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
8✔
1727
        write!(
8✔
1728
            f,
8✔
1729
            "Node {{ id:{}, Exts: {:?}, L:{:?} R:{:?}, Seq: {:?}, Data: {:?} }}",
8✔
1730
            self.node_id,
8✔
1731
            self.exts(),
8✔
1732
            self.l_edges(),
8✔
1733
            self.r_edges(),
8✔
1734
            self.sequence().len(),
8✔
1735
            self.data()
8✔
1736
        )
8✔
1737
    }
8✔
1738
}
1739

1740
pub struct IterComponents<'a, K: Kmer, D> {
1741
    graph: &'a DebruijnGraph<K, D>,
1742
    visited: Vec<bool>,
1743
    pos: usize,
1744
}
1745

1746
impl<K: Kmer, D: Debug> Iterator for IterComponents<'_, K, D> {
1747
    type Item = Vec<usize>;
1748
    fn next(&mut self) -> Option<Self::Item> {
12✔
1749
        while self.pos < self.graph.len() {
28✔
1750
            if !self.visited[self.pos] {
24✔
1751
                let comp = self.graph.component_i(&mut self.visited, self.pos);
8✔
1752
                self.pos += 1;
8✔
1753
                return Some(comp)
8✔
1754
            } else {
16✔
1755
                self.pos += 1;
16✔
1756
            }
16✔
1757
        }
1758
        assert!(self.visited.iter().map(|x| *x as usize).sum::<usize>() == self.graph.len());
24✔
1759
        None
4✔
1760
    }
12✔
1761
    
1762
}
1763

1764
pub struct PathCompIter<'a, K: Kmer, D: Debug, F, F2> 
1765
where 
1766
F: Fn(&D) -> f32,
1767
F2: Fn(&D) -> bool
1768
{
1769
    graph: &'a DebruijnGraph<K, D>,
1770
    component_iterator: IterComponents<'a, K, D>,
1771
    graph_pos: usize,
1772
    score: F,
1773
    solid_path: F2,
1774
}
1775

1776
/// returns the component and the "best" path in the component
1777
impl<K: Kmer, D: Debug, F, F2> Iterator for PathCompIter<'_, K, D, F, F2> 
1778
where 
1779
F: Fn(&D) -> f32,
1780
F2: Fn(&D) -> bool
1781
{
1782
    type Item = (Vec<usize>, VecDeque<(usize, Dir)>,);
1783
    fn next(&mut self) -> Option<Self::Item> {
6✔
1784
        match self.component_iterator.next() {
6✔
1785
            Some(component) => {
4✔
1786
                let current_comp = component;
4✔
1787
                
4✔
1788
    
4✔
1789
                let mut best_node = current_comp[0];
4✔
1790
                let mut best_score = f32::MIN;
4✔
1791
                for c in current_comp.iter() {
12✔
1792
                    let node = self.graph.get_node(*c);
12✔
1793
                    let node_score = (self.score)(node.data());
12✔
1794
    
12✔
1795
                    if node_score > best_score {
12✔
1796
                        best_node = *c;
4✔
1797
                        best_score = node_score;
4✔
1798
                    }
8✔
1799
                }
1800
    
1801
                let oscore = |state| match state {
16✔
1802
                    None => 0.0,
4✔
1803
                    Some((id, _)) => (self.score)(self.graph.get_node(id).data()),
12✔
1804
                };
16✔
1805
    
1806
                let osolid_path = |state| match state {
4✔
1807
                    None => false,
×
1808
                    Some((id, _)) => (self.solid_path)(self.graph.get_node(id).data()),
4✔
1809
                };
4✔
1810
    
1811
                // Now expand in each direction, greedily taking the best path. Stop if we hit a node we've
1812
                // already put into the path
1813
                let mut used_nodes = HashSet::new();
4✔
1814
                let mut path = VecDeque::new();
4✔
1815
    
4✔
1816
                // Start w/ initial state
4✔
1817
                used_nodes.insert(best_node);
4✔
1818
                path.push_front((best_node, Dir::Left));
4✔
1819
    
1820
                for init in [(best_node, Dir::Left, false), (best_node, Dir::Right, true)].iter() {
8✔
1821
                    let &(start_node, dir, do_flip) = init;
8✔
1822
                    let mut current = (start_node, dir);
8✔
1823
    
1824
                    loop {
1825
                        let mut next = None;
12✔
1826
                        let (cur_id, incoming_dir) = current;
12✔
1827
                        let node = self.graph.get_node(cur_id);
12✔
1828
                        let edges = node.edges(incoming_dir.flip());
12✔
1829
    
12✔
1830
                        let mut solid_paths = 0;
12✔
1831
                        for (_, id, dir, _) in edges {
16✔
1832
                            let cand = Some((id, dir));
4✔
1833
                            if osolid_path(cand) {
4✔
1834
                                solid_paths += 1;
4✔
1835

4✔
1836
                                // second if clause is outside of first in original code (see max_path) 
4✔
1837
                                // but would basically ignore path validity.
4✔
1838
                                if oscore(cand) > oscore(next) {
4✔
1839
                                    next = cand;
4✔
1840
                                }
4✔
1841
                            }
×
1842
    
1843
                            if oscore(cand) > oscore(next) {
4✔
1844
                                next = cand;
×
1845
                            }
4✔
1846
                        }
1847
    
1848
                        if solid_paths > 1 {
12✔
1849
                            break;
×
1850
                        }
12✔
1851
    
1852
                        match next {
4✔
1853
                            Some((next_id, next_incoming)) if !used_nodes.contains(&next_id) => {
4✔
1854
                                if do_flip {
4✔
1855
                                    path.push_front((next_id, next_incoming.flip()));
2✔
1856
                                } else {
2✔
1857
                                    path.push_back((next_id, next_incoming));
2✔
1858
                                }
2✔
1859
    
1860
                                used_nodes.insert(next_id);
4✔
1861
                                current = (next_id, next_incoming);
4✔
1862
                            }
1863
                            _ => break,
8✔
1864
                        }
1865
                    }
1866
                }
1867
                
1868
                
1869
                Some((current_comp, path))
4✔
1870
            }, 
1871
            None => {
1872
                // should technically not need graph_pos after this 
1873
                self.graph_pos += 1;
2✔
1874
                None
2✔
1875
            }
1876
        }
1877
    }
6✔
1878
}
1879

1880
#[cfg(test)]
1881
mod test {
1882
    use std::{fs::File, io::BufReader};
1883

1884
    use crate::{kmer::Kmer16, summarizer::TagsCountsSumData};
1885

1886
    use super::DebruijnGraph;
1887

1888
    #[test]
1889
    #[cfg(not(feature = "sample128"))]
1890
    fn test_components() {
1891
        use crate::{summarizer::SummaryData, Dir, BUF};
1892

1893
        let path = "test_data/400.graph.dbg";
1894
        let file = BufReader::with_capacity(BUF, File::open(path).unwrap());
1895

1896
        let (graph, _, _): (DebruijnGraph<Kmer16, TagsCountsSumData>, Vec<String>, crate::summarizer::SummaryConfig) = 
1897
            bincode::deserialize_from(file).expect("error deserializing graph");
1898

1899
        let components = graph.iter_components();
1900

1901
        let check_components = [
1902
            vec![3, 67, 130, 133, 59, 119, 97, 110, 68, 137, 29, 84, 131, 43, 30, 91, 14, 70, 79, 142, 136, 105, 103, 62, 
1903
                141, 104, 134, 88, 38, 81, 108, 92, 135, 96, 116, 121, 63, 124, 106, 129, 132, 126, 93, 109, 83, 112, 118, 
1904
                123, 125, 78, 122, 115, 75, 128, 140, 111, 26, 143, 113],
1905
            vec![41, 138, 100, 139, 86],
1906
            vec![53, 117, 127],
1907
            vec![69, 144, 77, 120, 114, 107, 101],
1908
        ];
1909

1910
        let mut counter = 0;
1911

1912
        for component in components {
1913
            if component.len() > 1 { 
1914
                println!("component: {:?}", component);
1915
                assert_eq!(component, check_components[counter]);
1916
                counter += 1;
1917
            }
1918
        }
1919
        assert_eq!(vec![(139, Dir::Left)], graph.max_path(|data| data.score(), |_| true));
1920
    }
1921
}
1922

STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2025 Coveralls, Inc