• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

jlab / rust-debruijn / 14333782752

08 Apr 2025 12:41PM UTC coverage: 80.356% (-2.3%) from 82.636%
14333782752

Pull #11

github

web-flow
Merge c1b9b83ef into 2f646dc00
Pull Request #11: Modifyable DOT edges and default formatting methods for nodes and edges

35 of 273 new or added lines in 5 files covered. (12.82%)

13 existing lines in 2 files now uncovered.

6091 of 7580 relevant lines covered (80.36%)

3917548.99 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

56.96
/src/graph.rs
1
// Copyright 2017 10x Genomics
2

3
//! Containers for path-compressed De Bruijn graphs
4

5
use bit_set::BitSet;
6
use indicatif::ProgressBar;
7
use indicatif::ProgressIterator;
8
use indicatif::ProgressStyle;
9
use itertools::enumerate;
10
use log::{debug, trace};
11
use rayon::prelude::*;
12
use rayon::current_num_threads;
13
use serde_derive::{Deserialize, Serialize};
14
use smallvec::SmallVec;
15
use std::borrow::Borrow;
16

17
use std::collections::HashSet;
18
use std::collections::VecDeque;
19
use std::f32;
20
use std::fmt::{self, Debug, Display};
21
use std::fs::{remove_file, File};
22
use std::hash::Hash;
23
use std::io::BufWriter;
24
use std::io::{BufReader, Error, Read};
25
use std::io::Write;
26
use std::iter::FromIterator;
27
use std::marker::PhantomData;
28
use std::path::Path;
29

30
use boomphf::hashmap::BoomHashMap;
31

32
use serde_json;
33
use serde_json::Value;
34

35
type SmallVec4<T> = SmallVec<[T; 4]>;
36
type SmallVec8<T> = SmallVec<[T; 8]>;
37

38
use crate::colors::Colors;
39
use crate::compression::CompressionSpec;
40
use crate::dna_string::{DnaString, DnaStringSlice, PackedDnaStringSet};
41
use crate::summarizer::SummaryConfig;
42
use crate::summarizer::SummaryData;
43
use crate::BUF;
44
use crate::{Dir, Exts, Kmer, Mer, Vmer};
45

46
/// A compressed DeBruijn graph carrying auxiliary data on each node of type `D`.
47
/// This type does not carry the sorted index arrays the allow the graph
48
/// to be walked efficiently. The `DeBruijnGraph` type wraps this type and add those
49
/// vectors.
50
#[derive(Serialize, Deserialize, Clone, Debug)]
51
pub struct BaseGraph<K, D> {
52
    pub sequences: PackedDnaStringSet,
53
    pub exts: Vec<Exts>,
54
    pub data: Vec<D>,
55
    pub stranded: bool,
56
    phantom: PhantomData<K>,
57
}
58

59
impl<K, D> BaseGraph<K, D> {
60
    pub fn new(stranded: bool) -> Self {
1,900✔
61
        BaseGraph {
1,900✔
62
            sequences: PackedDnaStringSet::new(),
1,900✔
63
            exts: Vec::new(),
1,900✔
64
            data: Vec::new(),
1,900✔
65
            phantom: PhantomData,
1,900✔
66
            stranded,
1,900✔
67
        }
1,900✔
68
    }
1,900✔
69

70
    pub fn len(&self) -> usize {
326,592✔
71
        self.sequences.len()
326,592✔
72
    }
326,592✔
73

74
    pub fn is_empty(&self) -> bool {
1✔
75
        self.sequences.is_empty()
1✔
76
    }
1✔
77

78
    pub fn combine<I: Iterator<Item = BaseGraph<K, D>>>(graphs: I) -> Self {
13✔
79
        let mut sequences = PackedDnaStringSet::new();
13✔
80
        let mut exts = Vec::new();
13✔
81
        let mut data = Vec::new();
13✔
82
        let mut stranded = Vec::new();
13✔
83

84
        for g in graphs {
1,554✔
85
            for s in 0..g.sequences.len() {
4,689✔
86
                sequences.add(&g.sequences.get(s));
4,689✔
87
            }
4,689✔
88

89
            exts.extend(g.exts);
1,541✔
90
            data.extend(g.data);
1,541✔
91
            stranded.push(g.stranded);
1,541✔
92
        }
93

94
        let out_stranded = stranded.iter().all(|x| *x);
13✔
95

13✔
96
        if !out_stranded && !stranded.iter().all(|x| !*x) {
1,541✔
97
            panic!("attempted to combine stranded and unstranded graphs");
×
98
        }
13✔
99

13✔
100
        BaseGraph {
13✔
101
            sequences,
13✔
102
            stranded: out_stranded,
13✔
103
            exts,
13✔
104
            data,
13✔
105
            phantom: PhantomData,
13✔
106
        }
13✔
107
    }
13✔
108
}
109

110
impl<K: Kmer, D> BaseGraph<K, D> {
111
    pub fn add<R: Borrow<u8>, S: IntoIterator<Item = R>>(
793,418✔
112
        &mut self,
793,418✔
113
        sequence: S,
793,418✔
114
        exts: Exts,
793,418✔
115
        data: D,
793,418✔
116
    ) {
793,418✔
117
        self.sequences.add(sequence);
793,418✔
118
        self.exts.push(exts);
793,418✔
119
        self.data.push(data);
793,418✔
120
    }
793,418✔
121
}
122

123
impl<K: Kmer + Send + Sync, D> BaseGraph<K, D> {
124
    pub fn finish(self) -> DebruijnGraph<K, D> {
359✔
125
        let indices: Vec<u32> = (0..self.len() as u32).collect();
359✔
126

127
        let left_order = {
359✔
128
            let mut kmers: Vec<K> = Vec::with_capacity(self.len());
359✔
129
            for idx in &indices {
792,965✔
130
                kmers.push(self.sequences.get(*idx as usize).first_kmer());
792,606✔
131
                
792,606✔
132
            }
792,606✔
133
            
134
            BoomHashMap::new_parallel(kmers, indices.clone())
359✔
135
        };
136

137
        let right_order = {
359✔
138
            let mut kmers: Vec<K> = Vec::with_capacity(self.len());
359✔
139
            for idx in &indices {
792,965✔
140
                kmers.push(self.sequences.get(*idx as usize).last_kmer());
792,606✔
141
            }
792,606✔
142
            
143
            BoomHashMap::new_parallel(kmers, indices)
359✔
144
        };
359✔
145
        debug!("finish graph loops: 2x {}", self.len());
359✔
146

147
        DebruijnGraph {
359✔
148
            base: self,
359✔
149
            left_order,
359✔
150
            right_order,
359✔
151
        }
359✔
152
    }
359✔
153
}
154

155
impl<K: Kmer, D> BaseGraph<K, D> {
156
    pub fn finish_serial(self) -> DebruijnGraph<K, D> {
×
157
        let indices: Vec<u32> = (0..self.len() as u32).collect();
×
158

159
        let left_order = {
×
160
            let mut kmers: Vec<K> = Vec::with_capacity(self.len());
×
161
            let mut sequences: Vec<String> = Vec::new();
×
162
            for idx in &indices {
×
163
                kmers.push(self.sequences.get(*idx as usize).first_kmer());
×
164
                sequences.push(self.sequences.get(*idx as usize).to_dna_string());
×
165
            }
×
166
            println!("left kmers: {:?}", kmers);
×
167
            println!("left seqs: {:?}", sequences);
×
168
            BoomHashMap::new(kmers, indices.clone())
×
169
        };
170

171
        let right_order = {
×
172
            let mut kmers: Vec<K> = Vec::with_capacity(self.len());
×
173
            let mut sequences: Vec<String> = Vec::new();
×
174
            for idx in &indices {
×
175
                kmers.push(self.sequences.get(*idx as usize).last_kmer());
×
176
                sequences.push(self.sequences.get(*idx as usize).to_dna_string());
×
177
            }
×
178
            println!("right kmers: {:?}", kmers);
×
179
            println!("right seqs: {:?}", sequences);
×
180
            BoomHashMap::new(kmers, indices)
×
181
        };
×
182

×
183
        DebruijnGraph {
×
184
            base: self,
×
185
            left_order,
×
186
            right_order,
×
187
        }
×
188
    }
×
189

190
    pub fn remove_duplicate_kmers(self) {
×
191
        
×
192
    }
×
193
}
194

195
/// A compressed DeBruijn graph carrying auxiliary data on each node of type `D`.
196
/// The struct carries sorted index arrays the allow the graph
197
/// to be walked efficiently.
198
#[derive(Serialize, Deserialize, Debug)]
199
pub struct DebruijnGraph<K: Hash, D> {
200
    pub base: BaseGraph<K, D>,
201
    left_order: BoomHashMap<K, u32>,
202
    right_order: BoomHashMap<K, u32>,
203
}
204

205
impl<K: Kmer, D: Debug> DebruijnGraph<K, D> {
206
    /// Total number of nodes in the DeBruijn graph
207
    pub fn len(&self) -> usize {
325,491✔
208
        self.base.len()
325,491✔
209
    }
325,491✔
210

211
    pub fn is_empty(&self) -> bool {
1✔
212
        self.base.is_empty()
1✔
213
    }
1✔
214

215
    /// Get a node given it's `node_id`
216
    pub fn get_node(&self, node_id: usize) -> Node<K, D> {
4,386,009✔
217
        Node {
4,386,009✔
218
            node_id,
4,386,009✔
219
            graph: self,
4,386,009✔
220
        }
4,386,009✔
221
    }
4,386,009✔
222

223
    /// Get a node given it's `node_id`
224
    pub fn get_node_kmer(&self, node_id: usize) -> NodeKmer<K, D> {
×
225
        let node = self.get_node(node_id);
×
226
        let node_seq = node.sequence();
×
227

×
228
        NodeKmer {
×
229
            node_id,
×
230
            node_seq_slice: node_seq,
×
231
            phantom_d: PhantomData,
×
232
            phantom_k: PhantomData,
×
233
        }
×
234
    }
×
235

236
    /// Return an iterator over all nodes in the graph
237
    pub fn iter_nodes(&self) -> NodeIter<K, D> {
113✔
238
        NodeIter {
113✔
239
            graph: self,
113✔
240
            node_id: 0,
113✔
241
        }
113✔
242
    }
113✔
243

244
    /// Find the edges leaving node `node_id` in direction `Dir`. Should generally be
245
    /// accessed via a Node wrapper object
246
    fn find_edges(&self, node_id: usize, dir: Dir) -> SmallVec4<(u8, usize, Dir, bool)> {
376,065✔
247
        let exts = self.base.exts[node_id];
376,065✔
248
        let sequence = self.base.sequences.get(node_id);
376,065✔
249
        let kmer: K = sequence.term_kmer(dir);
376,065✔
250
        let mut edges = SmallVec4::new();
376,065✔
251

252
        for i in 0..4 {
1,880,325✔
253
            if exts.has_ext(dir, i) {
1,504,260✔
254
                let link = self.find_link(kmer.extend(i, dir), dir); //.expect("missing link");
737,352✔
255
                if let Some(l) = link {
737,352✔
256
                    edges.push((i, l.0, l.1, l.2));
737,352✔
257
                }
737,352✔
258
                // Otherwise, this edge doesn't exist within this shard, so ignore it.
259
                // NOTE: this should be allowed in a 'complete' DBG
260
            }
766,908✔
261
        }
262

263
        edges
376,065✔
264
    }
376,065✔
265

266
    /// Seach for the kmer `kmer`, appearing at the given `side` of a node sequence.
267
    fn search_kmer(&self, kmer: K, side: Dir) -> Option<usize> {
4,410,501✔
268
        match side {
4,410,501✔
269
            Dir::Left => self.left_order.get(&kmer).map(|pos| *pos as usize),
2,205,647✔
270
            Dir::Right => self.right_order.get(&kmer).map(|pos| *pos as usize),
2,204,854✔
271
        }
272
    }
4,410,501✔
273

274
    /// Find a link in the graph, possibly handling a RC switch.
275
    pub fn find_link(&self, kmer: K, dir: Dir) -> Option<(usize, Dir, bool)> {
3,075,817✔
276
        // Only test self-consistent paths through
3,075,817✔
277
        // the edges
3,075,817✔
278
        // Avoids problems due to single kmer edges
3,075,817✔
279
        // (dir, next_side_incoming, rc)
3,075,817✔
280
        // (Dir::Left, Dir::Right, false) => true,
3,075,817✔
281
        // (Dir::Left, Dir::Left,  true) => true,
3,075,817✔
282
        // (Dir::Right, Dir::Left, false) => true,
3,075,817✔
283
        // (Dir::Right, Dir::Right, true) => true,
3,075,817✔
284

3,075,817✔
285
        let rc = kmer.rc();
3,075,817✔
286

3,075,817✔
287
        match dir {
3,075,817✔
288
            Dir::Left => {
289
                if let Some(idx) = self.search_kmer(kmer, Dir::Right) {
1,536,422✔
290
                    return Some((idx, Dir::Right, false));
870,170✔
291
                }
666,252✔
292

666,252✔
293
                if !self.base.stranded {
666,252✔
294
                    if let Some(idx) = self.search_kmer(rc, Dir::Left) {
666,252✔
295
                        return Some((idx, Dir::Left, true));
666,252✔
296
                    }
×
297
                }
×
298
            }
299

300
            Dir::Right => {
301
                if let Some(idx) = self.search_kmer(kmer, Dir::Left) {
1,539,395✔
302
                    return Some((idx, Dir::Left, false));
870,963✔
303
                }
668,432✔
304

668,432✔
305
                if !self.base.stranded {
668,432✔
306
                    if let Some(idx) = self.search_kmer(rc, Dir::Right) {
668,432✔
307
                        return Some((idx, Dir::Right, true));
668,432✔
308
                    }
×
309
                }
×
310
            }
311
        }
312

313
        None
×
314
    }
3,075,817✔
315

316
    /// Check whether the graph is fully compressed. Return `None` if it's compressed,
317
    /// otherwise return `Some(node1, node2)` representing a pair of node that could
318
    /// be collapsed. Probably only useful for testing.
319
    pub fn is_compressed<S: CompressionSpec<D>>(&self, spec: &S) -> Option<(usize, usize)> {
567✔
320
        for i in 0..self.len() {
109,785✔
321
            let n = self.get_node(i);
109,785✔
322

323
            for dir in &[Dir::Left, Dir::Right] {
329,355✔
324
                let dir_edges = n.edges(*dir);
219,570✔
325
                if dir_edges.len() == 1 {
219,570✔
326
                    let (_, next_id, return_dir, _) = dir_edges[0];
156,361✔
327
                    let next = self.get_node(next_id);
156,361✔
328

156,361✔
329
                    let ret_edges = next.edges(return_dir);
156,361✔
330
                    if ret_edges.len() == 1 {
156,361✔
331
                        // Test for us being a palindrome: this makes it OK
332
                        if n.len() == K::k() && n.sequence().first_kmer::<K>().is_palindrome() {
18✔
333
                            continue;
12✔
334
                        }
6✔
335

6✔
336
                        // Test for a neighbor being a palindrome: this makes it OK
6✔
337
                        if next.len() == K::k() && next.sequence().first_kmer::<K>().is_palindrome()
6✔
338
                        {
339
                            continue;
6✔
UNCOV
340
                        }
×
UNCOV
341

×
UNCOV
342
                        // Test for this edge representing a smooth circle (biting it's own tail):
×
UNCOV
343
                        if n.node_id == next_id {
×
UNCOV
344
                            continue;
×
345
                        }
×
346

×
347
                        if spec.join_test(n.data(), next.data()) {
×
348
                            // Found a unbranched edge that should have been eliminated
349
                            return Some((i, next_id));
×
350
                        }
×
351
                    }
156,343✔
352
                }
63,209✔
353
            }
354
        }
355

356
        None
567✔
357
    }
567✔
358

359
    /// Remove non-existent extensions that may be created due to filtered kmers
360
    pub fn fix_exts(&mut self, valid_nodes: Option<&BitSet>) {
246✔
361
        for i in 0..self.len() {
770,874✔
362
            let valid_exts = self.get_valid_exts(i, valid_nodes);
770,874✔
363
            self.base.exts[i] = valid_exts;
770,874✔
364
        }
770,874✔
365
    }
246✔
366

367
    pub fn get_valid_exts(&self, node_id: usize, valid_nodes: Option<&BitSet>) -> Exts {
770,874✔
368
        let mut new_exts = Exts::empty();
770,874✔
369
        let node = self.get_node(node_id);
770,874✔
370
        let exts = node.exts();
770,874✔
371
        let l_kmer: K = node.sequence().first_kmer();
770,874✔
372
        let r_kmer: K = node.sequence().last_kmer();
770,874✔
373

770,874✔
374
        let check_node = |id| match valid_nodes {
1,580,784✔
375
            Some(bs) => bs.contains(id),
1,515,488✔
376
            None => true,
65,296✔
377
        };
1,580,784✔
378

379
        for i in 0..4 {
3,854,370✔
380
            if exts.has_ext(Dir::Left, i) {
3,083,496✔
381
                match self.find_link(l_kmer.extend_left(i), Dir::Left) {
790,176✔
382
                    Some((target, _, _)) if check_node(target) => {
790,176✔
383
                        new_exts = new_exts.set(Dir::Left, i)
790,176✔
384
                    }
385
                    _ => (),
×
386
                }
387
            }
2,293,320✔
388

389
            if exts.has_ext(Dir::Right, i) {
3,083,496✔
390
                match self.find_link(r_kmer.extend_right(i), Dir::Right) {
790,608✔
391
                    Some((target, _, _)) if check_node(target) => {
790,608✔
392
                        new_exts = new_exts.set(Dir::Right, i)
790,608✔
393
                    }
394
                    _ => (),
×
395
                }
396
            }
2,292,888✔
397
        }
398

399
        new_exts
770,874✔
400
    }
770,874✔
401

402
    /// Find the highest-scoring, unambiguous path in the graph. Each node get a score
403
    /// given by `score`. Any node where `solid_path(node) == True` are valid paths -
404
    /// paths will be terminated if there are multiple valid paths emanating from a node.
405
    pub fn max_path<F, F2>(&self, score: F, solid_path: F2) -> Vec<(usize, Dir)>
×
406
    where
×
407
        F: Fn(&D) -> f32,
×
408
        F2: Fn(&D) -> bool,
×
409
    {
×
410
        if self.is_empty() {
×
411
            return Vec::default();
×
412
        }
×
413

×
414
        let mut best_node = 0;
×
415
        let mut best_score = f32::MIN;
×
416
        for i in 0..self.len() {
×
417
            let node = self.get_node(i);
×
418
            let node_score = score(node.data());
×
419

×
420
            if node_score > best_score {
×
421
                best_node = i;
×
422
                best_score = node_score;
×
423
            }
×
424
        }
425

426
        let oscore = |state| match state {
×
427
            None => 0.0,
×
428
            Some((id, _)) => score(self.get_node(id).data()),
×
429
        };
×
430

431
        let osolid_path = |state| match state {
×
432
            None => false,
×
433
            Some((id, _)) => solid_path(self.get_node(id).data()),
×
434
        };
×
435

436
        // Now expand in each direction, greedily taking the best path. Stop if we hit a node we've
437
        // already put into the path
438
        let mut used_nodes = HashSet::new();
×
439
        let mut path = VecDeque::new();
×
440

×
441
        // Start w/ initial state
×
442
        used_nodes.insert(best_node);
×
443
        path.push_front((best_node, Dir::Left));
×
444

445
        for init in [(best_node, Dir::Left, false), (best_node, Dir::Right, true)].iter() {
×
446
            let &(start_node, dir, do_flip) = init;
×
447
            let mut current = (start_node, dir);
×
448

449
            loop {
450
                let mut next = None;
×
451
                let (cur_id, incoming_dir) = current;
×
452
                let node = self.get_node(cur_id);
×
453
                let edges = node.edges(incoming_dir.flip());
×
454

×
455
                let mut solid_paths = 0;
×
456
                for (_, id, dir, _) in edges {
×
457
                    let cand = Some((id, dir));
×
458
                    if osolid_path(cand) {
×
459
                        solid_paths += 1;
×
460
                    }
×
461

462
                    if oscore(cand) > oscore(next) {
×
463
                        next = cand;
×
464
                    }
×
465
                }
466

467
                if solid_paths > 1 {
×
468
                    break;
×
469
                }
×
470

471
                match next {
×
472
                    Some((next_id, next_incoming)) if !used_nodes.contains(&next_id) => {
×
473
                        if do_flip {
×
474
                            path.push_front((next_id, next_incoming.flip()));
×
475
                        } else {
×
476
                            path.push_back((next_id, next_incoming));
×
477
                        }
×
478

479
                        used_nodes.insert(next_id);
×
480
                        current = (next_id, next_incoming);
×
481
                    }
482
                    _ => break,
×
483
                }
484
            }
485
        }
486

487
        Vec::from_iter(path)
×
488
    }
×
489

490

491
    /// Find the highest-scoring, unambiguous path in the graph. Each node get a score
492
    /// given by `score`. Any node where `solid_path(node) == True` are valid paths -
493
    /// paths will be terminated if there are multiple valid paths emanating from a node.
494
    /// Returns vec with path for each component
495
    pub fn max_path_comp<F, F2>(&self, score: F, solid_path: F2) -> Vec<VecDeque<(usize, Dir)>>
1✔
496
    where
1✔
497
        F: Fn(&D) -> f32,
1✔
498
        F2: Fn(&D) -> bool,
1✔
499
    {
1✔
500
        if self.is_empty() {
1✔
501
            let vec: Vec<VecDeque<(usize, Dir)>> = Vec::new();
×
502
            return vec;
×
503
        }
1✔
504

1✔
505
        let components = self.iter_components();
1✔
506
        let mut paths: Vec<VecDeque<(usize, Dir)>> = Vec::new();
1✔
507

508
        for component in components {
3✔
509

510
            let current_comp = &component;
2✔
511
            
2✔
512

2✔
513
            let mut best_node = current_comp[0];
2✔
514
            let mut best_score = f32::MIN;
2✔
515
            for c in current_comp.iter() {
6✔
516
                let node = self.get_node(*c);
6✔
517
                let node_score = score(node.data());
6✔
518

6✔
519
                if node_score > best_score {
6✔
520
                    best_node = *c;
2✔
521
                    best_score = node_score;
2✔
522
                }
4✔
523
            }
524

525
            let oscore = |state| match state {
4✔
526
                None => 0.0,
2✔
527
                Some((id, _)) => score(self.get_node(id).data()),
2✔
528
            };
4✔
529

530
            let osolid_path = |state| match state {
2✔
531
                None => false,
×
532
                Some((id, _)) => solid_path(self.get_node(id).data()),
2✔
533
            };
2✔
534

535
            // Now expand in each direction, greedily taking the best path. Stop if we hit a node we've
536
            // already put into the path
537
            let mut used_nodes = HashSet::new();
2✔
538
            let mut path = VecDeque::new();
2✔
539

2✔
540
            // Start w/ initial state
2✔
541
            used_nodes.insert(best_node);
2✔
542
            path.push_front((best_node, Dir::Left));
2✔
543

544
            for init in [(best_node, Dir::Left, false), (best_node, Dir::Right, true)].iter() {
4✔
545
                let &(start_node, dir, do_flip) = init;
4✔
546
                let mut current = (start_node, dir);
4✔
547

548
                loop {
549
                    let mut next = None;
6✔
550
                    let (cur_id, incoming_dir) = current;
6✔
551
                    let node = self.get_node(cur_id);
6✔
552
                    let edges = node.edges(incoming_dir.flip());
6✔
553

6✔
554
                    let mut solid_paths = 0;
6✔
555
                    for (_, id, dir, _) in edges {
8✔
556
                        let cand = Some((id, dir));
2✔
557
                        if osolid_path(cand) {
2✔
558
                            solid_paths += 1;
2✔
559
                        }
2✔
560

561
                        if oscore(cand) > oscore(next) {
2✔
562
                            next = cand;
2✔
563
                        }
2✔
564
                    }
565

566
                    if solid_paths > 1 {
6✔
567
                        break;
×
568
                    }
6✔
569

570
                    match next {
2✔
571
                        Some((next_id, next_incoming)) if !used_nodes.contains(&next_id) => {
2✔
572
                            if do_flip {
2✔
573
                                path.push_front((next_id, next_incoming.flip()));
1✔
574
                            } else {
1✔
575
                                path.push_back((next_id, next_incoming));
1✔
576
                            }
1✔
577

578
                            used_nodes.insert(next_id);
2✔
579
                            current = (next_id, next_incoming);
2✔
580
                        }
581
                        _ => break,
4✔
582
                    }
583
                }
584
            }
585
            
586
            paths.push(path);
2✔
587
            //paths.push(Vec::from_iter(path));
588
        }
589

590
        paths
1✔
591
    
592
    }
1✔
593

594
    pub fn iter_max_path_comp<F, F2>(&self, score: F, solid_path: F2) -> PathCompIter<K, D, F, F2> 
2✔
595
    where 
2✔
596
    F: Fn(&D) -> f32,
2✔
597
    F2: Fn(&D) -> bool
2✔
598
    {
2✔
599
        let component_iterator = self.iter_components();
2✔
600
        PathCompIter { graph: self, component_iterator, graph_pos: 0, score, solid_path }
2✔
601
    }
2✔
602

603
    /// write the paths from `iter_max_path_comp` to a fasta file
604
    pub fn path_to_fasta<F, F2>(&self, f: &mut dyn std::io::Write, path_iter: PathCompIter<K, D, F, F2>, return_lens: bool) -> (Vec<usize>, Vec<usize>)
1✔
605
    where 
1✔
606
    F: Fn(&D) -> f32,
1✔
607
    F2: Fn(&D) -> bool
1✔
608
    {
1✔
609
        // width of fasta lines
1✔
610
        let columns = 80;
1✔
611

1✔
612
        // sizes of components and of paths
1✔
613
        let mut comp_sizes = Vec::new();
1✔
614
        let mut path_lens = Vec::new();
1✔
615

616
        for (seq_counter, (component, path)) in path_iter.enumerate() {
2✔
617
            // get dna sequence from path
618
            let seq = self.sequence_of_path(path.iter());
2✔
619

2✔
620
            //write header with length & start node
2✔
621
            writeln!(f, ">path{} len={} start_node={}", seq_counter, seq.len(), path[0].0).unwrap();
2✔
622

2✔
623
            // calculate how sequence has to be split up
2✔
624
            let slices = (seq.len() / columns) + 1;
2✔
625
            let mut ranges = Vec::with_capacity(slices);
2✔
626

2✔
627
            let mut start = 0;
2✔
628
            while start < seq.len() {
6✔
629
                ranges.push(start..start + columns);
4✔
630
                start += columns;
4✔
631
            }
4✔
632

633
            let last_start = ranges.pop().expect("no kmers in parallel ranges").start;
2✔
634
            ranges.push(last_start..seq.len());
2✔
635

636
            // split up sequence and write to file accordingly
637
            for range in ranges {
6✔
638
                writeln!(f, "{:?}", seq.slice(range.start, range.end)).unwrap();
4✔
639
            }
4✔
640

641
            if return_lens {
2✔
642
                comp_sizes.push(component.len());
×
643
                path_lens.push(path.len());
×
644
            }
2✔
645
        }    
646

647
        (comp_sizes, path_lens)
1✔
648
        
1✔
649
    }
1✔
650

651

652
    /// Get the sequence of a path through the graph. The path is given as a sequence of node_id integers
653
    pub fn sequence_of_path<'a, I: 'a + Iterator<Item = &'a (usize, Dir)>>(
22,895✔
654
        &self,
22,895✔
655
        path: I,
22,895✔
656
    ) -> DnaString {
22,895✔
657
        let mut seq = DnaString::new();
22,895✔
658

659
        for (idx, &(node_id, dir)) in path.enumerate() {
747,997✔
660
            let start = if idx == 0 { 0 } else { K::k() - 1 };
747,997✔
661

662
            let node_seq = match dir {
747,997✔
663
                Dir::Left => self.get_node(node_id).sequence(),
389,853✔
664
                Dir::Right => self.get_node(node_id).sequence().rc(),
358,144✔
665
            };
666

667
            for p in start..node_seq.len() {
1,479,135✔
668
                seq.push(node_seq.get(p))
1,479,135✔
669
            }
670
        }
671

672
        seq
22,895✔
673
    }
22,895✔
674

NEW
675
    fn node_to_dot<FN: Fn(&Node<K, D>) -> String, FE: Fn(&Node<K, D>, u8, Dir, bool) -> String>(
×
676
        &self,
×
677
        node: &Node<'_, K, D>,
×
NEW
678
        node_label: &FN,
×
NEW
679
        edge_label: &FE,
×
680
        f: &mut dyn Write,
×
681
    ) {
×
NEW
682
        writeln!(f, "n{} {}", node.node_id, node_label(node)).unwrap();
×
683

NEW
684
        for (base, id, incoming_dir, flipped) in node.l_edges() {
×
NEW
685

×
NEW
686
            writeln!(f, "n{} -> n{} {}", id, node.node_id, edge_label(node, base, incoming_dir, flipped)).unwrap();
×
UNCOV
687
        }
×
688

NEW
689
        for (base, id, incoming_dir, flipped) in node.r_edges() {
×
NEW
690
            writeln!(f, "n{} -> n{} {}", node.node_id, id, edge_label(node, base, incoming_dir, flipped)).unwrap();
×
691
        }
×
692
    }
×
693

694
    /// Write the graph to a dot file
NEW
695
    pub fn to_dot<P, FN, FE>(&self, path: P, node_label: &FN, edge_label: &FE) 
×
NEW
696
    where 
×
NEW
697
    P: AsRef<Path>,
×
NEW
698
    FN: Fn(&Node<K, D>) -> String,
×
NEW
699
    FE: Fn(&Node<K, D>, u8, Dir, bool) -> String,
×
NEW
700
    {
×
701
        let mut f = BufWriter::with_capacity(BUF, File::create(path).expect("error creating dot file"));
×
702

×
703
        let pb = ProgressBar::new(self.len() as u64);
×
704
        pb.set_style(ProgressStyle::with_template("{msg} [{elapsed_precise}] {bar:60.cyan/blue} ({pos}/{len})").unwrap().progress_chars("#/-"));
×
705
        pb.set_message(format!("{:<32}", "writing graph to DOT file"));
×
706

×
707
        writeln!(&mut f, "digraph {{\nrankdir=\"LR\"").unwrap();
×
708
        for i in (0..self.len()).progress_with(pb) {
×
NEW
709
            self.node_to_dot(&self.get_node(i), node_label, edge_label, &mut f);
×
710
        }
×
711
        writeln!(&mut f, "}}").unwrap();
×
712
        
×
713
        f.flush().unwrap();
×
714
        debug!("large to dot loop: {}", self.len());
×
715
    }
×
716

717
    /// Write the graph to a dot file in parallel
718
    /// Will write in to n_threads files simultaniously,
719
    /// then go though the files and add the contents to a larger file, 
720
    /// and delete the small files.
721
    /// 
722
    /// The path does not need to contain the file ending.
NEW
723
    pub fn to_dot_parallel<P, FN, FE>(&self, path: P, node_label: &FN, edge_label: &FE) 
×
724
    where 
×
725
        D: Sync,
×
NEW
726
        K: Sync,
×
NEW
727
        P: AsRef<Path> + Display + Sync,
×
NEW
728
        FN: Fn(&Node<K, D>) -> String + Sync,
×
NEW
729
        FE: Fn(&Node<K, D>, u8, Dir, bool) -> String + Sync,
×
730
    {        
×
731
        let slices = current_num_threads();
×
732
        let n_nodes = self.len();
×
733
        let sz = n_nodes / slices + 1;
×
734

×
735
        debug!("n_nodes: {}", n_nodes);
×
736
        debug!("sz: {}", sz);
×
737

738
        let mut parallel_ranges = Vec::with_capacity(slices);
×
739
        let mut start = 0;
×
740
        while start < n_nodes {
×
741
            parallel_ranges.push(start..start + sz);
×
742
            start += sz;
×
743
        }
×
744

745
        let last_start = parallel_ranges.pop().expect("no kmers in parallel ranges").start;
×
746
        parallel_ranges.push(last_start..n_nodes);
×
747
        debug!("parallel ranges: {:?}", parallel_ranges);
×
748

749
        let mut files = Vec::with_capacity(current_num_threads());
×
750

751
        for i in 0..parallel_ranges.len() {
×
752
            files.push(format!("{}-{}.dot", path, i));
×
753
        } 
×
754

755
        let pb = ProgressBar::new(self.len() as u64);
×
756
        pb.set_style(ProgressStyle::with_template("{msg} [{elapsed_precise}] {bar:60.cyan/blue} ({pos}/{len})").unwrap().progress_chars("#/-"));
×
757
        pb.set_message(format!("{:<32}", "writing graph to DOT files"));
×
758
    
×
759
        parallel_ranges.into_par_iter().enumerate().for_each(|(i, range)| {
×
760
            let mut f = BufWriter::with_capacity(BUF, File::create(&files[i]).expect("error creating parallel dot file"));
×
761

762
            for i in range {
×
NEW
763
                self.node_to_dot(&self.get_node(i), node_label, edge_label, &mut f);
×
764
                pb.inc(1);
×
765
            }
×
766

767
            f.flush().unwrap();
×
768
        });
×
769
        pb.finish_and_clear();
×
770

×
771
        let mut out_file = BufWriter::with_capacity(BUF, File::create(format!("{}.dot", path)).unwrap());
×
772

×
773
        writeln!(&mut out_file, "digraph {{\nrankdir=\"LR\"").unwrap();
×
774

×
775
        let pb = ProgressBar::new(files.len() as u64);
×
776
        pb.set_style(ProgressStyle::with_template("{msg} [{elapsed_precise}] {bar:60.cyan/blue} ({pos}/{len})").unwrap().progress_chars("#/-"));
×
777
        pb.set_message(format!("{:<32}", "combining files"));
×
778

779
        for file in files.iter().progress_with(pb) {
×
780
            let open_file = File::open(file).expect("error creating combined dot file");
×
781
            let mut reader = BufReader::new(open_file);
×
782
            let mut buffer = [0; BUF];
×
783

784
            loop {
785
                let linecount = reader.read(&mut buffer).unwrap();
×
786
                if linecount == 0 { break }
×
787
                out_file.write_all(&buffer[..linecount]).unwrap();
×
788
            }
789

790
            remove_file(file).unwrap();
×
791
        }
792

793
        writeln!(&mut out_file, "}}").unwrap();
×
794

×
795
        out_file.flush().unwrap();
×
796

×
797

×
798
    }
×
799

800
    /// Write part of the graph to a dot file
801
    /// 
802
    /// ### Arguments: 
803
    /// 
804
    /// * `path`: path to the output file
805
    /// * `node_label` & `edge_label`: closures taking [`Node<K, D>`] and returning a string containing commands for dot nodes and edges 
806
    /// * `nodes`: [`Vec<usize>`] listing all IDs of nodes which should be included
NEW
807
    pub fn to_dot_partial<P, FN, FE>(&self, path: P, node_label: &FN, edge_label: &FE, nodes: Vec<usize>) 
×
NEW
808
    where 
×
NEW
809
        P: AsRef<Path>,
×
NEW
810
        FN: Fn(&Node<K, D>) -> String,
×
NEW
811
        FE: Fn(&Node<K, D>, u8, Dir, bool) -> String,
×
NEW
812
    {
×
813
        let mut f = BufWriter::with_capacity(BUF, File::create(path).expect("error creating dot file"));
×
814

×
815
        let pb = ProgressBar::new(nodes.len() as u64);
×
816
        pb.set_style(ProgressStyle::with_template("{msg} [{elapsed_precise}] {bar:60.cyan/blue} ({pos}/{len})").unwrap().progress_chars("#/-"));
×
817
        pb.set_message(format!("{:<32}", "writing graph to DOT file"));
×
818

×
819
        writeln!(&mut f, "digraph {{\nrankdir=\"LR\"").unwrap();
×
820
        for i in nodes.into_iter().progress_with(pb) {
×
NEW
821
            self.node_to_dot(&self.get_node(i), node_label, edge_label, &mut f);
×
822
        }
×
823
        writeln!(&mut f, "}}").unwrap();
×
824

×
825
        f.flush().unwrap();
×
826

×
827
        debug!("large to dot loop: {}", self.len());
×
828
    }
×
829

830
    fn node_to_gfa<F: Fn(&Node<'_, K, D>) -> String>(
12✔
831
        &self,
12✔
832
        node: &Node<'_, K, D>,
12✔
833
        w: &mut dyn Write,
12✔
834
        tag_func: Option<&F>,
12✔
835
    ) -> Result<(), Error> {
12✔
836
        match tag_func {
12✔
837
            Some(f) => {
12✔
838
                let tags = (f)(node);
12✔
839
                writeln!(
12✔
840
                    w,
12✔
841
                    "S\t{}\t{}\t{}",
12✔
842
                    node.node_id,
12✔
843
                    node.sequence().to_dna_string(),
12✔
844
                    tags
12✔
845
                )?;
12✔
846
            }
847
            _ => writeln!(
×
848
                w,
×
849
                "S\t{}\t{}",
×
850
                node.node_id,
×
851
                node.sequence().to_dna_string()
×
852
            )?,
×
853
        }
854

855
        for (_, target, dir, _) in node.l_edges() {
14✔
856
            if target >= node.node_id {
14✔
857
                let to_dir = match dir {
6✔
858
                    Dir::Left => "+",
6✔
UNCOV
859
                    Dir::Right => "-",
×
860
                };
861
                writeln!(
6✔
862
                    w,
6✔
863
                    "L\t{}\t-\t{}\t{}\t{}M",
6✔
864
                    node.node_id,
6✔
865
                    target,
6✔
866
                    to_dir,
6✔
867
                    K::k() - 1
6✔
868
                )?;
6✔
869
            }
8✔
870
        }
871

872
        for (_, target, dir, _) in node.r_edges() {
12✔
873
            if target > node.node_id {
2✔
874
                let to_dir = match dir {
2✔
875
                    Dir::Left => "+",
2✔
UNCOV
876
                    Dir::Right => "-",
×
877
                };
878
                writeln!(
2✔
879
                    w,
2✔
880
                    "L\t{}\t+\t{}\t{}\t{}M",
2✔
881
                    node.node_id,
2✔
882
                    target,
2✔
883
                    to_dir,
2✔
884
                    K::k() - 1
2✔
885
                )?;
2✔
UNCOV
886
            }
×
887
        }
888

889
        Ok(())
12✔
890
    }
12✔
891

892
    /// Write the graph to GFA format
893
    pub fn to_gfa<P: AsRef<Path>>(&self, gfa_out: P) -> Result<(), Error> {
×
894
        let wtr = BufWriter::with_capacity(BUF, File::create(gfa_out).expect("error creating gfa file"));
×
895
        self.write_gfa(&mut std::io::BufWriter::new(wtr))
×
896
    }
×
897

898
    pub fn write_gfa(&self, wtr: &mut impl Write) -> Result<(), Error> {
×
899
        writeln!(wtr, "H\tVN:Z:debruijn-rs")?;
×
900

901
        type DummyFn<K, D> = fn(&Node<'_, K, D>) -> String;
902
        let dummy_opt: Option<&DummyFn<K, D>> = None;
×
903

×
904
        let pb = ProgressBar::new(self.len() as u64);
×
905
        pb.set_style(ProgressStyle::with_template("{msg} [{elapsed_precise}] {bar:60.cyan/blue} ({pos}/{len})").unwrap().progress_chars("#/-"));
×
906
        pb.set_message(format!("{:<32}", "writing graph to GFA file"));
×
907

908
        for i in (0..self.len()).progress_with(pb) {
×
909
            let n = self.get_node(i);
×
910
            self.node_to_gfa(&n, wtr, dummy_opt)?;
×
911
        }
912

913
        wtr.flush().unwrap();
×
914

×
915
        Ok(())
×
916
    }
×
917

918
    /// Write the graph to GFA format
919
    pub fn to_gfa_with_tags<P: AsRef<Path>, F: Fn(&Node<'_, K, D>) -> String>(
1✔
920
        &self,
1✔
921
        gfa_out: P,
1✔
922
        tag_func: F,
1✔
923
    ) -> Result<(), Error> {
1✔
924
        let mut wtr = BufWriter::with_capacity(BUF, File::create(gfa_out).expect("error creatinf gfa file"));
1✔
925
        writeln!(wtr, "H\tVN:Z:debruijn-rs")?;
1✔
926

927
        let pb = ProgressBar::new(self.len() as u64);
1✔
928
        pb.set_style(ProgressStyle::with_template("{msg} [{elapsed_precise}] {bar:60.cyan/blue} ({pos}/{len})").unwrap().progress_chars("#/-"));
1✔
929
        pb.set_message(format!("{:<32}", "writing graph to GFA file"));
1✔
930

931
        for i in (0..self.len()).progress_with(pb) {
6✔
932
            let n = self.get_node(i);
6✔
933
            self.node_to_gfa(&n, &mut wtr, Some(&tag_func))?;
6✔
934
        }
935

936
        wtr.flush().unwrap();
1✔
937

1✔
938
        Ok(())
1✔
939
    }
1✔
940

941
    /// Write the graph to GFA format, with multithreading, 
942
    /// pass `tag_func=None` to write without tags
943
    pub fn to_gfa_otags_parallel<P: AsRef<Path> + Display + Sync, F: Fn(&Node<'_, K, D>) -> String + Sync>(
1✔
944
        &self,
1✔
945
        gfa_out: P,
1✔
946
        tag_func: Option<&F>,
1✔
947
    ) -> Result<(), Error> 
1✔
948
    where 
1✔
949
    K: Sync,
1✔
950
    D: Sync,
1✔
951
    {
1✔
952
        // split into ranges according to thread count
1✔
953
        let slices = current_num_threads();
1✔
954
        let n_nodes = self.len();
1✔
955
        let sz = n_nodes / slices + 1;
1✔
956

1✔
957
        debug!("n_nodes: {}", n_nodes);
1✔
958
        debug!("sz: {}", sz);
1✔
959

960
        let mut parallel_ranges = Vec::with_capacity(slices);
1✔
961
        let mut start = 0;
1✔
962
        while start < n_nodes {
4✔
963
            parallel_ranges.push(start..start + sz);
3✔
964
            start += sz;
3✔
965
        }
3✔
966

967
        let last_start = parallel_ranges.pop().expect("no kmers in parallel ranges").start;
1✔
968
        parallel_ranges.push(last_start..n_nodes);
1✔
969
        debug!("parallel ranges: {:?}", parallel_ranges);
1✔
970

971
        let mut files = Vec::with_capacity(current_num_threads());
1✔
972

973
        for i in 0..parallel_ranges.len() {
3✔
974
            files.push(format!("{}-{}.gfa", gfa_out, i));
3✔
975
        } 
3✔
976

977
        let pb = ProgressBar::new(self.len() as u64);
1✔
978
        pb.set_style(ProgressStyle::with_template("{msg} [{elapsed_precise}] {bar:60.cyan/blue} ({pos}/{len})").unwrap().progress_chars("#/-"));
1✔
979
        pb.set_message(format!("{:<32}", "writing graph to GFA file"));
1✔
980
        
1✔
981
        
1✔
982
        parallel_ranges.into_par_iter().enumerate().for_each(|(i, range)| {
3✔
983
            let mut wtr = BufWriter::with_capacity(BUF, File::create(&files[i]).expect("error creating parallel gfa file"));
3✔
984

985
            for i in range {
9✔
986
                let n = self.get_node(i);
6✔
987
                self.node_to_gfa(&n, &mut wtr, tag_func).unwrap();
6✔
988
                pb.inc(1);
6✔
989
            }
6✔
990

991
            wtr.flush().unwrap();
3✔
992
        });
3✔
993

1✔
994
        pb.finish_and_clear();
1✔
995

1✔
996
        // combine files
1✔
997
        let mut out_file = BufWriter::with_capacity(BUF, File::create(format!("{}.gfa", gfa_out)).expect("error creating combined gfa file"));
1✔
998
        writeln!(out_file, "H\tVN:Z:debruijn-rs")?;
1✔
999

1000
        let pb = ProgressBar::new(files.len() as u64);
1✔
1001
        pb.set_style(ProgressStyle::with_template("{msg} [{elapsed_precise}] {bar:60.cyan/blue} ({pos}/{len})").unwrap().progress_chars("#/-"));
1✔
1002
        pb.set_message(format!("{:<32}", "combining files"));
1✔
1003

1004
        for file in files.iter() {
3✔
1005
            let open_file = File::open(file).expect("error opening parallel gfa file");
3✔
1006
            let mut reader = BufReader::new(open_file);
3✔
1007
            let mut buffer = [0; BUF];
3✔
1008

1009
            loop {
1010
                let linecount = reader.read(&mut buffer).unwrap();
6✔
1011
                if linecount == 0 { break }
6✔
1012
                out_file.write_all(&buffer[..linecount]).unwrap();
3✔
1013
            }
1014

1015
            remove_file(file).unwrap();
3✔
1016
        }
1017

1018
        out_file.flush().unwrap();
1✔
1019

1✔
1020
        Ok(())
1✔
1021
    }
1✔
1022

1023
    /// Write the graph to GFA format
1024
    pub fn to_gfa_partial<P: AsRef<Path>, F: Fn(&Node<'_, K, D>) -> String>(&self, gfa_out: P, tag_func: Option<&F>, nodes: Vec<usize>) -> Result<(), Error> {
×
1025
        let mut wtr = BufWriter::with_capacity(BUF, File::create(gfa_out).expect("error creating gfa file"));
×
1026
        writeln!(wtr, "H\tVN:Z:debruijn-rs")?;
×
1027

1028
        let pb = ProgressBar::new(self.len() as u64);
×
1029
        pb.set_style(ProgressStyle::with_template("{msg} [{elapsed_precise}] {bar:60.cyan/blue} ({pos}/{len})").unwrap().progress_chars("#/-"));
×
1030
        pb.set_message(format!("{:<32}", "writing graph to GFA file"));
×
1031

1032
        for i in nodes.into_iter().progress_with(pb) {
×
1033
            let n = self.get_node(i);
×
1034
            self.node_to_gfa(&n, &mut wtr, tag_func)?;
×
1035
        }
1036

1037
        wtr.flush().unwrap();
×
1038

×
1039
        Ok(())    
×
1040
    }
×
1041

1042
    pub fn to_json_rest<W: Write, F: Fn(&D) -> Value>(
×
1043
        &self,
×
1044
        fmt_func: F,
×
1045
        mut writer: &mut W,
×
1046
        rest: Option<Value>,
×
1047
    ) {
×
1048
        writeln!(writer, "{{\n\"nodes\": [").unwrap();
×
1049
        for i in 0..self.len() {
×
1050
            let node = self.get_node(i);
×
1051
            node.to_json(&fmt_func, writer);
×
1052
            if i == self.len() - 1 {
×
1053
                writeln!(writer).unwrap();
×
1054
            } else {
×
1055
                writeln!(writer, ",").unwrap();
×
1056
            }
×
1057
        }
1058
        writeln!(writer, "],").unwrap();
×
1059

×
1060
        writeln!(writer, "\"links\": [").unwrap();
×
1061
        for i in 0..self.len() {
×
1062
            let node = self.get_node(i);
×
1063
            match node.edges_to_json(writer) {
×
1064
                true => {
1065
                    if i == self.len() - 1 {
×
1066
                        writeln!(writer).unwrap();
×
1067
                    } else {
×
1068
                        writeln!(writer, ",").unwrap();
×
1069
                    }
×
1070
                }
1071
                _ => continue,
×
1072
            }
1073
        }
1074
        writeln!(writer, "]").unwrap();
×
1075

1076
        match rest {
×
1077
            Some(Value::Object(v)) => {
×
1078
                for (k, v) in v.iter() {
×
1079
                    writeln!(writer, ",").expect("io error");
×
1080
                    write!(writer, "\"{}\": ", k).expect("io error");
×
1081
                    serde_json::to_writer(&mut writer, v).expect("io error");
×
1082
                    writeln!(writer).expect("io error");
×
1083
                }
×
1084
            }
1085
            _ => {
×
1086
                writeln!(writer).expect("io error");
×
1087
            }
×
1088
        }
1089

1090
        writeln!(writer, "}}").expect("io error");
×
1091
    }
×
1092

1093
    /// Write the graph to JSON
1094
    pub fn to_json<W: Write, F: Fn(&D) -> Value, RF: Fn(&mut W)>(
×
1095
        &self,
×
1096
        fmt_func: F,
×
1097
        writer: &mut W,
×
1098
    ) {
×
1099
        self.to_json_rest(fmt_func, writer, None);
×
1100
    }
×
1101

1102
    /// Print a text representation of the graph.
1103
    pub fn print(&self) {
2✔
1104
        println!("DebruijnGraph {{ len: {}, K: {} }} :", self.len(), K::k());
2✔
1105
        for node in self.iter_nodes() {
8✔
1106
            println!("{:?}", node);
8✔
1107
        }
8✔
1108
    }
2✔
1109

1110
    pub fn print_with_data(&self) {
×
1111
        println!("DebruijnGraph {{ len: {}, K: {} }} :", self.len(), K::k());
×
1112
        for node in self.iter_nodes() {
×
1113
            println!("{:?} ({:?})", node, node.data());
×
1114
        }
×
1115
    }
×
1116

1117
    pub fn max_path_beam<F, F2>(&self, beam: usize, score: F, _solid_path: F2) -> Vec<(usize, Dir)>
×
1118
    where
×
1119
        F: Fn(&D) -> f32,
×
1120
        F2: Fn(&D) -> bool,
×
1121
    {
×
1122
        if self.is_empty() {
×
1123
            return Vec::default();
×
1124
        }
×
1125

×
1126
        let mut states = Vec::new();
×
1127

1128
        for i in 0..self.len() {
×
1129
            let node = self.get_node(i);
×
1130

×
1131
            // Initialize beam search on terminal nodes
×
1132
            if node.exts().num_exts_l() == 0 || node.exts().num_exts_r() == 0 {
×
1133
                let dir = if node.exts().num_exts_l() > 0 {
×
1134
                    Dir::Right
×
1135
                } else {
1136
                    Dir::Left
×
1137
                };
1138

1139
                let status = if node.exts().num_exts_l() == 0 && node.exts().num_exts_r() == 0 {
×
1140
                    Status::End
×
1141
                } else {
1142
                    Status::Active
×
1143
                };
1144

1145
                let mut path = SmallVec8::new();
×
1146
                path.push((i as u32, dir));
×
1147

×
1148
                let s = State {
×
1149
                    path,
×
1150
                    status,
×
1151
                    score: score(node.data()),
×
1152
                };
×
1153
                states.push(s);
×
1154
            }
×
1155
        }
1156

1157
        // No end nodes -- just start on the first node
1158
        if states.is_empty() {
×
1159
            // Make a start
×
1160
            let node = self.get_node(0);
×
1161
            let mut path = SmallVec8::new();
×
1162
            path.push((0, Dir::Left));
×
1163
            states.push(State {
×
1164
                path,
×
1165
                status: Status::Active,
×
1166
                score: score(node.data()),
×
1167
            });
×
1168
        }
×
1169

1170
        // Beam search until we can't find any more expansions
1171
        let mut active = true;
×
1172
        while active {
×
1173
            let mut new_states = Vec::with_capacity(states.len());
×
1174
            active = false;
×
1175

1176
            for s in states {
×
1177
                if s.status == Status::Active {
×
1178
                    active = true;
×
1179
                    let expanded = self.expand_state(&s, &score);
×
1180
                    new_states.extend(expanded);
×
1181
                } else {
×
1182
                    new_states.push(s)
×
1183
                }
1184
            }
1185

1186
            // workaround to sort by descending score - will panic if there are NaN scores
1187
            new_states.sort_by(|a, b| (-(a.score)).partial_cmp(&-(b.score)).unwrap());
×
1188
            new_states.truncate(beam);
×
1189
            states = new_states;
×
1190
        }
×
1191

1192
        for (i, state) in states.iter().take(5).enumerate() {
×
1193
            trace!("i:{}  -- {:?}", i, state);
×
1194
        }
1195

1196
        // convert back to using usize for node_id
1197
        states[0]
×
1198
            .path
×
1199
            .iter()
×
1200
            .map(|&(node, dir)| (node as usize, dir))
×
1201
            .collect()
×
1202
    }
×
1203

1204
    fn expand_state<F>(&self, state: &State, score: &F) -> SmallVec4<State>
×
1205
    where
×
1206
        F: Fn(&D) -> f32,
×
1207
    {
×
1208
        if state.status != Status::Active {
×
1209
            panic!("only attempt to expand active states")
×
1210
        }
×
1211

×
1212
        let (node_id, dir) = state.path[state.path.len() - 1];
×
1213
        let node = self.get_node(node_id as usize);
×
1214
        let mut new_states = SmallVec4::new();
×
1215

1216
        for (_, next_node_id, incoming_dir, _) in node.edges(dir.flip()) {
×
1217
            let next_node = self.get_node(next_node_id);
×
1218
            let new_score = state.score + score(next_node.data());
×
1219

×
1220
            let cycle = state
×
1221
                .path
×
1222
                .iter()
×
1223
                .any(|&(prev_node, _)| prev_node == (next_node_id as u32));
×
1224

1225
            let status = if cycle {
×
1226
                Status::Cycle
×
1227
            } else if next_node.edges(incoming_dir.flip()).is_empty() {
×
1228
                Status::End
×
1229
            } else {
1230
                Status::Active
×
1231
            };
1232

1233
            let mut new_path = state.path.clone();
×
1234
            new_path.push((next_node_id as u32, incoming_dir));
×
1235

×
1236
            let next_state = State {
×
1237
                path: new_path,
×
1238
                score: new_score,
×
1239
                status,
×
1240
            };
×
1241

×
1242
            new_states.push(next_state);
×
1243
        }
1244

1245
        new_states
×
1246
    }
×
1247

1248

1249
    pub fn iter_components(&self) -> IterComponents<K, D> {
4✔
1250
        let mut visited: Vec<bool> = Vec::with_capacity(self.len());
4✔
1251
        let pos = 0;
4✔
1252

1253
        for _i in 0..self.len() {
24✔
1254
            visited.push(false);
24✔
1255
        }
24✔
1256

1257
        IterComponents { 
4✔
1258
            graph: self, 
4✔
1259
            visited, 
4✔
1260
            pos }
4✔
1261
    }
4✔
1262

1263

1264
    /// iteratively returns 2D Vec with node_ids grouped according to the connected components they form
1265
    pub fn components_i(&self) -> Vec<Vec<usize>> {
1✔
1266
        let mut components: Vec<Vec<usize>> = Vec::with_capacity(self.len());
1✔
1267
        let mut visited: Vec<bool> = Vec::with_capacity(self.len());
1✔
1268

1269
        for _i in 0..self.len() {
6✔
1270
            visited.push(false);
6✔
1271
        }
6✔
1272

1273
        for i in 0..self.len() {
6✔
1274
            if !visited[i] {
6✔
1275
                let comp = self.component_i(&mut visited, i);
2✔
1276
                components.push(comp);
2✔
1277
            }
4✔
1278
        }
1279

1280
        components
1✔
1281
    }
1✔
1282

1283
    /// recursively detects which nodes form separate graph components
1284
    /// returns 2D vector with node ids per component
1285
    /// (may lead to stack overflow)
1286
    pub fn components_r(&self) -> Vec<Vec<usize>> {
2✔
1287
        let mut components: Vec<Vec<usize>> = Vec::with_capacity(self.len());
2✔
1288
        let mut visited: Vec<bool> = Vec::with_capacity(self.len());
2✔
1289

1290
        for _i in 0..self.len() {
8✔
1291
            visited.push(false);
8✔
1292
        }
8✔
1293

1294
        for i in 0..self.len() {
8✔
1295
            if !visited[i] {
8✔
1296
                let mut comp: Vec<usize> = Vec::new();
4✔
1297
                self.component_r(&mut visited, i, &mut comp);
4✔
1298
                components.push(comp);
4✔
1299
            }
4✔
1300
        }
1301

1302
        components
2✔
1303

2✔
1304
    }
2✔
1305

1306
    fn component_r<'a>(&'a self, visited: &'a mut Vec<bool>, i: usize, comp: &'a mut Vec<usize>) {
8✔
1307
        
8✔
1308
        visited[i] = true;
8✔
1309
        comp.push(i);
8✔
1310
        let mut edges = self.find_edges(i, Dir::Left);
8✔
1311
        let mut r_edges = self.find_edges(i, Dir::Right);
8✔
1312

8✔
1313
        edges.append(&mut r_edges);
8✔
1314

1315
        for (_, edge, _, _) in edges.iter() {
8✔
1316
            if !visited[*edge] {
8✔
1317
                self.component_r(visited, *edge, comp);
4✔
1318
            }
4✔
1319
        }
1320
    }
8✔
1321

1322
    fn component_i<'a>(&'a self, visited: &'a mut [bool], i: usize) -> Vec<usize> {
10✔
1323
        let mut edges: Vec<usize> = Vec::new();
10✔
1324
        let mut comp: Vec<usize> = Vec::new();
10✔
1325

10✔
1326
        edges.push(i);
10✔
1327

1328
        while let Some(current_edge) = edges.pop() {
40✔
1329
            if !visited[current_edge] { 
30✔
1330
                comp.push(current_edge);
30✔
1331
                visited[current_edge] = true;
30✔
1332

30✔
1333
                let mut l_edges = self.find_edges(current_edge, Dir::Left);
30✔
1334
                let mut r_edges = self.find_edges(current_edge, Dir::Right);
30✔
1335

30✔
1336
                l_edges.append(&mut r_edges);
30✔
1337

1338
                for (_, new_edge, _, _) in l_edges.into_iter() {
40✔
1339
                    if !visited[new_edge] {
40✔
1340
                        edges.push(new_edge);
20✔
1341
                    }
20✔
1342
                }
1343
            }
×
1344
        }
1345
        comp
10✔
1346
    }
10✔
1347

1348
    pub fn find_bad_nodes<F: Fn(&Node<'_, K, D>) -> bool>(&self, valid: F) -> Vec<usize> {
×
1349
        let mut bad_nodes = Vec::new();
×
1350

1351
        for (i, node) in enumerate(self.iter_nodes()) {
×
1352
            if !valid(&node) { bad_nodes.push(i); }
×
1353
        }
1354
        
1355
        bad_nodes
×
1356
    }
×
1357
}
1358

1359
impl<K: Kmer, SD: SummaryData<u8> + Debug> DebruijnGraph<K, SD> {
NEW
1360
    pub fn create_colors(&self, config: &SummaryConfig) -> Colors<SD> {
×
NEW
1361
        Colors::new(self, config)
×
NEW
1362
    }
×
1363
}
1364

1365
#[derive(Debug, Eq, PartialEq)]
1366
enum Status {
1367
    Active,
1368
    End,
1369
    Cycle,
1370
}
1371

1372
#[derive(Debug)]
1373
struct State {
1374
    path: SmallVec8<(u32, Dir)>,
1375
    score: f32,
1376
    status: Status,
1377
}
1378

1379
impl State {}
1380

1381
/// Iterator over nodes in a `DeBruijnGraph`
1382
pub struct NodeIter<'a, K: Kmer + 'a, D: Debug + 'a> {
1383
    graph: &'a DebruijnGraph<K, D>,
1384
    node_id: usize,
1385
}
1386

1387
impl<'a, K: Kmer + 'a, D: Debug + 'a> Iterator for NodeIter<'a, K, D> {
1388
    type Item = Node<'a, K, D>;
1389

1390
    fn next(&mut self) -> Option<Node<'a, K, D>> {
21,845✔
1391
        if self.node_id < self.graph.len() {
21,845✔
1392
            let node = self.graph.get_node(self.node_id);
21,732✔
1393
            self.node_id += 1;
21,732✔
1394
            Some(node)
21,732✔
1395
        } else {
1396
            None
113✔
1397
        }
1398
    }
21,845✔
1399
}
1400

1401
impl<'a, K: Kmer + 'a, D: Debug + 'a> IntoIterator for &'a DebruijnGraph<K, D> {
1402
    type Item = NodeKmer<'a, K, D>;
1403
    type IntoIter = NodeIntoIter<'a, K, D>;
1404

1405
    fn into_iter(self) -> Self::IntoIter {
2,253✔
1406
        NodeIntoIter {
2,253✔
1407
            graph: self,
2,253✔
1408
            node_id: 0,
2,253✔
1409
        }
2,253✔
1410
    }
2,253✔
1411
}
1412

1413
/// Iterator over nodes in a `DeBruijnGraph`
1414
pub struct NodeIntoIter<'a, K: Kmer + 'a, D: Debug + 'a> {
1415
    graph: &'a DebruijnGraph<K, D>,
1416
    node_id: usize,
1417
}
1418

1419
impl<'a, K: Kmer + 'a, D: Debug + 'a> Iterator for NodeIntoIter<'a, K, D> {
1420
    type Item = NodeKmer<'a, K, D>;
1421

1422
    fn next(&mut self) -> Option<Self::Item> {
302,652✔
1423
        if self.node_id < self.graph.len() {
302,652✔
1424
            let node_id = self.node_id;
302,652✔
1425
            let node = self.graph.get_node(node_id);
302,652✔
1426
            let node_seq = node.sequence();
302,652✔
1427

302,652✔
1428
            self.node_id += 1;
302,652✔
1429
            Some(NodeKmer {
302,652✔
1430
                node_id,
302,652✔
1431
                node_seq_slice: node_seq,
302,652✔
1432
                phantom_d: PhantomData,
302,652✔
1433
                phantom_k: PhantomData,
302,652✔
1434
            })
302,652✔
1435
        } else {
1436
            None
×
1437
        }
1438
    }
302,652✔
1439
}
1440

1441
/// A `DebruijnGraph` node with a reference to the sequence of the node.
1442
#[derive(Clone)]
1443
pub struct NodeKmer<'a, K: Kmer + 'a, D: Debug + 'a> {
1444
    pub node_id: usize,
1445
    node_seq_slice: DnaStringSlice<'a>,
1446
    phantom_k: PhantomData<K>,
1447
    phantom_d: PhantomData<D>,
1448
}
1449

1450
/// An iterator over the kmers in a `DeBruijn graph node`
1451
pub struct NodeKmerIter<'a, K: Kmer + 'a, D: Debug + 'a> {
1452
    kmer_id: usize,
1453
    kmer: K,
1454
    num_kmers: usize,
1455
    node_seq_slice: DnaStringSlice<'a>,
1456
    phantom_k: PhantomData<K>,
1457
    phantom_d: PhantomData<D>,
1458
}
1459

1460
impl<'a, K: Kmer + 'a, D: Debug + 'a> IntoIterator for NodeKmer<'a, K, D> {
1461
    type Item = K;
1462
    type IntoIter = NodeKmerIter<'a, K, D>;
1463

1464
    fn into_iter(self) -> Self::IntoIter {
605,304✔
1465
        let num_kmers = self.node_seq_slice.len() - K::k() + 1;
605,304✔
1466

1467
        let kmer = if num_kmers > 0 {
605,304✔
1468
            self.node_seq_slice.get_kmer::<K>(0)
605,304✔
1469
        } else {
1470
            K::empty()
×
1471
        };
1472

1473
        NodeKmerIter {
605,304✔
1474
            kmer_id: 0,
605,304✔
1475
            kmer,
605,304✔
1476
            num_kmers,
605,304✔
1477
            node_seq_slice: self.node_seq_slice,
605,304✔
1478
            phantom_k: PhantomData,
605,304✔
1479
            phantom_d: PhantomData,
605,304✔
1480
        }
605,304✔
1481
    }
605,304✔
1482
}
1483

1484
impl<'a, K: Kmer + 'a, D: Debug + 'a> Iterator for NodeKmerIter<'a, K, D> {
1485
    type Item = K;
1486

1487
    fn next(&mut self) -> Option<Self::Item> {
3,683,542✔
1488
        if self.num_kmers == self.kmer_id {
3,683,542✔
1489
            None
×
1490
        } else {
1491
            let current_kmer = self.kmer;
3,683,542✔
1492

3,683,542✔
1493
            self.kmer_id += 1;
3,683,542✔
1494
            if self.kmer_id < self.num_kmers {
3,683,542✔
1495
                let next_base = self.node_seq_slice.get(self.kmer_id + K::k() - 1);
3,606,054✔
1496
                let new_kmer = self.kmer.extend_right(next_base);
3,606,054✔
1497
                self.kmer = new_kmer;
3,606,054✔
1498
            }
3,606,054✔
1499

1500
            Some(current_kmer)
3,683,542✔
1501
        }
1502
    }
3,683,542✔
1503

1504
    fn size_hint(&self) -> (usize, Option<usize>) {
302,652✔
1505
        (self.num_kmers, Some(self.num_kmers))
302,652✔
1506
    }
302,652✔
1507

1508
    /// Provide a 'fast-forward' capability for this iterator
1509
    /// MPHF will use this to reduce the number of kmers that
1510
    /// need to be produced.
1511
    fn nth(&mut self, n: usize) -> Option<Self::Item> {
2,658,190✔
1512
        if n <= 4 {
2,658,190✔
1513
            // for small skips forward, shift one base at a time
1514
            for _ in 0..n {
2,391,134✔
1515
                self.next();
1,025,352✔
1516
            }
1,025,352✔
1517
        } else {
267,056✔
1518
            self.kmer_id += n;
267,056✔
1519
            self.kmer = self.node_seq_slice.get_kmer::<K>(self.kmer_id);
267,056✔
1520
        }
267,056✔
1521

1522
        self.next()
2,658,190✔
1523
    }
2,658,190✔
1524
}
1525

1526
/// Marker signifying that NodeKmerIter has a known size.
1527
impl<'a, K: Kmer + 'a, D: Debug + 'a> ExactSizeIterator for NodeKmerIter<'a, K, D> {}
1528

1529
/// Unbranched sequence in the DeBruijn graph
1530
pub struct Node<'a, K: Kmer + 'a, D: 'a> {
1531
    pub node_id: usize,
1532
    pub graph: &'a DebruijnGraph<K, D>,
1533
}
1534

1535
impl<'a, K: Kmer, D: Debug> Node<'a, K, D> {
1536
    /// Length of the sequence of this node
1537
    pub fn len(&self) -> usize {
1,515,390✔
1538
        self.graph.base.sequences.get(self.node_id).len()
1,515,390✔
1539
    }
1,515,390✔
1540

1541
    pub fn is_empty(&self) -> bool {
×
1542
        self.graph.base.sequences.get(self.node_id).is_empty()
×
1543
    }
×
1544

1545
    /// Sequence of the node
1546
    pub fn sequence(&self) -> DnaStringSlice<'a> {
3,385,033✔
1547
        self.graph.base.sequences.get(self.node_id)
3,385,033✔
1548
    }
3,385,033✔
1549

1550
    /// Reference to auxiliarly data associated with the node
1551
    pub fn data(&self) -> &'a D {
2,241,081✔
1552
        &self.graph.base.data[self.node_id]
2,241,081✔
1553
    }
2,241,081✔
1554

1555
    /// Extension bases from this node
1556
    pub fn exts(&self) -> Exts {
2,299,437✔
1557
        self.graph.base.exts[self.node_id]
2,299,437✔
1558
    }
2,299,437✔
1559

1560
    /// Edges leaving the left side of the node in the format
1561
    //// (target_node id, incoming side of target node, whether target node has is flipped)
1562
    pub fn l_edges(&self) -> SmallVec4<(u8, usize, Dir, bool)> {
20✔
1563
        self.graph.find_edges(self.node_id, Dir::Left)
20✔
1564
    }
20✔
1565

1566
    /// Edges leaving the right side of the node in the format
1567
    //// (target_node id, incoming side of target node, whether target node has is flipped)
1568
    pub fn r_edges(&self) -> SmallVec4<(u8, usize, Dir, bool)> {
20✔
1569
        self.graph.find_edges(self.node_id, Dir::Right)
20✔
1570
    }
20✔
1571

1572
    /// Edges leaving the 'dir' side of the node in the format
1573
    //// (target_node id, incoming side of target node, whether target node has is flipped)
1574
    pub fn edges(&self, dir: Dir) -> SmallVec4<(u8, usize, Dir, bool)> {
375,949✔
1575
        self.graph.find_edges(self.node_id, dir)
375,949✔
1576
    }
375,949✔
1577

1578
    fn to_json<F: Fn(&D) -> Value>(&self, func: &F, f: &mut dyn Write) {
×
1579
        write!(
×
1580
            f,
×
1581
            "{{\"id\":\"{}\",\"L\":{},\"D\":{},\"Se\":\"{:?}\"}}",
×
1582
            self.node_id,
×
1583
            self.sequence().len(),
×
1584
            (func)(self.data()),
×
1585
            self.sequence(),
×
1586
        )
×
1587
        .unwrap();
×
1588
    }
×
1589

1590
    fn edges_to_json(&self, f: &mut dyn Write) -> bool {
×
1591
        let mut wrote = false;
×
1592
        let edges = self.r_edges();
×
1593
        for (idx, &(_, id, incoming_dir, _)) in edges.iter().enumerate() {
×
1594
            write!(
×
1595
                f,
×
1596
                "{{\"source\":\"{}\",\"target\":\"{}\",\"D\":\"{}\"}}",
×
1597
                self.node_id,
×
1598
                id,
×
1599
                match incoming_dir {
×
1600
                    Dir::Left => "L",
×
1601
                    Dir::Right => "R",
×
1602
                }
1603
            )
1604
            .unwrap();
×
1605

×
1606
            if idx < edges.len() - 1 {
×
1607
                write!(f, ",").unwrap();
×
1608
            }
×
1609

1610
            wrote = true;
×
1611
        }
1612
        wrote
×
1613
    }
×
1614
}
1615

1616
// TODO make generic instead of u8 (u8 is sufficient for dbg)
1617
impl<K: Kmer, SD: SummaryData<u8> + Debug> Node<'_, K, SD>  {
NEW
1618
    pub fn edge_dot_default(&self, colors: &Colors<SD>, base: u8, incoming_dir: Dir, flipped: bool) -> String {
×
NEW
1619
        let color = match incoming_dir {
×
NEW
1620
            Dir::Left => "blue",
×
NEW
1621
            Dir::Right => "red"
×
1622
        };
1623
        
NEW
1624
        if let Some(em) = self.data().edge_mults() {
×
1625
            
NEW
1626
            let dir = if flipped { 
×
NEW
1627
                incoming_dir 
×
1628
            } else {
NEW
1629
                incoming_dir.flip()
×
1630
            };
1631

NEW
1632
            let count = em.edge_mult(base, dir);
×
NEW
1633
            let penwidth = colors.edge_width(count);
×
NEW
1634

×
NEW
1635
            format!("[color={color}, penwidth={penwidth}, label=\"{count}\"]")
×
1636
        } else {
NEW
1637
            format!("[color={color}]")
×
1638
        }
NEW
1639
    }
×
1640

NEW
1641
    pub fn node_dot_default(&self, colors: &Colors<SD>, config: &SummaryConfig, tag_translator: &bimap::BiHashMap<String, u8> , outline: bool) -> String {
×
NEW
1642
        let color = colors.node_color(self.data(), config, outline);
×
NEW
1643
        let data_info = self.data().print(tag_translator, config);
×
NEW
1644
        let wrap = if self.len() > 40 { self.len() } else { 40 };
×
NEW
1645
        let label = textwrap::fill(&format!("id: {}, len: {}, seq: {}, {}", 
×
NEW
1646
            self.node_id,
×
NEW
1647
            self.len(),
×
NEW
1648
            self.sequence(),
×
NEW
1649
            data_info
×
NEW
1650
        ), wrap);
×
NEW
1651
        format!("[style=filled, {color}, label=\"{label}\"]")
×
UNCOV
1652
    }
×
1653
}
1654

1655
impl<K: Kmer, D> fmt::Debug for Node<'_, K, D>
1656
where
1657
    D: Debug,
1658
{
1659
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
8✔
1660
        write!(
8✔
1661
            f,
8✔
1662
            "Node {{ id:{}, Exts: {:?}, L:{:?} R:{:?}, Seq: {:?}, Data: {:?} }}",
8✔
1663
            self.node_id,
8✔
1664
            self.exts(),
8✔
1665
            self.l_edges(),
8✔
1666
            self.r_edges(),
8✔
1667
            self.sequence().len(),
8✔
1668
            self.data()
8✔
1669
        )
8✔
1670
    }
8✔
1671
}
1672

1673
pub struct IterComponents<'a, K: Kmer, D> {
1674
    graph: &'a DebruijnGraph<K, D>,
1675
    visited: Vec<bool>,
1676
    pos: usize,
1677
}
1678

1679
impl<K: Kmer, D: Debug> Iterator for IterComponents<'_, K, D> {
1680
    type Item = Vec<usize>;
1681
    fn next(&mut self) -> Option<Self::Item> {
12✔
1682
        while self.pos < self.graph.len() {
28✔
1683
            if !self.visited[self.pos] {
24✔
1684
                let comp = self.graph.component_i(&mut self.visited, self.pos);
8✔
1685
                self.pos += 1;
8✔
1686
                return Some(comp)
8✔
1687
            } else {
16✔
1688
                self.pos += 1;
16✔
1689
            }
16✔
1690
        }
1691
        assert!(self.visited.iter().map(|x| *x as usize).sum::<usize>() == self.graph.len());
24✔
1692
        None
4✔
1693
    }
12✔
1694
    
1695
}
1696

1697
pub struct PathCompIter<'a, K: Kmer, D: Debug, F, F2> 
1698
where 
1699
F: Fn(&D) -> f32,
1700
F2: Fn(&D) -> bool
1701
{
1702
    graph: &'a DebruijnGraph<K, D>,
1703
    component_iterator: IterComponents<'a, K, D>,
1704
    graph_pos: usize,
1705
    score: F,
1706
    solid_path: F2,
1707
}
1708

1709
/// returns the component and the "best" path in the component
1710
impl<K: Kmer, D: Debug, F, F2> Iterator for PathCompIter<'_, K, D, F, F2> 
1711
where 
1712
F: Fn(&D) -> f32,
1713
F2: Fn(&D) -> bool
1714
{
1715
    type Item = (Vec<usize>, VecDeque<(usize, Dir)>,);
1716
    fn next(&mut self) -> Option<Self::Item> {
6✔
1717
        match self.component_iterator.next() {
6✔
1718
            Some(component) => {
4✔
1719
                let current_comp = component;
4✔
1720
                
4✔
1721
    
4✔
1722
                let mut best_node = current_comp[0];
4✔
1723
                let mut best_score = f32::MIN;
4✔
1724
                for c in current_comp.iter() {
12✔
1725
                    let node = self.graph.get_node(*c);
12✔
1726
                    let node_score = (self.score)(node.data());
12✔
1727
    
12✔
1728
                    if node_score > best_score {
12✔
1729
                        best_node = *c;
4✔
1730
                        best_score = node_score;
4✔
1731
                    }
8✔
1732
                }
1733
    
1734
                let oscore = |state| match state {
16✔
1735
                    None => 0.0,
4✔
1736
                    Some((id, _)) => (self.score)(self.graph.get_node(id).data()),
12✔
1737
                };
16✔
1738
    
1739
                let osolid_path = |state| match state {
4✔
1740
                    None => false,
×
1741
                    Some((id, _)) => (self.solid_path)(self.graph.get_node(id).data()),
4✔
1742
                };
4✔
1743
    
1744
                // Now expand in each direction, greedily taking the best path. Stop if we hit a node we've
1745
                // already put into the path
1746
                let mut used_nodes = HashSet::new();
4✔
1747
                let mut path = VecDeque::new();
4✔
1748
    
4✔
1749
                // Start w/ initial state
4✔
1750
                used_nodes.insert(best_node);
4✔
1751
                path.push_front((best_node, Dir::Left));
4✔
1752
    
1753
                for init in [(best_node, Dir::Left, false), (best_node, Dir::Right, true)].iter() {
8✔
1754
                    let &(start_node, dir, do_flip) = init;
8✔
1755
                    let mut current = (start_node, dir);
8✔
1756
    
1757
                    loop {
1758
                        let mut next = None;
12✔
1759
                        let (cur_id, incoming_dir) = current;
12✔
1760
                        let node = self.graph.get_node(cur_id);
12✔
1761
                        let edges = node.edges(incoming_dir.flip());
12✔
1762
    
12✔
1763
                        let mut solid_paths = 0;
12✔
1764
                        for (_, id, dir, _) in edges {
16✔
1765
                            let cand = Some((id, dir));
4✔
1766
                            if osolid_path(cand) {
4✔
1767
                                solid_paths += 1;
4✔
1768

4✔
1769
                                // second if clause is outside of first in original code (see max_path) 
4✔
1770
                                // but would basically ignore path validity.
4✔
1771
                                if oscore(cand) > oscore(next) {
4✔
1772
                                    next = cand;
4✔
1773
                                }
4✔
1774
                            }
×
1775
    
1776
                            if oscore(cand) > oscore(next) {
4✔
1777
                                next = cand;
×
1778
                            }
4✔
1779
                        }
1780
    
1781
                        if solid_paths > 1 {
12✔
1782
                            break;
×
1783
                        }
12✔
1784
    
1785
                        match next {
4✔
1786
                            Some((next_id, next_incoming)) if !used_nodes.contains(&next_id) => {
4✔
1787
                                if do_flip {
4✔
1788
                                    path.push_front((next_id, next_incoming.flip()));
2✔
1789
                                } else {
2✔
1790
                                    path.push_back((next_id, next_incoming));
2✔
1791
                                }
2✔
1792
    
1793
                                used_nodes.insert(next_id);
4✔
1794
                                current = (next_id, next_incoming);
4✔
1795
                            }
1796
                            _ => break,
8✔
1797
                        }
1798
                    }
1799
                }
1800
                
1801
                
1802
                Some((current_comp, path))
4✔
1803
            }, 
1804
            None => {
1805
                // should technically not need graph_pos after this 
1806
                self.graph_pos += 1;
2✔
1807
                None
2✔
1808
            }
1809
        }
1810
    }
6✔
1811
}
1812

1813
#[cfg(test)]
1814
mod test {
1815
    use std::{fs::File, io::BufReader};
1816

1817
    use crate::{kmer::Kmer16, summarizer::TagsCountsSumData};
1818

1819
    use super::DebruijnGraph;
1820

1821
    #[test]
1822
    #[cfg(not(feature = "sample128"))]
1823
    fn test_components() {
1824
        use crate::BUF;
1825

1826
        let path = "test_data/400.graph.dbg";
1827
        let file = BufReader::with_capacity(BUF, File::open(path).unwrap());
1828

1829
        let (graph, _, _): (DebruijnGraph<Kmer16, TagsCountsSumData>, Vec<String>, crate::summarizer::SummaryConfig) = 
1830
            bincode::deserialize_from(file).expect("error deserializing graph");
1831

1832
        let components = graph.iter_components();
1833

1834
        let check_components = [
1835
            vec![3, 67, 130, 133, 59, 119, 97, 110, 68, 137, 29, 84, 131, 43, 30, 91, 14, 70, 79, 142, 136, 105, 103, 62, 
1836
                141, 104, 134, 88, 38, 81, 108, 92, 135, 96, 116, 121, 63, 124, 106, 129, 132, 126, 93, 109, 83, 112, 118, 
1837
                123, 125, 78, 122, 115, 75, 128, 140, 111, 26, 143, 113],
1838
            vec![41, 138, 100, 139, 86],
1839
            vec![53, 117, 127],
1840
            vec![69, 144, 77, 120, 114, 107, 101],
1841
        ];
1842

1843
        let mut counter = 0;
1844

1845
        for component in components {
1846
            if component.len() > 1 { 
1847
                println!("component: {:?}", component);
1848
                assert_eq!(component, check_components[counter]);
1849
                counter += 1;
1850
            }
1851
        }
1852
    }
1853
}
1854

STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2026 Coveralls, Inc