• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

jlab / rust-debruijn / 17154294205

22 Aug 2025 11:40AM UTC coverage: 87.068% (-0.3%) from 87.371%
17154294205

Pull #30

github

web-flow
Merge 785518be4 into 4ef952e1f
Pull Request #30: Transcript Mapping

239 of 298 new or added lines in 5 files covered. (80.2%)

16 existing lines in 1 file now uncovered.

7103 of 8158 relevant lines covered (87.07%)

4887738.64 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

75.47
/src/graph.rs
1
// Copyright 2017 10x Genomics
2

3
//! Containers for path-compressed De Bruijn graphs
4

5
use bimap::BiHashMap;
6
use bio::io::fasta;
7
use bit_set::BitSet;
8
use indicatif::ProgressBar;
9
use indicatif::ProgressIterator;
10
use indicatif::ProgressStyle;
11
use itertools::enumerate;
12
use log::{debug, trace};
13
use rayon::prelude::*;
14
use rayon::current_num_threads;
15
use serde_derive::{Deserialize, Serialize};
16
use smallvec::SmallVec;
17
use std::borrow::Borrow;
18

19
use std::collections::HashSet;
20
use std::collections::VecDeque;
21
use std::f32;
22
use std::fmt::{self, Debug, Display};
23
use std::fs::{remove_file, File};
24
use std::hash::Hash;
25
use std::io::BufWriter;
26
use std::io::{BufReader, Error, Read};
27
use std::io::Write;
28
use std::iter::FromIterator;
29
use std::marker::PhantomData;
30
use std::path::Path;
31
use std::process::id;
32

33
use boomphf::hashmap::BoomHashMap;
34

35
use serde_json;
36
use serde_json::Value;
37

38
type SmallVec4<T> = SmallVec<[T; 4]>;
39
type SmallVec8<T> = SmallVec<[T; 8]>;
40

41
use crate::bits_to_base;
42
use crate::colors::ColorMode;
43
use crate::colors::Colors;
44
use crate::compression::CompressionSpec;
45
use crate::dna_string::{DnaString, DnaStringSlice, PackedDnaStringSet};
46
use crate::summarizer::SummaryConfig;
47
use crate::summarizer::SummaryData;
48
use crate::summarizer::Translator;
49
use crate::summarizer::ID;
50
use crate::BUF;
51
use crate::PROGRESS_STYLE;
52
use crate::{Dir, Exts, Kmer, Mer, Vmer};
53

54
/// A compressed DeBruijn graph carrying auxiliary data on each node of type `D`.
55
/// This type does not carry the sorted index arrays the allow the graph
56
/// to be walked efficiently. The `DeBruijnGraph` type wraps this type and add those
57
/// vectors.
58
#[derive(Serialize, Deserialize, Clone, Debug)]
59
pub struct BaseGraph<K, D> {
60
    pub sequences: PackedDnaStringSet,
61
    pub exts: Vec<Exts>,
62
    pub data: Vec<D>,
63
    pub stranded: bool,
64
    phantom: PhantomData<K>,
65
}
66

67
impl<K, D> BaseGraph<K, D> {
68
    pub fn new(stranded: bool) -> Self {
2,181✔
69
        BaseGraph {
2,181✔
70
            sequences: PackedDnaStringSet::new(),
2,181✔
71
            exts: Vec::new(),
2,181✔
72
            data: Vec::new(),
2,181✔
73
            phantom: PhantomData,
2,181✔
74
            stranded,
2,181✔
75
        }
2,181✔
76
    }
2,181✔
77

78
    pub fn len(&self) -> usize {
865,537✔
79
        self.sequences.len()
865,537✔
80
    }
865,537✔
81

82
    pub fn is_empty(&self) -> bool {
1✔
83
        self.sequences.is_empty()
1✔
84
    }
1✔
85

86
    pub fn combine<I: Iterator<Item = BaseGraph<K, D>>>(graphs: I) -> Self {
12✔
87
        let mut sequences = PackedDnaStringSet::new();
12✔
88
        let mut exts = Vec::new();
12✔
89
        let mut data = Vec::new();
12✔
90
        let mut stranded = Vec::new();
12✔
91

92
        for g in graphs {
1,499✔
93
            for s in 0..g.sequences.len() {
5,051✔
94
                sequences.add(&g.sequences.get(s));
5,051✔
95
            }
5,051✔
96

97
            exts.extend(g.exts);
1,487✔
98
            data.extend(g.data);
1,487✔
99
            stranded.push(g.stranded);
1,487✔
100
        }
101

102
        let out_stranded = stranded.iter().all(|x| *x);
12✔
103

104
        if !out_stranded && !stranded.iter().all(|x| !*x) {
1,487✔
105
            panic!("attempted to combine stranded and unstranded graphs");
×
106
        }
12✔
107

108
        BaseGraph {
12✔
109
            sequences,
12✔
110
            stranded: out_stranded,
12✔
111
            exts,
12✔
112
            data,
12✔
113
            phantom: PhantomData,
12✔
114
        }
12✔
115
    }
12✔
116
}
117

118
impl<K: Kmer, D> BaseGraph<K, D> {
119
    pub fn add<R: Borrow<u8>, S: IntoIterator<Item = R>>(
1,436,890✔
120
        &mut self,
1,436,890✔
121
        sequence: S,
1,436,890✔
122
        exts: Exts,
1,436,890✔
123
        data: D,
1,436,890✔
124
    ) {
1,436,890✔
125
        self.sequences.add(sequence);
1,436,890✔
126
        self.exts.push(exts);
1,436,890✔
127
        self.data.push(data);
1,436,890✔
128
    }
1,436,890✔
129
}
130

131
impl<K: Kmer + Send + Sync, D> BaseGraph<K, D> {
132
    pub fn finish(self) -> DebruijnGraph<K, D> {
583✔
133
        let indices: Vec<u32> = (0..self.len() as u32).collect();
583✔
134

135
        let left_order = {
583✔
136
            let mut kmers: Vec<K> = Vec::with_capacity(self.len());
583✔
137
            for idx in &indices {
773,331✔
138
                kmers.push(self.sequences.get(*idx as usize).first_kmer());
772,748✔
139
                
772,748✔
140
            }
772,748✔
141
            
142
            BoomHashMap::new_parallel(kmers, indices.clone())
583✔
143
        };
144

145
        let right_order = {
583✔
146
            let mut kmers: Vec<K> = Vec::with_capacity(self.len());
583✔
147
            for idx in &indices {
773,331✔
148
                kmers.push(self.sequences.get(*idx as usize).last_kmer());
772,748✔
149
            }
772,748✔
150
            
151
            BoomHashMap::new_parallel(kmers, indices)
583✔
152
        };
153
        debug!("finish graph loops: 2x {}", self.len());
583✔
154

155
        DebruijnGraph {
583✔
156
            base: self,
583✔
157
            left_order,
583✔
158
            right_order,
583✔
159
        }
583✔
160
    }
583✔
161
}
162

163
impl<K: Kmer, D> BaseGraph<K, D> {
164
    pub fn finish_serial(self) -> DebruijnGraph<K, D> {
111✔
165
        let indices: Vec<u32> = (0..self.len() as u32).collect();
111✔
166

167
        let left_order = {
111✔
168
            let mut kmers: Vec<K> = Vec::with_capacity(self.len());
111✔
169
            let mut sequences: Vec<String> = Vec::new();
111✔
170
            for idx in &indices {
662,000✔
171
                kmers.push(self.sequences.get(*idx as usize).first_kmer());
661,889✔
172
                sequences.push(self.sequences.get(*idx as usize).to_dna_string());
661,889✔
173
            }
661,889✔
174
            println!("left kmers: {:?}", kmers);
111✔
175
            println!("left seqs: {:?}", sequences);
111✔
176
            BoomHashMap::new(kmers, indices.clone())
111✔
177
        };
178

179
        let right_order = {
111✔
180
            let mut kmers: Vec<K> = Vec::with_capacity(self.len());
111✔
181
            let mut sequences: Vec<String> = Vec::new();
111✔
182
            for idx in &indices {
662,000✔
183
                kmers.push(self.sequences.get(*idx as usize).last_kmer());
661,889✔
184
                sequences.push(self.sequences.get(*idx as usize).to_dna_string());
661,889✔
185
            }
661,889✔
186
            println!("right kmers: {:?}", kmers);
111✔
187
            println!("right seqs: {:?}", sequences);
111✔
188
            BoomHashMap::new(kmers, indices)
111✔
189
        };
190

191
        DebruijnGraph {
111✔
192
            base: self,
111✔
193
            left_order,
111✔
194
            right_order,
111✔
195
        }
111✔
196
    }
111✔
197
}
198

199
/// A compressed DeBruijn graph carrying auxiliary data on each node of type `D`.
200
/// The struct carries sorted index arrays the allow the graph
201
/// to be walked efficiently.
202
#[derive(Serialize, Deserialize, Debug)]
203
pub struct DebruijnGraph<K: Hash, D> {
204
    pub base: BaseGraph<K, D>,
205
    left_order: BoomHashMap<K, u32>,
206
    right_order: BoomHashMap<K, u32>,
207
}
208

209
impl<K: Kmer, D: Debug> DebruijnGraph<K, D> {
210
    /// Total number of nodes in the DeBruijn graph
211
    pub fn len(&self) -> usize {
863,431✔
212
        self.base.len()
863,431✔
213
    }
863,431✔
214

215
    pub fn is_empty(&self) -> bool {
1✔
216
        self.base.is_empty()
1✔
217
    }
1✔
218

219
    /// Get a node given it's `node_id`
220
    pub fn get_node(&self, node_id: usize) -> Node<K, D> {
4,889,352✔
221
        Node {
4,889,352✔
222
            node_id,
4,889,352✔
223
            graph: self,
4,889,352✔
224
        }
4,889,352✔
225
    }
4,889,352✔
226

227
    /// Get a node given it's `node_id`
228
    pub fn get_node_kmer(&self, node_id: usize) -> NodeKmer<K, D> {
×
229
        let node = self.get_node(node_id);
×
230
        let node_seq = node.sequence();
×
231

232
        NodeKmer {
×
233
            node_id,
×
234
            node_seq_slice: node_seq,
×
235
            phantom_d: PhantomData,
×
236
            phantom_k: PhantomData,
×
237
        }
×
238
    }
×
239

240
    /// Return an iterator over all nodes in the graph
241
    pub fn iter_nodes(&self) -> NodeIter<K, D> {
129✔
242
        NodeIter {
129✔
243
            graph: self,
129✔
244
            node_id: 0,
129✔
245
        }
129✔
246
    }
129✔
247

248
    /// Find the edges leaving node `node_id` in direction `Dir`. Should generally be
249
    /// accessed via a Node wrapper object
250
    fn find_edges(&self, node_id: usize, dir: Dir) -> SmallVec4<(u8, usize, Dir, bool)> {
1,184,023✔
251
        let exts = self.base.exts[node_id];
1,184,023✔
252
        let sequence = self.base.sequences.get(node_id);
1,184,023✔
253
        let kmer: K = sequence.term_kmer(dir);
1,184,023✔
254
        let mut edges = SmallVec4::new();
1,184,023✔
255

256
        for i in 0..4 {
5,920,115✔
257
            if exts.has_ext(dir, i) {
4,736,092✔
258
                let link = self.find_link(kmer.extend(i, dir), dir).expect("missing link");
1,654,684✔
259
                edges.push((i, link.0, link.1, link.2));
1,654,684✔
260
            }
3,081,408✔
261
        }
262

263
        edges
1,184,023✔
264
    }
1,184,023✔
265

266
    /// Find the edges leaving node `node_id` in direction `Dir`. Should generally be
267
    /// accessed via a Node wrapper object
268
    /// 
269
    /// allows missing links
270
    fn _find_edges_sharded(&self, node_id: usize, dir: Dir) -> SmallVec4<(u8, usize, Dir, bool)> {
×
271
        let exts = self.base.exts[node_id];
×
272
        let sequence = self.base.sequences.get(node_id);
×
273
        let kmer: K = sequence.term_kmer(dir);
×
274
        let mut edges = SmallVec4::new();
×
275

276
        for i in 0..4 {
×
277
            if exts.has_ext(dir, i) {
×
278
                let link = self.find_link(kmer.extend(i, dir), dir); //.expect("missing link");
×
279
                if let Some(l) = link {
×
280
                    edges.push((i, l.0, l.1, l.2));
×
281
                }
×
282
                // Otherwise, this edge doesn't exist within this shard, so ignore it.
283
                // NOTE: this should be allowed in a 'complete' DBG
284
            }
×
285
        }
286

287
        edges
×
288
    }
×
289

290
    /// Seach for the kmer `kmer`, appearing at the given `side` of a node sequence.
291
    fn search_kmer(&self, kmer: K, side: Dir) -> Option<usize> {
5,376,009✔
292
        match side {
5,376,009✔
293
            Dir::Left => self.left_order.get(&kmer).map(|pos| *pos as usize),
2,680,386✔
294
            Dir::Right => self.right_order.get(&kmer).map(|pos| *pos as usize),
2,695,623✔
295
        }
296
    }
5,376,009✔
297

298
    /// Find a link in the graph, possibly handling a RC switch.
299
    pub fn find_link(&self, kmer: K, dir: Dir) -> Option<(usize, Dir, bool)> {
3,745,277✔
300
        // Only test self-consistent paths through
301
        // the edges
302
        // Avoids problems due to single kmer edges
303
        // (dir, next_side_incoming, rc)
304
        // (Dir::Left, Dir::Right, false) => true,
305
        // (Dir::Left, Dir::Left,  true) => true,
306
        // (Dir::Right, Dir::Left, false) => true,
307
        // (Dir::Right, Dir::Right, true) => true,
308

309
        let rc = kmer.rc();
3,745,277✔
310

311
        match dir {
3,745,277✔
312
            Dir::Left => {
313
                if let Some(idx) = self.search_kmer(kmer, Dir::Right) {
1,873,764✔
314
                    return Some((idx, Dir::Right, false));
1,064,891✔
315
                }
808,873✔
316

317
                if !self.base.stranded {
808,873✔
318
                    if let Some(idx) = self.search_kmer(rc, Dir::Left) {
808,873✔
319
                        return Some((idx, Dir::Left, true));
808,873✔
320
                    }
×
321
                }
×
322
            }
323

324
            Dir::Right => {
325
                if let Some(idx) = self.search_kmer(kmer, Dir::Left) {
1,871,513✔
326
                    return Some((idx, Dir::Left, false));
1,063,802✔
327
                }
807,711✔
328

329
                if !self.base.stranded {
807,711✔
330
                    if let Some(idx) = self.search_kmer(rc, Dir::Right) {
807,711✔
331
                        return Some((idx, Dir::Right, true));
807,711✔
332
                    }
×
333
                }
×
334
            }
335
        }
336

337
        None
×
338
    }
3,745,277✔
339

340
    /// Check whether the graph is fully compressed. Return `None` if it's compressed,
341
    /// otherwise return `Some(node1, node2)` representing a pair of node that could
342
    /// be collapsed. Probably only useful for testing.
343
    pub fn is_compressed<S: CompressionSpec<D>>(&self, spec: &S) -> Option<(usize, usize)> {
789✔
344
        for i in 0..self.len() {
145,550✔
345
            let n = self.get_node(i);
145,550✔
346

347
            for dir in &[Dir::Left, Dir::Right] {
436,650✔
348
                let dir_edges = n.edges(*dir);
291,100✔
349
                if dir_edges.len() == 1 {
291,100✔
350
                    let (_, next_id, return_dir, _) = dir_edges[0];
207,238✔
351
                    let next = self.get_node(next_id);
207,238✔
352

353
                    let ret_edges = next.edges(return_dir);
207,238✔
354
                    if ret_edges.len() == 1 {
207,238✔
355
                        // Test for us being a palindrome: this makes it OK
356
                        if n.len() == K::k() && n.sequence().first_kmer::<K>().is_palindrome() {
24✔
357
                            continue;
16✔
358
                        }
8✔
359

360
                        // Test for a neighbor being a palindrome: this makes it OK
361
                        if next.len() == K::k() && next.sequence().first_kmer::<K>().is_palindrome()
8✔
362
                        {
363
                            continue;
8✔
364
                        }
×
365

366
                        // Test for this edge representing a smooth circle (biting it's own tail):
367
                        if n.node_id == next_id {
×
368
                            continue;
×
369
                        }
×
370

371
                        if spec.join_test(n.data(), next.data()) {
×
372
                            // Found a unbranched edge that should have been eliminated
373
                            return Some((i, next_id));
×
374
                        }
×
375
                    }
207,214✔
376
                }
83,862✔
377
            }
378
        }
379

380
        None
789✔
381
    }
789✔
382

383
    /// Remove non-existent extensions that may be created due to filtered kmers
384
    /// 
385
    /// if `valid_nodes` if `None`, all nodes are valid
386
    pub fn fix_exts(&mut self, valid_nodes: Option<&BitSet>) {
247✔
387
        for i in 0..self.len() {
688,809✔
388
            let valid_exts = self.get_valid_exts(i, valid_nodes);
688,809✔
389
            self.base.exts[i] = valid_exts;
688,809✔
390
        }
688,809✔
391
    }
247✔
392

393
    pub fn get_valid_exts(&self, node_id: usize, valid_nodes: Option<&BitSet>) -> Exts {
688,809✔
394
        let mut new_exts = Exts::empty();
688,809✔
395
        let node = self.get_node(node_id);
688,809✔
396
        let exts = node.exts();
688,809✔
397
        let l_kmer: K = node.sequence().first_kmer();
688,809✔
398
        let r_kmer: K = node.sequence().last_kmer();
688,809✔
399

400
        let check_node = |id| match valid_nodes {
1,414,466✔
401
            Some(bs) => bs.contains(id),
1,352,306✔
402
            None => true,
62,160✔
403
        };
1,414,466✔
404

405
        for i in 0..4 {
3,444,045✔
406
            if exts.has_ext(Dir::Left, i) {
2,755,236✔
407
                match self.find_link(l_kmer.extend_left(i), Dir::Left) {
707,169✔
408
                    Some((target, _, _)) if check_node(target) => {
707,169✔
409
                        new_exts = new_exts.set(Dir::Left, i)
707,169✔
410
                    }
411
                    _ => (),
×
412
                }
413
            }
2,048,067✔
414

415
            if exts.has_ext(Dir::Right, i) {
2,755,236✔
416
                match self.find_link(r_kmer.extend_right(i), Dir::Right) {
707,297✔
417
                    Some((target, _, _)) if check_node(target) => {
707,297✔
418
                        new_exts = new_exts.set(Dir::Right, i)
707,297✔
419
                    }
420
                    _ => (),
×
421
                }
422
            }
2,047,939✔
423
        }
424

425
        new_exts
688,809✔
426
    }
688,809✔
427

428
    /// mutable reference to the auxiliary data of the node node_id
NEW
429
    pub fn mut_data(&mut self, node_id: usize) -> &mut D {
×
NEW
430
        &mut self.base.data[node_id]
×
NEW
431
    }
×
432

433
    /// Find the highest-scoring, unambiguous path in the graph. Each node get a score
434
    /// given by `score`. Any node where `solid_path(node) == True` are valid paths -
435
    /// paths will be terminated if there are multiple valid paths emanating from a node.
436
    pub fn max_path<F, F2>(&self, score: F, solid_path: F2) -> Vec<(usize, Dir)>
×
437
    where
×
438
        F: Fn(&D) -> f32,
×
439
        F2: Fn(&D) -> bool,
×
440
    {
441
        if self.is_empty() {
×
442
            return Vec::default();
×
443
        }
×
444

445
        let mut best_node = 0;
×
446
        let mut best_score = f32::MIN;
×
447
        for i in 0..self.len() {
×
448
            let node = self.get_node(i);
×
449
            let node_score = score(node.data());
×
450

451
            if node_score > best_score {
×
452
                best_node = i;
×
453
                best_score = node_score;
×
454
            }
×
455
        }
456

457
        let oscore = |state| match state {
×
458
            None => 0.0,
×
459
            Some((id, _)) => score(self.get_node(id).data()),
×
460
        };
×
461

462
        let osolid_path = |state| match state {
×
463
            None => false,
×
464
            Some((id, _)) => solid_path(self.get_node(id).data()),
×
465
        };
×
466

467
        // Now expand in each direction, greedily taking the best path. Stop if we hit a node we've
468
        // already put into the path
469
        let mut used_nodes = HashSet::new();
×
470
        let mut path = VecDeque::new();
×
471

472
        // Start w/ initial state
473
        used_nodes.insert(best_node);
×
474
        path.push_front((best_node, Dir::Left));
×
475

476
        for init in [(best_node, Dir::Left, false), (best_node, Dir::Right, true)].iter() {
×
477
            let &(start_node, dir, do_flip) = init;
×
478
            let mut current = (start_node, dir);
×
479

480
            loop {
481
                let mut next = None;
×
482
                let (cur_id, incoming_dir) = current;
×
483
                let node = self.get_node(cur_id);
×
484
                let edges = node.edges(incoming_dir.flip());
×
485

486
                let mut solid_paths = 0;
×
487
                for (_, id, dir, _) in edges {
×
488
                    let cand = Some((id, dir));
×
489
                    if osolid_path(cand) {
×
490
                        solid_paths += 1;
×
491
                    }
×
492

493
                    if oscore(cand) > oscore(next) {
×
494
                        next = cand;
×
495
                    }
×
496
                }
497

498
                if solid_paths > 1 {
×
499
                    break;
×
500
                }
×
501

502
                match next {
×
503
                    Some((next_id, next_incoming)) if !used_nodes.contains(&next_id) => {
×
504
                        if do_flip {
×
505
                            path.push_front((next_id, next_incoming.flip()));
×
506
                        } else {
×
507
                            path.push_back((next_id, next_incoming));
×
508
                        }
×
509

510
                        used_nodes.insert(next_id);
×
511
                        current = (next_id, next_incoming);
×
512
                    }
513
                    _ => break,
×
514
                }
515
            }
516
        }
517

518
        Vec::from_iter(path)
×
519
    }
×
520

521

522
    /// Find the highest-scoring, unambiguous path in the graph. Each node get a score
523
    /// given by `score`. Any node where `solid_path(node) == True` are valid paths -
524
    /// paths will be terminated if there are multiple valid paths emanating from a node.
525
    /// Returns vec with path for each component
526
    pub fn max_path_comp<F, F2>(&self, score: F, solid_path: F2) -> Vec<VecDeque<(usize, Dir)>>
1✔
527
    where
1✔
528
        F: Fn(&D) -> f32,
1✔
529
        F2: Fn(&D) -> bool,
1✔
530
    {
531
        if self.is_empty() {
1✔
532
            let vec: Vec<VecDeque<(usize, Dir)>> = Vec::new();
×
533
            return vec;
×
534
        }
1✔
535

536
        let components = self.iter_components();
1✔
537
        let mut paths: Vec<VecDeque<(usize, Dir)>> = Vec::new();
1✔
538

539
        for component in components {
3✔
540

541
            let current_comp = &component;
2✔
542
            
543

544
            let mut best_node = current_comp[0];
2✔
545
            let mut best_score = f32::MIN;
2✔
546
            for c in current_comp.iter() {
140✔
547
                let node = self.get_node(*c);
140✔
548
                let node_score = score(node.data());
140✔
549

550
                if node_score > best_score {
140✔
551
                    best_node = *c;
2✔
552
                    best_score = node_score;
2✔
553
                }
138✔
554
            }
555

556
            let oscore = |state| match state {
170✔
557
                None => 0.0,
85✔
558
                Some((id, _)) => score(self.get_node(id).data()),
85✔
559
            };
170✔
560

561
            let osolid_path = |state| match state {
85✔
562
                None => false,
×
563
                Some((id, _)) => solid_path(self.get_node(id).data()),
85✔
564
            };
85✔
565

566
            // Now expand in each direction, greedily taking the best path. Stop if we hit a node we've
567
            // already put into the path
568
            let mut used_nodes = HashSet::new();
2✔
569
            let mut path = VecDeque::new();
2✔
570

571
            // Start w/ initial state
572
            used_nodes.insert(best_node);
2✔
573
            path.push_front((best_node, Dir::Left));
2✔
574

575
            for init in [(best_node, Dir::Left, false), (best_node, Dir::Right, true)].iter() {
4✔
576
                let &(start_node, dir, do_flip) = init;
4✔
577
                let mut current = (start_node, dir);
4✔
578

579
                loop {
580
                    let mut next = None;
89✔
581
                    let (cur_id, incoming_dir) = current;
89✔
582
                    let node = self.get_node(cur_id);
89✔
583
                    let edges = node.edges(incoming_dir.flip());
89✔
584

585
                    let mut solid_paths = 0;
89✔
586
                    for (_, id, dir, _) in edges {
174✔
587
                        let cand = Some((id, dir));
85✔
588
                        if osolid_path(cand) {
85✔
589
                            solid_paths += 1;
85✔
590
                        }
85✔
591

592
                        if oscore(cand) > oscore(next) {
85✔
593
                            next = cand;
85✔
594
                        }
85✔
595
                    }
596

597
                    if solid_paths > 1 {
89✔
598
                        break;
×
599
                    }
89✔
600

601
                    match next {
85✔
602
                        Some((next_id, next_incoming)) if !used_nodes.contains(&next_id) => {
85✔
603
                            if do_flip {
85✔
604
                                path.push_front((next_id, next_incoming.flip()));
41✔
605
                            } else {
44✔
606
                                path.push_back((next_id, next_incoming));
44✔
607
                            }
44✔
608

609
                            used_nodes.insert(next_id);
85✔
610
                            current = (next_id, next_incoming);
85✔
611
                        }
612
                        _ => break,
4✔
613
                    }
614
                }
615
            }
616
            
617
            paths.push(path);
2✔
618
            //paths.push(Vec::from_iter(path));
619
        }
620

621
        paths
1✔
622
    
623
    }
1✔
624

625
    pub fn iter_max_path_comp<F, F2>(&self, score: F, solid_path: F2) -> PathCompIter<K, D, F, F2> 
3✔
626
    where 
3✔
627
    F: Fn(&D) -> f32,
3✔
628
    F2: Fn(&D) -> bool
3✔
629
    {
630
        let component_iterator = self.iter_components();
3✔
631
        PathCompIter { graph: self, component_iterator, graph_pos: 0, score, solid_path }
3✔
632
    }
3✔
633

634
    /// write the paths from `iter_max_path_comp` to a fasta file
635
    pub fn path_to_fasta<F, F2>(&self, f: &mut dyn std::io::Write, path_iter: PathCompIter<K, D, F, F2>, return_lens: bool) -> (Vec<usize>, Vec<usize>)
1✔
636
    where 
1✔
637
    F: Fn(&D) -> f32,
1✔
638
    F2: Fn(&D) -> bool
1✔
639
    {
640
        // width of fasta lines
641
        let columns = 80;
1✔
642

643
        // sizes of components and of paths
644
        let mut comp_sizes = Vec::new();
1✔
645
        let mut path_lens = Vec::new();
1✔
646

647
        for (seq_counter, (component, path)) in path_iter.enumerate() {
2✔
648
            // get dna sequence from path
649
            let seq = self.sequence_of_path(path.iter());
2✔
650

651
            //write header with length & start node
652
            writeln!(f, ">path{} len={} start_node={}", seq_counter, seq.len(), path[0].0).unwrap();
2✔
653

654
            // calculate how sequence has to be split up
655
            let slices = (seq.len() / columns) + 1;
2✔
656
            let mut ranges = Vec::with_capacity(slices);
2✔
657

658
            let mut start = 0;
2✔
659
            while start < seq.len() {
5✔
660
                ranges.push(start..start + columns);
3✔
661
                start += columns;
3✔
662
            }
3✔
663

664
            let last_start = ranges.pop().expect("no kmers in parallel ranges").start;
2✔
665
            ranges.push(last_start..seq.len());
2✔
666

667
            // split up sequence and write to file accordingly
668
            for range in ranges {
5✔
669
                writeln!(f, "{:?}", seq.slice(range.start, range.end)).unwrap();
3✔
670
            }
3✔
671

672
            if return_lens {
2✔
673
                comp_sizes.push(component.len());
×
674
                path_lens.push(path.len());
×
675
            }
2✔
676
        }    
677

678
        (comp_sizes, path_lens)
1✔
679
        
680
    }
1✔
681

682

683
    /// Get the sequence of a path through the graph. The path is given as a sequence of node_id integers
684
    pub fn sequence_of_path<'a, I: 'a + Iterator<Item = &'a (usize, Dir)>>(
21,735✔
685
        &self,
21,735✔
686
        path: I,
21,735✔
687
    ) -> DnaString {
21,735✔
688
        let mut seq = DnaString::new();
21,735✔
689

690
        for (idx, &(node_id, dir)) in path.enumerate() {
667,201✔
691
            let start = if idx == 0 { 0 } else { K::k() - 1 };
667,201✔
692

693
            let node_seq = match dir {
667,201✔
694
                Dir::Left => self.get_node(node_id).sequence(),
347,900✔
695
                Dir::Right => self.get_node(node_id).sequence().rc(),
319,301✔
696
            };
697

698
            for p in start..node_seq.len() {
1,370,470✔
699
                seq.push(node_seq.get(p))
1,370,470✔
700
            }
701
        }
702

703
        seq
21,735✔
704
    }
21,735✔
705

706
    /// map sequences from a fasta file to a completely uncompressed (!!!) and stranded (!!!!!) debruijn graph
707
    pub fn map_transcripts<P>(&self, path: P, translator: &mut Translator) -> Result<Vec<Box<[ID]>>, String> 
1✔
708
    where 
1✔
709
        P: AsRef<Path>
1✔
710
    {
711
        if !self.base.stranded { return Err("graph has to be stranded".to_string()) };
1✔
712

713
        let reader = fasta::Reader::new(BufReader::new(File::open(path).unwrap()));
1✔
714
        let mut node_transcript_ids = vec![Vec::new(); self.len()];
1✔
715

716
        let mut backup_id_tr = BiHashMap::new();
1✔
717

718
        let id_tr = if let Some(id_tr) = translator.mut_id_translator() {
1✔
719
            id_tr
1✔
720
        } else {
NEW
721
            &mut backup_id_tr
×
722
        };
723

724
        // go through each transcript and map to graph
725
        for result in reader.records() {
15✔
726
            let record = result.expect("error parsing transcripts fasta");
15✔
727

728
            // get gene id or make new id
729
            let gene_id = match id_tr.get_by_left(&record.id().to_string()) {
15✔
NEW
730
                Some(id) => *id,
×
731
                None => {
732
                    let new_id = id_tr.len() as ID;
15✔
733
                    id_tr.insert(record.id().to_string(), new_id);
15✔
734
                    new_id
15✔
735
                }
736
            };
737

738
            // iterate over k-mers in transcript and find each one in the graph
739
            let sequence = DnaString::from_acgt_bytes(record.seq());
15✔
740
            for kmer in sequence.iter_kmers::<K>() {
14,148✔
741
                if let Some(node) = self.search_kmer(kmer, Dir::Right) {
14,148✔
742
                    node_transcript_ids[node].push(gene_id);
13,284✔
743
                    //println!("node: {node}, seq: {}, kmer: {:?}, ids: {:?}, tr id: {}", self.get_node(node).sequence(), kmer, self.get_node(node).data(), gene_id);                
13,284✔
744
                }
13,284✔
745
            }
746
        }
747

748
        // change to boxed slices to reduce memory
749
        let boxed_transcripts = node_transcript_ids.into_iter().map(|vec| vec.into()).collect();
21,993✔
750

751
        Ok(boxed_transcripts)
1✔
752
    }
1✔
753

754
    /// write a node to a dot file
755
    /// 
756
    /// ### Arguments: 
757
    /// 
758
    /// * `node`: [`Node<K, D>`] which will be written to a dot file
759
    /// * `node_label`: closure taking [`Node<K, D>`] and returning a string containing commands for dot nodes 
760
    /// * `edge_label`: closure taking [`Node<K, D>`], the base as a [`u8`], the incoming [`Dir`] of the edge 
761
    ///    and if the neighbor is flipped - returns a string containing commands for dot edges, 
762
    /// * `f`: writer
763
    fn node_to_dot<FN: Fn(&Node<K, D>) -> String, FE: Fn(&Node<K, D>, u8, Dir, bool) -> String>(
97,978✔
764
        &self,
97,978✔
765
        node: &Node<'_, K, D>,
97,978✔
766
        node_label: &FN,
97,978✔
767
        edge_label: &FE,
97,978✔
768
        f: &mut dyn Write,
97,978✔
769
    ) {
97,978✔
770
        writeln!(f, "n{} {}", node.node_id, node_label(node)).unwrap();
97,978✔
771
        assert_eq!(node.exts().val.count_ones() as usize, node.l_edges().len() + node.r_edges().len());
97,978✔
772

773
        for (base, id, incoming_dir, flipped) in node.l_edges() {
97,978✔
774
            writeln!(f, "n{} -> n{} {}", id, node.node_id, edge_label(node, base, incoming_dir, flipped)).unwrap();
97,591✔
775
        }
97,591✔
776

777
        for (base, id, incoming_dir, flipped) in node.r_edges() {
97,978✔
778
            writeln!(f, "n{} -> n{} {}", node.node_id, id, edge_label(node, base, incoming_dir, flipped)).unwrap();
97,165✔
779
        }
97,165✔
780
    }
97,978✔
781

782
    /// Write the graph to a dot file
783
    /// 
784
    /// ### Arguments: 
785
    /// 
786
    /// * `path`: path to the output file
787
    /// * `node_label`: closure taking [`Node<K, D>`] and returning a string containing commands for dot nodes, e.g. [`Node::node_dot_default`]
788
    /// * `edge_label`: closure taking [`Node<K, D>`], the base as a [`u8`], the incoming [`Dir`] of the edge, e.g. [`Node::edge_dot_default`]
789
    ///    and if the neighbor is flipped - returns a string containing commands for dot edges, 
790
    pub fn to_dot<P, FN, FE>(&self, path: P, node_label: &FN, edge_label: &FE) 
1✔
791
    where 
1✔
792
    P: AsRef<Path>,
1✔
793
    FN: Fn(&Node<K, D>) -> String,
1✔
794
    FE: Fn(&Node<K, D>, u8, Dir, bool) -> String,
1✔
795
    {
796
        let mut f = BufWriter::with_capacity(BUF, File::create(path).expect("error creating dot file"));
1✔
797

798
        let pb = ProgressBar::new(self.len() as u64);
1✔
799
        pb.set_style(ProgressStyle::with_template(PROGRESS_STYLE).unwrap().progress_chars("#/-"));
1✔
800
        pb.set_message(format!("{:<32}", "writing graph to DOT file"));
1✔
801

802
        writeln!(&mut f, "digraph {{\nrankdir=\"LR\"\nmodel=subset\noverlap=scalexy").unwrap();
1✔
803
        for i in (0..self.len()).progress_with(pb) {
32,658✔
804
            self.node_to_dot(&self.get_node(i), node_label, edge_label, &mut f);
32,658✔
805
        }
32,658✔
806
        writeln!(&mut f, "}}").unwrap();
1✔
807
        
808
        f.flush().unwrap();
1✔
809
        debug!("large to dot loop: {}", self.len());
1✔
810
    }
1✔
811

812
    /// Write the graph to a dot file, highlight the nodes which form the 
813
    /// "best" path, according to [`PathCompIter`], with the number of occurences 
814
    /// as the score and `solid_path` always `true`.
815
    /// The nodes are formatted according to [`Node::node_dot_default`].
816
    /// 
817
    /// ### Arguments: 
818
    /// 
819
    /// * `path`: path to the output file
820
    /// * `edge_label`: closure taking [`Node<K, D>`], the base as a [`u8`], the incoming [`Dir`] of the edge, e.g. [`Node::edge_dot_default`]
821
    ///    and if the neighbor is flipped - returns a string containing commands for dot edges, 
822
    /// * `colors`: a [`Colors`] with the color settings for the graph
823
    /// * `translator`: a [`Translator`] which translates tags or IDs to strings
824
    /// * `config`: a [`SummaryConfig`] which contains settings for the graph
825
    pub fn to_dot_with_path<P, FE, DI>(&self, path: P, edge_label: &FE, colors: &Colors<'_, D, DI>, translator: &Translator, config: &SummaryConfig, translate_id_groups: bool)
1✔
826
    where 
1✔
827
    P: AsRef<Path>,
1✔
828
    D: SummaryData<DI>,
1✔
829
    FE: Fn(&Node<K, D>, u8, Dir, bool) -> String,
1✔
830
    {
831
        let mut f = BufWriter::with_capacity(BUF, File::create(path).expect("error creating dot file"));
1✔
832

833
        writeln!(&mut f, "digraph {{\nrankdir=\"LR\"\nmodel=subset\noverlap=scalexy").unwrap();
1✔
834

835
        // iterate over components
836
        for (component, path) in self.iter_max_path_comp(|d| d.sum().unwrap_or(1) as f32, |_| true) {
62,565✔
837
            let hashed_path = path.into_iter().map(|(id, _)| id).collect::<HashSet<usize>>();
233✔
838
            for node_id in component {
32,891✔
839
                self.node_to_dot(
32,658✔
840
                    &self.get_node(node_id),
32,658✔
841
                    &|node| node.node_dot_default(colors, config, translator, hashed_path.contains(&node_id), translate_id_groups), 
32,658✔
842
                    edge_label, 
32,658✔
843
                    &mut f
32,658✔
844
                );
845
            }
846
        }
847

848
        writeln!(&mut f, "}}").unwrap();
1✔
849
        
850
        f.flush().unwrap();
1✔
851
        debug!("large to dot loop: {}", self.len());
1✔
852
    }
1✔
853

854
    /// Write the graph to a dot file in parallel
855
    /// Will write in to n_threads files simultaniously,
856
    /// then go though the files and add the contents to a larger file, 
857
    /// and delete the small files.
858
    /// 
859
    /// ### Arguments: 
860
    /// 
861
    /// * `path`: path to the output file
862
    /// * `node_label`: closure taking [`Node<K, D>`] and returning a string containing commands for dot nodes 
863
    /// * `edge_label`: closure taking [`Node<K, D>`], the base as a [`u8`], the incoming [`Dir`] of the edge 
864
    ///    and if the neighbor is flipped - returns a string containing commands for dot edges, 
865
    pub fn to_dot_parallel<P, FN, FE>(&self, path: P, node_label: &FN, edge_label: &FE) 
1✔
866
    where 
1✔
867
        D: Sync,
1✔
868
        K: Sync,
1✔
869
        P: AsRef<Path> + Display + Sync,
1✔
870
        FN: Fn(&Node<K, D>) -> String + Sync,
1✔
871
        FE: Fn(&Node<K, D>, u8, Dir, bool) -> String + Sync,
1✔
872
    {        
873
        let slices = current_num_threads();
1✔
874
        let n_nodes = self.len();
1✔
875
        let sz = n_nodes / slices + 1;
1✔
876

877
        debug!("n_nodes: {}", n_nodes);
1✔
878
        debug!("sz: {}", sz);
1✔
879

880
        let mut parallel_ranges = Vec::with_capacity(slices);
1✔
881
        let mut start = 0;
1✔
882
        while start < n_nodes {
5✔
883
            parallel_ranges.push(start..start + sz);
4✔
884
            start += sz;
4✔
885
        }
4✔
886

887
        let last_start = parallel_ranges.pop().expect("no kmers in parallel ranges").start;
1✔
888
        parallel_ranges.push(last_start..n_nodes);
1✔
889
        debug!("parallel ranges: {:?}", parallel_ranges);
1✔
890

891
        let mut files = Vec::with_capacity(current_num_threads());
1✔
892

893
        for i in 0..parallel_ranges.len() {
4✔
894
            files.push(format!("{}-{}.dot", path, i));
4✔
895
        } 
4✔
896

897
        let pb = ProgressBar::new(self.len() as u64);
1✔
898
        pb.set_style(ProgressStyle::with_template(PROGRESS_STYLE).unwrap().progress_chars("#/-"));
1✔
899
        pb.set_message(format!("{:<32}", "writing graph to DOT files"));
1✔
900
    
901
        parallel_ranges.into_par_iter().enumerate().for_each(|(i, range)| {
4✔
902
            let mut f = BufWriter::with_capacity(BUF, File::create(&files[i]).expect("error creating parallel dot file"));
4✔
903

904
            for i in range {
32,662✔
905
                self.node_to_dot(&self.get_node(i), node_label, edge_label, &mut f);
32,658✔
906
                pb.inc(1);
32,658✔
907
            }
32,658✔
908

909
            f.flush().unwrap();
4✔
910
        });
4✔
911
        pb.finish_and_clear();
1✔
912

913
        let mut out_file = BufWriter::with_capacity(BUF, File::create(path).expect("error creating combined dot file"));
1✔
914

915
        writeln!(&mut out_file, "digraph {{\nrankdir=\"LR\"\nmodel=subset\noverlap=scalexy").unwrap();
1✔
916

917
        let pb = ProgressBar::new(files.len() as u64);
1✔
918
        pb.set_style(ProgressStyle::with_template(PROGRESS_STYLE).unwrap().progress_chars("#/-"));
1✔
919
        pb.set_message(format!("{:<32}", "combining files"));
1✔
920

921
        for file in files.iter().progress_with(pb) {
4✔
922
            let open_file = File::open(file).expect("error opening parallel dot file");
4✔
923
            let mut reader = BufReader::new(open_file);
4✔
924
            let mut buffer = [0; BUF];
4✔
925

926
            loop {
927
                let linecount = reader.read(&mut buffer).unwrap();
219✔
928
                if linecount == 0 { break }
219✔
929
                out_file.write_all(&buffer[..linecount]).unwrap();
215✔
930
            }
931

932
            remove_file(file).unwrap();
4✔
933
        }
934

935
        writeln!(&mut out_file, "}}").unwrap();
1✔
936

937
        out_file.flush().unwrap();
1✔
938

939

940
    }
1✔
941

942

943
    /// Write part of the graph to a dot file
944
    /// 
945
    /// ### Arguments: 
946
    /// 
947
    /// * `path`: path to the output file
948
    /// * `node_label`: closure taking [`Node<K, D>`] and returning a string containing commands for dot nodes 
949
    /// * `edge_label`: closure taking [`Node<K, D>`], the base as a [`u8`], the incoming [`Dir`] of the edge 
950
    ///    and if the neighbor is flipped - returns a string containing commands for dot edges, 
951
    /// * `nodes`: [`Vec<usize>`] listing all IDs of nodes which should be included
952
    pub fn to_dot_partial<P, FN, FE>(&self, path: P, node_label: &FN, edge_label: &FE, nodes: Vec<usize>) 
1✔
953
    where 
1✔
954
        P: AsRef<Path>,
1✔
955
        FN: Fn(&Node<K, D>) -> String,
1✔
956
        FE: Fn(&Node<K, D>, u8, Dir, bool) -> String,
1✔
957
    {
958
        let mut f = BufWriter::with_capacity(BUF, File::create(path).expect("error creating dot file"));
1✔
959

960
        let pb = ProgressBar::new(nodes.len() as u64);
1✔
961
        pb.set_style(ProgressStyle::with_template(PROGRESS_STYLE).unwrap().progress_chars("#/-"));
1✔
962
        pb.set_message(format!("{:<32}", "writing graph to DOT file"));
1✔
963

964
        writeln!(&mut f, "digraph {{\nrankdir=\"LR\"\nmodel=subset\noverlap=scalexy").unwrap();
1✔
965
        for i in nodes.into_iter().progress_with(pb) {
4✔
966
            self.node_to_dot(&self.get_node(i), node_label, edge_label, &mut f);
4✔
967
        }
4✔
968
        writeln!(&mut f, "}}").unwrap();
1✔
969

970
        f.flush().unwrap();
1✔
971

972
        debug!("large to dot loop: {}", self.len());
1✔
973
    }
1✔
974

975
    fn node_to_gfa<F: Fn(&Node<'_, K, D>) -> String>(
97,978✔
976
        &self,
97,978✔
977
        node: &Node<'_, K, D>,
97,978✔
978
        w: &mut dyn Write,
97,978✔
979
        tag_func: Option<&F>,
97,978✔
980
    ) -> Result<(), Error> {
97,978✔
981
        match tag_func {
97,978✔
982
            Some(f) => {
65,320✔
983
                let tags = (f)(node);
65,320✔
984
                writeln!(
65,320✔
985
                    w,
65,320✔
986
                    "S\t{}\t{}\t{}",
65,320✔
987
                    node.node_id,
988
                    node.sequence().to_dna_string(),
65,320✔
989
                    tags
990
                )?;
×
991
            }
992
            _ => writeln!(
32,658✔
993
                w,
32,658✔
994
                "S\t{}\t{}",
32,658✔
995
                node.node_id,
996
                node.sequence().to_dna_string()
32,658✔
997
            )?,
×
998
        }
999

1000
        for (_, target, dir, _) in node.l_edges() {
97,978✔
1001
            if target >= node.node_id {
97,591✔
1002
                let to_dir = match dir {
48,748✔
1003
                    Dir::Left => "+",
19,433✔
1004
                    Dir::Right => "-",
29,315✔
1005
                };
1006
                writeln!(
48,748✔
1007
                    w,
48,748✔
1008
                    "L\t{}\t-\t{}\t{}\t{}M",
48,748✔
1009
                    node.node_id,
1010
                    target,
1011
                    to_dir,
1012
                    K::k() - 1
48,748✔
1013
                )?;
×
1014
            }
48,843✔
1015
        }
1016

1017
        for (_, target, dir, _) in node.r_edges() {
97,978✔
1018
            if target > node.node_id {
97,165✔
1019
                let to_dir = match dir {
48,634✔
1020
                    Dir::Left => "+",
29,415✔
1021
                    Dir::Right => "-",
19,219✔
1022
                };
1023
                writeln!(
48,634✔
1024
                    w,
48,634✔
1025
                    "L\t{}\t+\t{}\t{}\t{}M",
48,634✔
1026
                    node.node_id,
1027
                    target,
1028
                    to_dir,
1029
                    K::k() - 1
48,634✔
1030
                )?;
×
1031
            }
48,531✔
1032
        }
1033

1034
        Ok(())
97,978✔
1035
    }
97,978✔
1036

1037
    /// Write the graph to GFA format
1038
    pub fn to_gfa<P: AsRef<Path>>(&self, gfa_out: P) -> Result<(), Error> {
1✔
1039
        let wtr = BufWriter::with_capacity(BUF, File::create(gfa_out).expect("error creating gfa file"));
1✔
1040
        self.write_gfa(&mut std::io::BufWriter::new(wtr))
1✔
1041
    }
1✔
1042

1043
    pub fn write_gfa(&self, wtr: &mut impl Write) -> Result<(), Error> {
1✔
1044
        writeln!(wtr, "H\tVN:Z:debruijn-rs")?;
1✔
1045

1046
        type DummyFn<K, D> = fn(&Node<'_, K, D>) -> String;
1047
        let dummy_opt: Option<&DummyFn<K, D>> = None;
1✔
1048

1049
        let pb = ProgressBar::new(self.len() as u64);
1✔
1050
        pb.set_style(ProgressStyle::with_template(PROGRESS_STYLE).unwrap().progress_chars("#/-"));
1✔
1051
        pb.set_message(format!("{:<32}", "writing graph to GFA file"));
1✔
1052

1053
        for i in (0..self.len()).progress_with(pb) {
32,658✔
1054
            let n = self.get_node(i);
32,658✔
1055
            self.node_to_gfa(&n, wtr, dummy_opt)?;
32,658✔
1056
        }
1057

1058
        wtr.flush().unwrap();
1✔
1059

1060
        Ok(())
1✔
1061
    }
1✔
1062

1063
    /// Write the graph to GFA format
1064
    pub fn to_gfa_with_tags<P: AsRef<Path>, F: Fn(&Node<'_, K, D>) -> String>(
1✔
1065
        &self,
1✔
1066
        gfa_out: P,
1✔
1067
        tag_func: F,
1✔
1068
    ) -> Result<(), Error> {
1✔
1069
        let mut wtr = BufWriter::with_capacity(BUF, File::create(gfa_out).expect("error creatinf gfa file"));
1✔
1070
        writeln!(wtr, "H\tVN:Z:debruijn-rs")?;
1✔
1071

1072
        let pb = ProgressBar::new(self.len() as u64);
1✔
1073
        pb.set_style(ProgressStyle::with_template(PROGRESS_STYLE).unwrap().progress_chars("#/-"));
1✔
1074
        pb.set_message(format!("{:<32}", "writing graph to GFA file"));
1✔
1075

1076
        for i in (0..self.len()).progress_with(pb) {
32,658✔
1077
            let n = self.get_node(i);
32,658✔
1078
            self.node_to_gfa(&n, &mut wtr, Some(&tag_func))?;
32,658✔
1079
        }
1080

1081
        wtr.flush().unwrap();
1✔
1082

1083
        Ok(())
1✔
1084
    }
1✔
1085

1086
    /// Write the graph to GFA format, with multithreading, 
1087
    /// pass `tag_func=None` to write without tags
1088
    pub fn to_gfa_otags_parallel<P: AsRef<Path> + Display + Sync, F: Fn(&Node<'_, K, D>) -> String + Sync>(
1✔
1089
        &self,
1✔
1090
        gfa_out: P,
1✔
1091
        tag_func: Option<&F>,
1✔
1092
    ) -> Result<(), Error> 
1✔
1093
    where 
1✔
1094
    K: Sync,
1✔
1095
    D: Sync,
1✔
1096
    {
1097
        // split into ranges according to thread count
1098
        let slices = current_num_threads();
1✔
1099
        let n_nodes = self.len();
1✔
1100
        let sz = n_nodes / slices + 1;
1✔
1101

1102
        debug!("n_nodes: {}", n_nodes);
1✔
1103
        debug!("sz: {}", sz);
1✔
1104

1105
        let mut parallel_ranges = Vec::with_capacity(slices);
1✔
1106
        let mut start = 0;
1✔
1107
        while start < n_nodes {
5✔
1108
            parallel_ranges.push(start..start + sz);
4✔
1109
            start += sz;
4✔
1110
        }
4✔
1111

1112
        let last_start = parallel_ranges.pop().expect("no kmers in parallel ranges").start;
1✔
1113
        parallel_ranges.push(last_start..n_nodes);
1✔
1114
        debug!("parallel ranges: {:?}", parallel_ranges);
1✔
1115

1116
        let mut files = Vec::with_capacity(current_num_threads());
1✔
1117

1118
        for i in 0..parallel_ranges.len() {
4✔
1119
            files.push(format!("{}-{}.gfa", gfa_out, i));
4✔
1120
        } 
4✔
1121

1122
        let pb = ProgressBar::new(self.len() as u64);
1✔
1123
        pb.set_style(ProgressStyle::with_template(PROGRESS_STYLE).unwrap().progress_chars("#/-"));
1✔
1124
        pb.set_message(format!("{:<32}", "writing graph to GFA file"));
1✔
1125
        
1126
        
1127
        parallel_ranges.into_par_iter().enumerate().for_each(|(i, range)| {
4✔
1128
            let mut wtr = BufWriter::with_capacity(BUF, File::create(&files[i]).expect("error creating parallel gfa file"));
4✔
1129

1130
            for i in range {
32,662✔
1131
                let n = self.get_node(i);
32,658✔
1132
                self.node_to_gfa(&n, &mut wtr, tag_func).unwrap();
32,658✔
1133
                pb.inc(1);
32,658✔
1134
            }
32,658✔
1135

1136
            wtr.flush().unwrap();
4✔
1137
        });
4✔
1138

1139
        pb.finish_and_clear();
1✔
1140

1141
        // combine files
1142
        let mut out_file = BufWriter::with_capacity(BUF, File::create(format!("{}.gfa", gfa_out)).expect("error creating combined gfa file"));
1✔
1143
        writeln!(out_file, "H\tVN:Z:debruijn-rs")?;
1✔
1144

1145
        let pb = ProgressBar::new(files.len() as u64);
1✔
1146
        pb.set_style(ProgressStyle::with_template(PROGRESS_STYLE).unwrap().progress_chars("#/-"));
1✔
1147
        pb.set_message(format!("{:<32}", "combining files"));
1✔
1148

1149
        for file in files.iter() {
4✔
1150
            let open_file = File::open(file).expect("error opening parallel gfa file");
4✔
1151
            let mut reader = BufReader::new(open_file);
4✔
1152
            let mut buffer = [0; BUF];
4✔
1153

1154
            loop {
1155
                let linecount = reader.read(&mut buffer).unwrap();
113✔
1156
                if linecount == 0 { break }
113✔
1157
                out_file.write_all(&buffer[..linecount]).unwrap();
109✔
1158
            }
1159

1160
            remove_file(file).unwrap();
4✔
1161
        }
1162

1163
        out_file.flush().unwrap();
1✔
1164

1165
        Ok(())
1✔
1166
    }
1✔
1167

1168
    /// Write the graph to GFA format
1169
    pub fn to_gfa_partial<P: AsRef<Path>, F: Fn(&Node<'_, K, D>) -> String>(&self, gfa_out: P, tag_func: Option<&F>, nodes: Vec<usize>) -> Result<(), Error> {
1✔
1170
        let mut wtr = BufWriter::with_capacity(BUF, File::create(gfa_out).expect("error creating gfa file"));
1✔
1171
        writeln!(wtr, "H\tVN:Z:debruijn-rs")?;
1✔
1172

1173
        let pb = ProgressBar::new(self.len() as u64);
1✔
1174
        pb.set_style(ProgressStyle::with_template(PROGRESS_STYLE).unwrap().progress_chars("#/-"));
1✔
1175
        pb.set_message(format!("{:<32}", "writing graph to GFA file"));
1✔
1176

1177
        for i in nodes.into_iter().progress_with(pb) {
4✔
1178
            let n = self.get_node(i);
4✔
1179
            self.node_to_gfa(&n, &mut wtr, tag_func)?;
4✔
1180
        }
1181

1182
        wtr.flush().unwrap();
1✔
1183

1184
        Ok(())    
1✔
1185
    }
1✔
1186

1187
    pub fn to_json_rest<W: Write, F: Fn(&D) -> Value>(
×
1188
        &self,
×
1189
        fmt_func: F,
×
1190
        mut writer: &mut W,
×
1191
        rest: Option<Value>,
×
1192
    ) {
×
1193
        writeln!(writer, "{{\n\"nodes\": [").unwrap();
×
1194
        for i in 0..self.len() {
×
1195
            let node = self.get_node(i);
×
1196
            node.to_json(&fmt_func, writer);
×
1197
            if i == self.len() - 1 {
×
1198
                writeln!(writer).unwrap();
×
1199
            } else {
×
1200
                writeln!(writer, ",").unwrap();
×
1201
            }
×
1202
        }
1203
        writeln!(writer, "],").unwrap();
×
1204

1205
        writeln!(writer, "\"links\": [").unwrap();
×
1206
        for i in 0..self.len() {
×
1207
            let node = self.get_node(i);
×
1208
            match node.edges_to_json(writer) {
×
1209
                true => {
1210
                    if i == self.len() - 1 {
×
1211
                        writeln!(writer).unwrap();
×
1212
                    } else {
×
1213
                        writeln!(writer, ",").unwrap();
×
1214
                    }
×
1215
                }
1216
                _ => continue,
×
1217
            }
1218
        }
1219
        writeln!(writer, "]").unwrap();
×
1220

1221
        match rest {
×
1222
            Some(Value::Object(v)) => {
×
1223
                for (k, v) in v.iter() {
×
1224
                    writeln!(writer, ",").expect("io error");
×
1225
                    write!(writer, "\"{}\": ", k).expect("io error");
×
1226
                    serde_json::to_writer(&mut writer, v).expect("io error");
×
1227
                    writeln!(writer).expect("io error");
×
1228
                }
×
1229
            }
1230
            _ => {
×
1231
                writeln!(writer).expect("io error");
×
1232
            }
×
1233
        }
1234

1235
        writeln!(writer, "}}").expect("io error");
×
1236
    }
×
1237

1238
    /// Write the graph to JSON
1239
    pub fn to_json<W: Write, F: Fn(&D) -> Value, RF: Fn(&mut W)>(
×
1240
        &self,
×
1241
        fmt_func: F,
×
1242
        writer: &mut W,
×
1243
    ) {
×
1244
        self.to_json_rest(fmt_func, writer, None);
×
1245
    }
×
1246

1247
    /// Print a text representation of the graph.
1248
    pub fn print(&self) {
2✔
1249
        println!("DebruijnGraph {{ len: {}, K: {} }} :", self.len(), K::k());
2✔
1250
        for node in self.iter_nodes() {
142✔
1251
            println!("{:?}", node);
142✔
1252
        }
142✔
1253
    }
2✔
1254

1255
    pub fn print_with_data(&self) {
×
1256
        println!("DebruijnGraph {{ len: {}, K: {} }} :", self.len(), K::k());
×
1257
        for node in self.iter_nodes() {
×
1258
            println!("{:?} ({:?})", node, node.data());
×
1259
        }
×
1260
    }
×
1261

1262
    pub fn max_path_beam<F, F2>(&self, beam: usize, score: F, _solid_path: F2) -> Vec<(usize, Dir)>
×
1263
    where
×
1264
        F: Fn(&D) -> f32,
×
1265
        F2: Fn(&D) -> bool,
×
1266
    {
1267
        if self.is_empty() {
×
1268
            return Vec::default();
×
1269
        }
×
1270

1271
        let mut states = Vec::new();
×
1272

1273
        for i in 0..self.len() {
×
1274
            let node = self.get_node(i);
×
1275

1276
            // Initialize beam search on terminal nodes
1277
            if node.exts().num_exts_l() == 0 || node.exts().num_exts_r() == 0 {
×
1278
                let dir = if node.exts().num_exts_l() > 0 {
×
1279
                    Dir::Right
×
1280
                } else {
1281
                    Dir::Left
×
1282
                };
1283

1284
                let status = if node.exts().num_exts_l() == 0 && node.exts().num_exts_r() == 0 {
×
1285
                    Status::End
×
1286
                } else {
1287
                    Status::Active
×
1288
                };
1289

1290
                let mut path = SmallVec8::new();
×
1291
                path.push((i as u32, dir));
×
1292

1293
                let s = State {
×
1294
                    path,
×
1295
                    status,
×
1296
                    score: score(node.data()),
×
1297
                };
×
1298
                states.push(s);
×
1299
            }
×
1300
        }
1301

1302
        // No end nodes -- just start on the first node
1303
        if states.is_empty() {
×
1304
            // Make a start
×
1305
            let node = self.get_node(0);
×
1306
            let mut path = SmallVec8::new();
×
1307
            path.push((0, Dir::Left));
×
1308
            states.push(State {
×
1309
                path,
×
1310
                status: Status::Active,
×
1311
                score: score(node.data()),
×
1312
            });
×
1313
        }
×
1314

1315
        // Beam search until we can't find any more expansions
1316
        let mut active = true;
×
1317
        while active {
×
1318
            let mut new_states = Vec::with_capacity(states.len());
×
1319
            active = false;
×
1320

1321
            for s in states {
×
1322
                if s.status == Status::Active {
×
1323
                    active = true;
×
1324
                    let expanded = self.expand_state(&s, &score);
×
1325
                    new_states.extend(expanded);
×
1326
                } else {
×
1327
                    new_states.push(s)
×
1328
                }
1329
            }
1330

1331
            // workaround to sort by descending score - will panic if there are NaN scores
1332
            new_states.sort_by(|a, b| (-(a.score)).partial_cmp(&-(b.score)).unwrap());
×
1333
            new_states.truncate(beam);
×
1334
            states = new_states;
×
1335
        }
1336

1337
        for (i, state) in states.iter().take(5).enumerate() {
×
1338
            trace!("i:{}  -- {:?}", i, state);
×
1339
        }
1340

1341
        // convert back to using usize for node_id
1342
        states[0]
×
1343
            .path
×
1344
            .iter()
×
1345
            .map(|&(node, dir)| (node as usize, dir))
×
1346
            .collect()
×
1347
    }
×
1348

1349
    fn expand_state<F>(&self, state: &State, score: &F) -> SmallVec4<State>
×
1350
    where
×
1351
        F: Fn(&D) -> f32,
×
1352
    {
1353
        if state.status != Status::Active {
×
1354
            panic!("only attempt to expand active states")
×
1355
        }
×
1356

1357
        let (node_id, dir) = state.path[state.path.len() - 1];
×
1358
        let node = self.get_node(node_id as usize);
×
1359
        let mut new_states = SmallVec4::new();
×
1360

1361
        for (_, next_node_id, incoming_dir, _) in node.edges(dir.flip()) {
×
1362
            let next_node = self.get_node(next_node_id);
×
1363
            let new_score = state.score + score(next_node.data());
×
1364

1365
            let cycle = state
×
1366
                .path
×
1367
                .iter()
×
1368
                .any(|&(prev_node, _)| prev_node == (next_node_id as u32));
×
1369

1370
            let status = if cycle {
×
1371
                Status::Cycle
×
1372
            } else if next_node.edges(incoming_dir.flip()).is_empty() {
×
1373
                Status::End
×
1374
            } else {
1375
                Status::Active
×
1376
            };
1377

1378
            let mut new_path = state.path.clone();
×
1379
            new_path.push((next_node_id as u32, incoming_dir));
×
1380

1381
            let next_state = State {
×
1382
                path: new_path,
×
1383
                score: new_score,
×
1384
                status,
×
1385
            };
×
1386

1387
            new_states.push(next_state);
×
1388
        }
1389

1390
        new_states
×
1391
    }
×
1392

1393

1394
    pub fn iter_components(&self) -> IterComponents<K, D> {
5✔
1395
        let mut visited: Vec<bool> = Vec::with_capacity(self.len());
5✔
1396
        let pos = 0;
5✔
1397

1398
        for _i in 0..self.len() {
33,218✔
1399
            visited.push(false);
33,218✔
1400
        }
33,218✔
1401

1402
        IterComponents { 
5✔
1403
            graph: self, 
5✔
1404
            visited, 
5✔
1405
            pos }
5✔
1406
    }
5✔
1407

1408

1409
    /// iteratively returns 2D Vec with node_ids grouped according to the connected components they form
1410
    pub fn components_i(&self) -> Vec<Vec<usize>> {
1✔
1411
        let mut components: Vec<Vec<usize>> = Vec::with_capacity(self.len());
1✔
1412
        let mut visited: Vec<bool> = Vec::with_capacity(self.len());
1✔
1413

1414
        for _i in 0..self.len() {
140✔
1415
            visited.push(false);
140✔
1416
        }
140✔
1417

1418
        for i in 0..self.len() {
140✔
1419
            if !visited[i] {
140✔
1420
                let comp = self.component_i(&mut visited, i);
2✔
1421
                components.push(comp);
2✔
1422
            }
138✔
1423
        }
1424

1425
        components
1✔
1426
    }
1✔
1427

1428
    /// recursively detects which nodes form separate graph components
1429
    /// returns 2D vector with node ids per component
1430
    /// (may lead to stack overflow)
1431
    pub fn components_r(&self) -> Vec<Vec<usize>> {
2✔
1432
        let mut components: Vec<Vec<usize>> = Vec::with_capacity(self.len());
2✔
1433
        let mut visited: Vec<bool> = Vec::with_capacity(self.len());
2✔
1434

1435
        for _i in 0..self.len() {
142✔
1436
            visited.push(false);
142✔
1437
        }
142✔
1438

1439
        for i in 0..self.len() {
142✔
1440
            if !visited[i] {
142✔
1441
                let mut comp: Vec<usize> = Vec::new();
4✔
1442
                self.component_r(&mut visited, i, &mut comp);
4✔
1443
                components.push(comp);
4✔
1444
            }
138✔
1445
        }
1446

1447
        components
2✔
1448

1449
    }
2✔
1450

1451
    fn component_r<'a>(&'a self, visited: &'a mut Vec<bool>, i: usize, comp: &'a mut Vec<usize>) {
142✔
1452
        
1453
        visited[i] = true;
142✔
1454
        comp.push(i);
142✔
1455
        let mut edges = self.find_edges(i, Dir::Left);
142✔
1456
        let mut r_edges = self.find_edges(i, Dir::Right);
142✔
1457

1458
        edges.append(&mut r_edges);
142✔
1459

1460
        for (_, edge, _, _) in edges.iter() {
276✔
1461
            if !visited[*edge] {
276✔
1462
                self.component_r(visited, *edge, comp);
138✔
1463
            }
138✔
1464
        }
1465
    }
142✔
1466

1467
    fn component_i<'a>(&'a self, visited: &'a mut [bool], i: usize) -> Vec<usize> {
243✔
1468
        let mut edges: Vec<usize> = Vec::new();
243✔
1469
        let mut comp: Vec<usize> = Vec::new();
243✔
1470

1471
        edges.push(i);
243✔
1472

1473
        while let Some(current_edge) = edges.pop() {
33,632✔
1474
            if !visited[current_edge] { 
33,389✔
1475
                comp.push(current_edge);
33,358✔
1476
                visited[current_edge] = true;
33,358✔
1477

1478
                let mut l_edges = self.find_edges(current_edge, Dir::Left);
33,358✔
1479
                let mut r_edges = self.find_edges(current_edge, Dir::Right);
33,358✔
1480

1481
                l_edges.append(&mut r_edges);
33,358✔
1482

1483
                for (_, new_edge, _, _) in l_edges.into_iter() {
66,296✔
1484
                    if !visited[new_edge] {
66,296✔
1485
                        edges.push(new_edge);
33,146✔
1486
                    }
33,150✔
1487
                }
1488
            }
31✔
1489
        }
1490
        comp
243✔
1491
    }
243✔
1492

1493
    /// iterate over all edges of the graph, item: (node, ext base, ext dir, target node)
1494
    pub fn iter_edges(&self) -> EdgeIter<'_, K, D> {
1✔
1495
        EdgeIter::new(self)
1✔
1496
    }
1✔
1497

1498
    pub fn find_bad_nodes<F: Fn(&Node<'_, K, D>) -> bool>(&self, valid: F) -> Vec<usize> {
×
1499
        let mut bad_nodes = Vec::new();
×
1500

1501
        for (i, node) in enumerate(self.iter_nodes()) {
×
1502
            if !valid(&node) { bad_nodes.push(i); }
×
1503
        }
1504
        
1505
        bad_nodes
×
1506
    }
×
1507
}
1508

1509
impl<K: Kmer, SD: Debug> DebruijnGraph<K, SD> {
1510
    pub fn create_colors<'a, 'b: 'a, DI>(&'a self, config: &SummaryConfig, color_mode: ColorMode<'b>) -> Colors<'b, SD, DI> 
×
1511
    where 
×
1512
    SD: SummaryData<DI>,
×
1513
    {
1514
        Colors::new(self, config, color_mode)
×
1515
    }
×
1516
    
1517
    /// [`crate::EdgeMult`] will contain hanging edges if the nodes were filtered
1518
    pub fn fix_edge_mults<DI>(&mut self) 
2✔
1519
    where 
2✔
1520
        SD: SummaryData<DI>
2✔
1521
    {
1522
        if self.get_node(0).data().edge_mults().is_some() {
2✔
1523
            for i in 0..self.len() {
280✔
1524
                self.base.data[i].fix_edge_mults(self.base.exts[i]);
280✔
1525
            }
280✔
1526
        }
×
1527
    }
2✔
1528

1529
    /// if there are [`crate::EdgeMult`]s in the data, prune the graph by removing edges that have a low coverage
1530
    pub fn filter_edges<DI>(&mut self, min: u32) -> Result<(), String>
1✔
1531
    where 
1✔
1532
        SD: SummaryData<DI>
1✔
1533
    {
1534
        // return if there is no edge coverage available
1535
        if self.get_node(0).data().edge_mults().is_none() { return Err(String::from("no edge mults available")) };
1✔
1536

1537
        for i in 0..self.len() {
140✔
1538
            let em = self.get_node(i).data().edge_mults().expect("shold have em").clone();
140✔
1539
            let edges = [(Dir::Left, 0), (Dir::Left, 1), (Dir::Left, 2), (Dir::Left, 3), 
140✔
1540
                (Dir::Right, 0), (Dir::Right, 1), (Dir::Right, 2), (Dir::Right, 3)];
140✔
1541
            
1542
            for (dir, base) in edges {
1,260✔
1543
                if min > em.edge_mult(base, dir) {
1,120✔
1544
                    // remove invalid ext from node
844✔
1545
                    let ext = self.base.exts[i].remove(dir, base);
844✔
1546
                    self.base.exts[i] = ext;
844✔
1547
                }
844✔
1548
            }
1549
        }
1550

1551
        // now that exts are remove, fix hanging edge mults
1552
        self.fix_edge_mults();
1✔
1553

1554
        Ok(())
1✔
1555
    }
1✔
1556
}
1557

1558
#[derive(Debug, Eq, PartialEq)]
1559
enum Status {
1560
    Active,
1561
    End,
1562
    Cycle,
1563
}
1564

1565
#[derive(Debug)]
1566
struct State {
1567
    path: SmallVec8<(u32, Dir)>,
1568
    score: f32,
1569
    status: Status,
1570
}
1571

1572
impl State {}
1573

1574
/// Iterator over nodes in a `DeBruijnGraph`
1575
pub struct NodeIter<'a, K: Kmer + 'a, D: Debug + 'a> {
1576
    graph: &'a DebruijnGraph<K, D>,
1577
    node_id: usize,
1578
}
1579

1580
impl<'a, K: Kmer + 'a, D: Debug + 'a> Iterator for NodeIter<'a, K, D> {
1581
    type Item = Node<'a, K, D>;
1582

1583
    fn next(&mut self) -> Option<Node<'a, K, D>> {
543,434✔
1584
        if self.node_id < self.graph.len() {
543,434✔
1585
            let node = self.graph.get_node(self.node_id);
543,305✔
1586
            self.node_id += 1;
543,305✔
1587
            Some(node)
543,305✔
1588
        } else {
1589
            None
129✔
1590
        }
1591
    }
543,434✔
1592
}
1593

1594
impl<'a, K: Kmer + 'a, D: Debug + 'a> IntoIterator for &'a DebruijnGraph<K, D> {
1595
    type Item = NodeKmer<'a, K, D>;
1596
    type IntoIter = NodeIntoIter<'a, K, D>;
1597

1598
    fn into_iter(self) -> Self::IntoIter {
2,217✔
1599
        NodeIntoIter {
2,217✔
1600
            graph: self,
2,217✔
1601
            node_id: 0,
2,217✔
1602
        }
2,217✔
1603
    }
2,217✔
1604
}
1605

1606
/// Iterator over nodes in a `DeBruijnGraph`
1607
pub struct NodeIntoIter<'a, K: Kmer + 'a, D: Debug + 'a> {
1608
    graph: &'a DebruijnGraph<K, D>,
1609
    node_id: usize,
1610
}
1611

1612
impl<'a, K: Kmer + 'a, D: Debug + 'a> Iterator for NodeIntoIter<'a, K, D> {
1613
    type Item = NodeKmer<'a, K, D>;
1614

1615
    fn next(&mut self) -> Option<Self::Item> {
285,540✔
1616
        if self.node_id < self.graph.len() {
285,540✔
1617
            let node_id = self.node_id;
285,540✔
1618
            let node = self.graph.get_node(node_id);
285,540✔
1619
            let node_seq = node.sequence();
285,540✔
1620

1621
            self.node_id += 1;
285,540✔
1622
            Some(NodeKmer {
285,540✔
1623
                node_id,
285,540✔
1624
                node_seq_slice: node_seq,
285,540✔
1625
                phantom_d: PhantomData,
285,540✔
1626
                phantom_k: PhantomData,
285,540✔
1627
            })
285,540✔
1628
        } else {
1629
            None
×
1630
        }
1631
    }
285,540✔
1632
}
1633

1634
/// A `DebruijnGraph` node with a reference to the sequence of the node.
1635
#[derive(Clone)]
1636
pub struct NodeKmer<'a, K: Kmer + 'a, D: Debug + 'a> {
1637
    pub node_id: usize,
1638
    node_seq_slice: DnaStringSlice<'a>,
1639
    phantom_k: PhantomData<K>,
1640
    phantom_d: PhantomData<D>,
1641
}
1642

1643
/// An iterator over the kmers in a `DeBruijn graph node`
1644
pub struct NodeKmerIter<'a, K: Kmer + 'a, D: Debug + 'a> {
1645
    kmer_id: usize,
1646
    kmer: K,
1647
    num_kmers: usize,
1648
    node_seq_slice: DnaStringSlice<'a>,
1649
    phantom_k: PhantomData<K>,
1650
    phantom_d: PhantomData<D>,
1651
}
1652

1653
impl<'a, K: Kmer + 'a, D: Debug + 'a> IntoIterator for NodeKmer<'a, K, D> {
1654
    type Item = K;
1655
    type IntoIter = NodeKmerIter<'a, K, D>;
1656

1657
    fn into_iter(self) -> Self::IntoIter {
571,080✔
1658
        let num_kmers = self.node_seq_slice.len() - K::k() + 1;
571,080✔
1659

1660
        let kmer = if num_kmers > 0 {
571,080✔
1661
            self.node_seq_slice.get_kmer::<K>(0)
571,080✔
1662
        } else {
1663
            K::empty()
×
1664
        };
1665

1666
        NodeKmerIter {
571,080✔
1667
            kmer_id: 0,
571,080✔
1668
            kmer,
571,080✔
1669
            num_kmers,
571,080✔
1670
            node_seq_slice: self.node_seq_slice,
571,080✔
1671
            phantom_k: PhantomData,
571,080✔
1672
            phantom_d: PhantomData,
571,080✔
1673
        }
571,080✔
1674
    }
571,080✔
1675
}
1676

1677
impl<'a, K: Kmer + 'a, D: Debug + 'a> Iterator for NodeKmerIter<'a, K, D> {
1678
    type Item = K;
1679

1680
    fn next(&mut self) -> Option<Self::Item> {
3,286,290✔
1681
        if self.num_kmers == self.kmer_id {
3,286,290✔
1682
            None
×
1683
        } else {
1684
            let current_kmer = self.kmer;
3,286,290✔
1685

1686
            self.kmer_id += 1;
3,286,290✔
1687
            if self.kmer_id < self.num_kmers {
3,286,290✔
1688
                let next_base = self.node_seq_slice.get(self.kmer_id + K::k() - 1);
3,212,328✔
1689
                let new_kmer = self.kmer.extend_right(next_base);
3,212,328✔
1690
                self.kmer = new_kmer;
3,212,328✔
1691
            }
3,212,328✔
1692

1693
            Some(current_kmer)
3,286,290✔
1694
        }
1695
    }
3,286,290✔
1696

1697
    fn size_hint(&self) -> (usize, Option<usize>) {
285,540✔
1698
        (self.num_kmers, Some(self.num_kmers))
285,540✔
1699
    }
285,540✔
1700

1701
    /// Provide a 'fast-forward' capability for this iterator
1702
    /// MPHF will use this to reduce the number of kmers that
1703
    /// need to be produced.
1704
    fn nth(&mut self, n: usize) -> Option<Self::Item> {
2,366,508✔
1705
        if n <= 4 {
2,366,508✔
1706
            // for small skips forward, shift one base at a time
1707
            for _ in 0..n {
2,132,486✔
1708
                self.next();
919,782✔
1709
            }
919,782✔
1710
        } else {
234,022✔
1711
            self.kmer_id += n;
234,022✔
1712
            self.kmer = self.node_seq_slice.get_kmer::<K>(self.kmer_id);
234,022✔
1713
        }
234,022✔
1714

1715
        self.next()
2,366,508✔
1716
    }
2,366,508✔
1717
}
1718

1719
/// Marker signifying that NodeKmerIter has a known size.
1720
impl<'a, K: Kmer + 'a, D: Debug + 'a> ExactSizeIterator for NodeKmerIter<'a, K, D> {}
1721

1722
/// Unbranched sequence in the DeBruijn graph
1723
pub struct Node<'a, K: Kmer + 'a, D: 'a> {
1724
    pub node_id: usize,
1725
    pub graph: &'a DebruijnGraph<K, D>,
1726
}
1727

1728
impl<'a, K: Kmer, D: Debug> Node<'a, K, D> {
1729
    /// Length of the sequence of this node
1730
    pub fn len(&self) -> usize {
1,548,282✔
1731
        self.graph.base.sequences.get(self.node_id).len()
1,548,282✔
1732
    }
1,548,282✔
1733

1734
    pub fn is_empty(&self) -> bool {
×
1735
        self.graph.base.sequences.get(self.node_id).is_empty()
×
1736
    }
×
1737

1738
    /// Sequence of the node
1739
    pub fn sequence(&self) -> DnaStringSlice<'a> {
3,235,801✔
1740
        self.graph.base.sequences.get(self.node_id)
3,235,801✔
1741
    }
3,235,801✔
1742

1743
    /// Reference to auxiliarly data associated with the node
1744
    pub fn data(&self) -> &'a D {
3,757,977✔
1745
        &self.graph.base.data[self.node_id]
3,757,977✔
1746
    }
3,757,977✔
1747

1748
    /// Extension bases from this node
1749
    pub fn exts(&self) -> Exts {
2,249,719✔
1750
        self.graph.base.exts[self.node_id]
2,249,719✔
1751
    }
2,249,719✔
1752

1753
    /// Edges leaving the left side of the node in the format
1754
    /// (base, target_node id, incoming side of target node, whether target node is flipped)
1755
    pub fn l_edges(&self) -> SmallVec4<(u8, usize, Dir, bool)> {
294,104✔
1756
        self.graph.find_edges(self.node_id, Dir::Left)
294,104✔
1757
    }
294,104✔
1758

1759
    /// Edges leaving the right side of the node in the format
1760
    /// (base, target_node id, incoming side of target node, whether target node is flipped)
1761
    pub fn r_edges(&self) -> SmallVec4<(u8, usize, Dir, bool)> {
294,104✔
1762
        self.graph.find_edges(self.node_id, Dir::Right)
294,104✔
1763
    }
294,104✔
1764

1765
    /// Edges leaving the 'dir' side of the node in the format
1766
    /// (base, target_node id, incoming side of target node, whether target node is flipped)
1767
    pub fn edges(&self, dir: Dir) -> SmallVec4<(u8, usize, Dir, bool)> {
528,815✔
1768
        self.graph.find_edges(self.node_id, dir)
528,815✔
1769
    }
528,815✔
1770

1771
    fn to_json<F: Fn(&D) -> Value>(&self, func: &F, f: &mut dyn Write) {
×
1772
        write!(
×
1773
            f,
×
1774
            "{{\"id\":\"{}\",\"L\":{},\"D\":{},\"Se\":\"{:?}\"}}",
×
1775
            self.node_id,
1776
            self.sequence().len(),
×
1777
            (func)(self.data()),
×
1778
            self.sequence(),
×
1779
        )
1780
        .unwrap();
×
1781
    }
×
1782

1783
    fn edges_to_json(&self, f: &mut dyn Write) -> bool {
×
1784
        let mut wrote = false;
×
1785
        let edges = self.r_edges();
×
1786
        for (idx, &(_, id, incoming_dir, _)) in edges.iter().enumerate() {
×
1787
            write!(
×
1788
                f,
×
1789
                "{{\"source\":\"{}\",\"target\":\"{}\",\"D\":\"{}\"}}",
×
1790
                self.node_id,
1791
                id,
1792
                match incoming_dir {
×
1793
                    Dir::Left => "L",
×
1794
                    Dir::Right => "R",
×
1795
                }
1796
            )
1797
            .unwrap();
×
1798

1799
            if idx < edges.len() - 1 {
×
1800
                write!(f, ",").unwrap();
×
1801
            }
×
1802

1803
            wrote = true;
×
1804
        }
1805
        wrote
×
1806
    }
×
1807
}
1808

1809
// TODO make generic instead of u8 (u8 is sufficient for dbg)
1810
impl<K: Kmer, SD: Debug> Node<'_, K, SD>  {
1811
    /// get default format for dot edges based on node data
1812
    pub fn edge_dot_default<DI>(&self, colors: &Colors<SD, DI>, base: u8, incoming_dir: Dir, flipped: bool) -> String 
194,761✔
1813
    where SD: SummaryData<DI>
194,761✔
1814
    {
1815
        // set color based on dir
1816
        let color = match incoming_dir {
194,761✔
1817
            Dir::Left => "blue",
97,593✔
1818
            Dir::Right => "red"
97,168✔
1819
        };
1820
        
1821
        if let Some(em) = self.data().edge_mults() {
194,761✔
1822
            
1823
            let dir = if flipped { 
194,761✔
1824
                incoming_dir 
77,306✔
1825
            } else {
1826
                incoming_dir.flip()
117,455✔
1827
            };
1828

1829
            // set penwidth based on count
1830
            let count = em.edge_mult(base, dir);
194,761✔
1831
            let penwidth = colors.edge_width(count);
194,761✔
1832

1833
            format!("[color={color}, penwidth={penwidth}, label=\"{}: {count}\"]", bits_to_base(base))
194,761✔
1834
        } else {
1835
            format!("[color={color}, penwidth={}]", colors.edge_width(1)) // since there should be no edge mults, this will return default value
×
1836
        }
1837
    }
194,761✔
1838

1839
    /// get default format for dot nodes, based on node data
1840
    pub fn node_dot_default<DI>(&self, colors: &Colors<SD, DI>, config: &SummaryConfig, translator: &Translator, outline: bool, translate_id_groups: bool) -> String
97,994✔
1841
    where SD: SummaryData<DI>
97,994✔
1842
    {
1843
        // set color based on labels/fold change/p-value
1844
        let color = colors.node_color(self.data(), config, outline);
97,994✔
1845
        let translate_id_groups = if translate_id_groups { colors.id_group_ids() } else { None };
97,994✔
1846

1847
        let data_info = self.data().print(translator, config, translate_id_groups);
97,994✔
1848
        const MIN_TEXT_WIDTH: usize = 40;
1849
        let wrap = if self.len() > MIN_TEXT_WIDTH { self.len() } else { MIN_TEXT_WIDTH };
97,994✔
1850

1851
        let label = textwrap::fill(&format!("id: {}, len: {}, exts: {:?}, seq: {}\n{}", 
97,994✔
1852
            self.node_id,
97,994✔
1853
            self.len(),
97,994✔
1854
            self.exts(),
97,994✔
1855
            self.sequence(),
97,994✔
1856
            data_info
97,994✔
1857
        ), wrap);
97,994✔
1858

1859
        format!("[style=filled, {color}, label=\"{label}\"]")
97,994✔
1860
    }
97,994✔
1861
}
1862

1863
impl<K: Kmer, D> fmt::Debug for Node<'_, K, D>
1864
where
1865
    D: Debug,
1866
{
1867
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
142✔
1868
        write!(
142✔
1869
            f,
142✔
1870
            "Node {{ id:{}, Exts: {:?}, L:{:?} R:{:?}, Seq: {:?}, Data: {:?} }}",
142✔
1871
            self.node_id,
1872
            self.exts(),
142✔
1873
            self.l_edges(),
142✔
1874
            self.r_edges(),
142✔
1875
            self.sequence().len(),
142✔
1876
            self.data()
142✔
1877
        )
1878
    }
142✔
1879
}
1880

1881
pub struct IterComponents<'a, K: Kmer, D> {
1882
    graph: &'a DebruijnGraph<K, D>,
1883
    visited: Vec<bool>,
1884
    pos: usize,
1885
}
1886

1887
impl<K: Kmer, D: Debug> Iterator for IterComponents<'_, K, D> {
1888
    type Item = Vec<usize>;
1889
    fn next(&mut self) -> Option<Self::Item> {
246✔
1890
        while self.pos < self.graph.len() {
33,223✔
1891
            if !self.visited[self.pos] {
33,218✔
1892
                let comp = self.graph.component_i(&mut self.visited, self.pos);
241✔
1893
                self.pos += 1;
241✔
1894
                return Some(comp)
241✔
1895
            } else {
32,977✔
1896
                self.pos += 1;
32,977✔
1897
            }
32,977✔
1898
        }
1899
        assert!(self.visited.iter().map(|x| *x as usize).sum::<usize>() == self.graph.len());
33,218✔
1900
        None
5✔
1901
    }
246✔
1902
    
1903
}
1904

1905
pub struct PathCompIter<'a, K: Kmer, D: Debug, F, F2> 
1906
where 
1907
F: Fn(&D) -> f32,
1908
F2: Fn(&D) -> bool
1909
{
1910
    graph: &'a DebruijnGraph<K, D>,
1911
    component_iterator: IterComponents<'a, K, D>,
1912
    graph_pos: usize,
1913
    score: F,
1914
    solid_path: F2,
1915
}
1916

1917
/// returns the component and the "best" path in the component
1918
impl<K: Kmer, D: Debug, F, F2> Iterator for PathCompIter<'_, K, D, F, F2> 
1919
where 
1920
F: Fn(&D) -> f32,
1921
F2: Fn(&D) -> bool
1922
{
1923
    type Item = (Vec<usize>, VecDeque<(usize, Dir)>,);
1924
    fn next(&mut self) -> Option<Self::Item> {
240✔
1925
        match self.component_iterator.next() {
240✔
1926
            Some(component) => {
237✔
1927
                let current_comp = component;
237✔
1928
                
1929
    
1930
                let mut best_node = current_comp[0];
237✔
1931
                let mut best_score = f32::MIN;
237✔
1932
                for c in current_comp.iter() {
32,938✔
1933
                    let node = self.graph.get_node(*c);
32,938✔
1934
                    let node_score = (self.score)(node.data());
32,938✔
1935
    
1936
                    if node_score > best_score {
32,938✔
1937
                        best_node = *c;
281✔
1938
                        best_score = node_score;
281✔
1939
                    }
32,657✔
1940
                }
1941
    
1942
                let oscore = |state| match state {
59,994✔
1943
                    None => 0.0,
29,917✔
1944
                    Some((id, _)) => (self.score)(self.graph.get_node(id).data()),
30,077✔
1945
                };
59,994✔
1946
    
1947
                let osolid_path = |state| match state {
29,997✔
1948
                    None => false,
×
1949
                    Some((id, _)) => (self.solid_path)(self.graph.get_node(id).data()),
29,997✔
1950
                };
29,997✔
1951
    
1952
                // Now expand in each direction, greedily taking the best path. Stop if we hit a node we've
1953
                // already put into the path
1954
                let mut used_nodes = HashSet::new();
237✔
1955
                let mut path = VecDeque::new();
237✔
1956
    
1957
                // Start w/ initial state
1958
                used_nodes.insert(best_node);
237✔
1959
                path.push_front((best_node, Dir::Left));
237✔
1960
    
1961
                for init in [(best_node, Dir::Left, false), (best_node, Dir::Right, true)].iter() {
474✔
1962
                    let &(start_node, dir, do_flip) = init;
474✔
1963
                    let mut current = (start_node, dir);
474✔
1964
    
1965
                    loop {
1966
                        let mut next = None;
30,388✔
1967
                        let (cur_id, incoming_dir) = current;
30,388✔
1968
                        let node = self.graph.get_node(cur_id);
30,388✔
1969
                        let edges = node.edges(incoming_dir.flip());
30,388✔
1970
    
1971
                        let mut solid_paths = 0;
30,388✔
1972
                        for (_, id, dir, _) in edges {
60,385✔
1973
                            let cand = Some((id, dir));
29,997✔
1974
                            if osolid_path(cand) {
29,997✔
1975
                                solid_paths += 1;
29,997✔
1976

1977
                                // second if clause is outside of first in original code (see max_path) 
1978
                                // but would basically ignore path validity.
1979
                                if oscore(cand) > oscore(next) {
29,997✔
1980
                                    next = cand;
29,955✔
1981
                                }
29,955✔
1982
                            }
×
1983
                        }
1984
                        
1985
                        // break if multiple solid paths are available
1986
                        /* if solid_paths > 1 {
1987
                            break;
1988
                        } */
1989
    
1990
                        match next {
29,917✔
1991
                            Some((next_id, next_incoming)) if !used_nodes.contains(&next_id) => {
29,917✔
1992
                                if do_flip {
29,914✔
1993
                                    path.push_front((next_id, next_incoming.flip()));
15,005✔
1994
                                } else {
15,005✔
1995
                                    path.push_back((next_id, next_incoming));
14,909✔
1996
                                }
14,909✔
1997
    
1998
                                used_nodes.insert(next_id);
29,914✔
1999
                                current = (next_id, next_incoming);
29,914✔
2000
                            }
2001
                            _ => break,
474✔
2002
                        }
2003
                    }
2004
                }
2005
                
2006
                
2007
                Some((current_comp, path))
237✔
2008
            }, 
2009
            None => {
2010
                // should technically not need graph_pos after this 
2011
                self.graph_pos += 1;
3✔
2012
                None
3✔
2013
            }
2014
        }
2015
    }
240✔
2016
}
2017

2018

2019
/// iterator over the edges of the de bruijn graph
2020
pub struct EdgeIter<'a, K: Kmer, D: Debug> {
2021
    graph: &'a DebruijnGraph<K, D>,
2022
    visited_edges: HashSet<(usize, usize)>,
2023
    current_node: usize,
2024
    current_dir: Dir,
2025
    node_edge_iter: smallvec::IntoIter<[(u8, usize, Dir, bool); 4]>
2026
}
2027

2028
impl<K: Kmer, D: Debug> EdgeIter<'_, K, D> {
2029
    pub fn new(graph: &DebruijnGraph<K, D>) -> EdgeIter<'_, K, D>{
1✔
2030
        let node_edge_iter = graph.get_node(0).l_edges().into_iter();
1✔
2031

2032
        EdgeIter { 
1✔
2033
            graph, 
1✔
2034
            visited_edges: HashSet::new(), 
1✔
2035
            current_node: 0, 
1✔
2036
            current_dir: Dir::Left, 
1✔
2037
            node_edge_iter
1✔
2038
        }
1✔
2039
    }
1✔
2040
}
2041

2042
impl<K: Kmer, D: Debug> Iterator for EdgeIter<'_, K, D> {
2043
    type Item = (usize, Dir, u8, usize); // node, direction leaving node, base, target node
2044

2045
    fn next(&mut self) -> Option<Self::Item> {
27✔
2046
        loop {
2047
            if let Some((base, nb_node_id, _, _)) = self.node_edge_iter.next() {
108✔
2048
                let edge = if self.current_node > nb_node_id { (nb_node_id, self.current_node) } else { (self.current_node, nb_node_id) };
52✔
2049

2050
                if self.visited_edges.insert(edge) { return Some((self.current_node, self.current_dir, base, nb_node_id)); } // else simply skip and move on
52✔
2051

2052
            } else {
2053
                match self.current_dir {
56✔
2054
                Dir::Left => {
28✔
2055
                    // no left edges, switch to right edges
28✔
2056
                    self.current_dir = Dir::Right;
28✔
2057
                    self.node_edge_iter = self.graph.get_node(self.current_node).r_edges().into_iter();
28✔
2058
                    
28✔
2059
                }
28✔
2060
                Dir::Right => {
2061
                    // no right edges, switch to next node left edges
2062
                    self.current_node += 1;
28✔
2063

2064
                    // quit if end of graph is reached
2065
                    if self.current_node == self.graph.len() { return None }
28✔
2066

2067
                    self.current_dir = Dir::Left;
27✔
2068
                    self.node_edge_iter = self.graph.get_node(self.current_node).l_edges().into_iter();
27✔
2069
                }
2070
            }
2071
            }
2072
            
2073
        }
2074
    }
27✔
2075
}
2076

2077
#[cfg(test)]
2078
mod test {
2079
    use std::{fs::File, io::BufReader};
2080

2081
    use crate::{compression::uncompressed_graph, graph, kmer::{Kmer16, Kmer22}, serde::{SerGraph, SerKmers}, summarizer::{IDEMData, IDMapEMData, TagsCountsSumData, ID}};
2082

2083
    use super::DebruijnGraph;
2084
    use crate::{summarizer::SummaryData, Dir, BUF};
2085

2086

2087
    #[test]
2088
    #[cfg(not(feature = "sample128"))]
2089
    fn test_components() {
2090

2091
        let path = "test_data/400.graph.dbg";
2092
        let file = BufReader::with_capacity(BUF, File::open(path).unwrap());
2093

2094
        let (graph, _, _): (DebruijnGraph<Kmer16, TagsCountsSumData>, Vec<String>, crate::summarizer::SummaryConfig) = 
2095
            bincode::deserialize_from(file).expect("error deserializing graph");
2096

2097
        let components = graph.iter_components();
2098

2099
        let check_components = [
2100
            vec![3, 67, 130, 133, 59, 119, 97, 110, 68, 137, 29, 84, 131, 43, 30, 91, 14, 70, 79, 142, 136, 105, 103, 62, 
2101
                141, 104, 134, 88, 38, 81, 108, 92, 135, 96, 116, 121, 63, 124, 106, 129, 132, 126, 93, 109, 83, 112, 118, 
2102
                123, 125, 78, 122, 115, 75, 128, 140, 111, 26, 143, 113],
2103
            vec![41, 138, 100, 139, 86],
2104
            vec![53, 117, 127],
2105
            vec![69, 144, 77, 120, 114, 107, 101],
2106
        ];
2107

2108
        let mut counter = 0;
2109

2110
        for component in components {
2111
            if component.len() > 1 { 
2112
                println!("component: {:?}", component);
2113
                assert_eq!(component, check_components[counter]);
2114
                counter += 1;
2115
            }
2116
        }
2117
        assert_eq!(vec![(139, Dir::Left)], graph.max_path(|data| data.sum().unwrap_or(1) as f32, |_| true));
2118
    }
2119

2120
    #[test]
2121
    fn test_iter_edges() {
1✔
2122
        use crate::{compression::uncompressed_graph, filter::filter_kmers, reads::{Reads, ReadsPaired}, summarizer::{SampleInfo, SummaryConfig, TagsData}, Exts};
2123

2124
        let read1 = "CAGCATCGATGCGACGAGCGCTCGCATCGA".as_bytes();
1✔
2125
        let read2 = "ACGATCGTACGTAGCTAGCTGACTGAGC".as_bytes();
1✔
2126

2127
        let mut reads = Reads::new(crate::reads::Strandedness::Forward);
1✔
2128
        reads.add_from_bytes(read1, Exts::empty(), 0u8);
1✔
2129
        reads.add_from_bytes(read2, Exts::empty(), 1);
1✔
2130

2131
        let reads_paired = ReadsPaired::Unpaired { reads };
1✔
2132

2133
        let sample_info = SampleInfo::new(0b1, 0b10, 1, 1, vec![12, 12]);
1✔
2134
        let summary_config = SummaryConfig::new(1, None, crate::summarizer::GroupFrac::None, 0.3, sample_info, None, crate::summarizer::StatTest::WelchsTTest);
1✔
2135
        let (kmers, _) = filter_kmers::<TagsData, Kmer16, _>(&reads_paired, &summary_config, false, 1, false);
1✔
2136

2137
        let graph = uncompressed_graph(&kmers, true).finish();
1✔
2138

2139
        let check_edges: Vec<(usize, Dir, u8, usize)> = vec![(0, Dir::Left, 2, 16), (0, Dir::Right, 1, 2), (1, Dir::Left, 0, 11), 
1✔
2140
        (1, Dir::Right, 0, 16), (3, Dir::Left, 0, 13), (3, Dir::Right, 3, 10), (4, Dir::Left, 2, 21), (4, Dir::Right, 1, 17), (5, Dir::Left, 1, 27), 
1✔
2141
        (5, Dir::Right, 2, 19), (6, Dir::Left, 1, 24), (6, Dir::Right, 2, 12), (7, Dir::Left, 2, 23), (7, Dir::Right, 3, 11), (8, Dir::Left, 0, 12), 
1✔
2142
        (9, Dir::Left, 3, 17), (9, Dir::Right, 3, 24), (10, Dir::Right, 1, 21), (13, Dir::Left, 1, 18), (14, Dir::Right, 0, 26), 
1✔
2143
        (15, Dir::Left, 2, 20), (15, Dir::Right, 3, 22), (18, Dir::Left, 2, 19), (20, Dir::Left, 1, 26), (22, Dir::Right, 2, 25), (23, Dir::Left, 1, 25)];
1✔
2144

2145
        let edges = graph.iter_edges().collect::<Vec<_>>();
1✔
2146

2147
        assert_eq!(check_edges, edges);
1✔
2148
    }
1✔
2149

2150
    // dbg -c ../marbel_datasets/sim_reads_100.csv -s sum --stranded -o ../rust-debruijn/test_data/marbel_100_sum --checkpoint -k 22
2151
    #[cfg(not(feature = "sample128"))]
2152
    const TEST_GRAPH: &str = "test_data/marbel_100_sum.kmers.dbg";
2153

2154
    // cargo run --features sample128 -- -c ../marbel_datasets/sim_reads_100.csv -s sum --stranded -o ../rust-debruijn/test_data/marbel_100_sum_128 --checkpoint -k 22
2155
    #[cfg(feature = "sample128")]
2156
    const TEST_GRAPH: &str = "test_data/marbel_100_sum_128.kmers.dbg";
2157

2158
    #[test]
2159
    fn test_map_transcripts() {
1✔
2160
        // dbg -c ../marbel_datasets/sim_reads_100.csv -s sum --stranded -o ../rust-debruijn/test_data/marbel_100_sum --checkpoint -k 22
2161
        let graph_path = TEST_GRAPH;
1✔
2162
        let t_ref_path = "test_data/marbel_100_tr_ref.fasta";
1✔
2163
        let (kmers, mut translator, _) = SerKmers::<Kmer22, u32>::deserialize_from(graph_path).dissolve();
1✔
2164

2165
        let unc_graph = uncompressed_graph(&kmers, true).finish();
1✔
2166

2167
        let t_map = unc_graph.map_transcripts(t_ref_path, &mut translator).unwrap();
1✔
2168
        assert_eq!(t_map.len(), unc_graph.len());
1✔
2169
        assert_eq!(t_map.iter().filter(|&ids| !ids.is_empty()).collect::<Vec<_>>().len(), 10720);
21,993✔
2170
    }
1✔
2171
}
2172

STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2026 Coveralls, Inc