• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

jlab / rust-debruijn / 25311459989

04 May 2026 09:27AM UTC coverage: 88.239% (+1.0%) from 87.212%
25311459989

Pull #40

github

evolp
fix typos and update .gitignore
Pull Request #40: Utilize PHRED-scores for error removal, add 3D json compatibility

1573 of 1714 new or added lines in 9 files covered. (91.77%)

14 existing lines in 4 files now uncovered.

8988 of 10186 relevant lines covered (88.24%)

4073771.11 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

80.95
/src/graph.rs
1
// Copyright 2017 10x Genomics
2

3
//! Containers for path-compressed De Bruijn graphs
4

5
use bimap::BiHashMap;
6
use bio::io::fasta;
7
use bit_set::BitSet;
8
use indicatif::ProgressBar;
9
use indicatif::ProgressIterator;
10
use indicatif::ProgressStyle;
11
use itertools::chain;
12
use itertools::enumerate;
13
use log::warn;
14
use log::{debug, trace};
15
use rayon::prelude::*;
16
use rayon::current_num_threads;
17
use serde_derive::{Deserialize, Serialize};
18
use smallvec::SmallVec;
19
use std::borrow::Borrow;
20

21
use std::collections::HashSet;
22
use std::collections::VecDeque;
23
use std::f32;
24
use std::fmt::{self, Debug, Display};
25
use std::fs::{remove_file, File};
26
use std::hash::Hash;
27
use std::io::BufWriter;
28
use std::io::{BufReader, Error, Read};
29
use std::io::Write;
30
use std::iter::FromIterator;
31
use std::marker::PhantomData;
32
use std::path::Path;
33
use std::usize;
34

35
use boomphf::hashmap::BoomHashMap;
36

37
use serde_json;
38
use serde_json::Value;
39

40
type SmallVec4<T> = SmallVec<[T; 4]>;
41
type SmallVec8<T> = SmallVec<[T; 8]>;
42

43
use crate::BaseQuality;
44
use crate::bits_to_base;
45
use crate::colors::ColorMode;
46
use crate::colors::Colors;
47
use crate::compression::CompressionSpec;
48
use crate::dna_string::{DnaString, DnaStringSlice, PackedDnaStringSet};
49
use crate::summarizer::SummaryConfig;
50
use crate::summarizer::SummaryData;
51
use crate::summarizer::Translator;
52
use crate::summarizer::ID;
53
use crate::BUF;
54
use crate::PROGRESS_STYLE;
55
use crate::{Dir, Exts, Kmer, Mer, Vmer};
56

57
/// A compressed DeBruijn graph carrying auxiliary data on each node of type `D`.
58
/// This type does not carry the sorted index arrays the allow the graph
59
/// to be walked efficiently. The `DeBruijnGraph` type wraps this type and add those
60
/// vectors.
61
#[derive(Serialize, Deserialize, Clone, Debug)]
62
pub struct BaseGraph<K, D> {
63
    pub sequences: PackedDnaStringSet,
64
    pub exts: Vec<Exts>,
65
    pub data: Vec<D>,
66
    pub stranded: bool,
67
    phantom: PhantomData<K>,
68
}
69

70
impl<K, D> BaseGraph<K, D> {
71
    pub fn new(stranded: bool) -> Self {
2,618✔
72
        BaseGraph {
2,618✔
73
            sequences: PackedDnaStringSet::new(),
2,618✔
74
            exts: Vec::new(),
2,618✔
75
            data: Vec::new(),
2,618✔
76
            phantom: PhantomData,
2,618✔
77
            stranded,
2,618✔
78
        }
2,618✔
79
    }
2,618✔
80

81
    pub fn len(&self) -> usize {
323,723✔
82
        self.sequences.len()
323,723✔
83
    }
323,723✔
84

85
    pub fn is_empty(&self) -> bool {
1✔
86
        self.sequences.is_empty()
1✔
87
    }
1✔
88

89
    pub fn combine<I: Iterator<Item = BaseGraph<K, D>>>(graphs: I) -> Self {
12✔
90
        let mut sequences = PackedDnaStringSet::new();
12✔
91
        let mut exts = Vec::new();
12✔
92
        let mut data = Vec::new();
12✔
93
        let mut stranded = Vec::new();
12✔
94

95
        for g in graphs {
1,795✔
96
            for s in 0..g.sequences.len() {
5,617✔
97
                sequences.add(&g.sequences.get(s));
5,617✔
98
            }
5,617✔
99

100
            exts.extend(g.exts);
1,795✔
101
            data.extend(g.data);
1,795✔
102
            stranded.push(g.stranded);
1,795✔
103
        }
104

105
        let out_stranded = stranded.iter().all(|x| *x);
12✔
106

107
        if !out_stranded && !stranded.iter().all(|x| !*x) {
1,795✔
108
            panic!("attempted to combine stranded and unstranded graphs");
×
109
        }
12✔
110

111
        BaseGraph {
12✔
112
            sequences,
12✔
113
            stranded: out_stranded,
12✔
114
            exts,
12✔
115
            data,
12✔
116
            phantom: PhantomData,
12✔
117
        }
12✔
118
    }
12✔
119
}
120

121
impl<K: Kmer, D> BaseGraph<K, D> {
122
    pub fn add<R: Borrow<u8>, S: IntoIterator<Item = R>>(
2,392,712✔
123
        &mut self,
2,392,712✔
124
        sequence: S,
2,392,712✔
125
        exts: Exts,
2,392,712✔
126
        data: D,
2,392,712✔
127
    ) {
2,392,712✔
128
        self.sequences.add(sequence);
2,392,712✔
129
        self.exts.push(exts);
2,392,712✔
130
        self.data.push(data);
2,392,712✔
131
    }
2,392,712✔
132
}
133

134
impl<K: Kmer + Send + Sync, D> BaseGraph<K, D> {
135
    pub fn finish(self) -> DebruijnGraph<K, D> {
712✔
136
        let indices: Vec<u32> = (0..self.len() as u32).collect();
712✔
137

138
        let left_order = {
712✔
139
            let mut kmers: Vec<K> = Vec::with_capacity(self.len());
712✔
140
            for idx in &indices {
1,625,915✔
141
                kmers.push(self.sequences.get(*idx as usize).first_kmer());
1,625,915✔
142
                
1,625,915✔
143
            }
1,625,915✔
144
            
145
            BoomHashMap::new_parallel(kmers, indices.clone())
712✔
146
        };
147

148
        let right_order = {
712✔
149
            let mut kmers: Vec<K> = Vec::with_capacity(self.len());
712✔
150
            for idx in &indices {
1,625,915✔
151
                kmers.push(self.sequences.get(*idx as usize).last_kmer());
1,625,915✔
152
            }
1,625,915✔
153
            
154
            BoomHashMap::new_parallel(kmers, indices)
712✔
155
        };
156
        debug!("finish graph loops: 2x {}", self.len());
712✔
157

158
        DebruijnGraph {
712✔
159
            base: self,
712✔
160
            left_order,
712✔
161
            right_order,
712✔
162
        }
712✔
163
    }
712✔
164
}
165

166
impl<K: Kmer, D> BaseGraph<K, D> {
167
    pub fn finish_serial(self) -> DebruijnGraph<K, D> {
111✔
168
        let indices: Vec<u32> = (0..self.len() as u32).collect();
111✔
169

170
        let left_order = {
111✔
171
            let mut kmers: Vec<K> = Vec::with_capacity(self.len());
111✔
172
            let mut sequences: Vec<String> = Vec::new();
111✔
173
            for idx in &indices {
765,927✔
174
                kmers.push(self.sequences.get(*idx as usize).first_kmer());
765,927✔
175
                sequences.push(self.sequences.get(*idx as usize).to_dna_string());
765,927✔
176
            }
765,927✔
177
            println!("left kmers: {:?}", kmers);
111✔
178
            println!("left seqs: {:?}", sequences);
111✔
179
            BoomHashMap::new(kmers, indices.clone())
111✔
180
        };
181

182
        let right_order = {
111✔
183
            let mut kmers: Vec<K> = Vec::with_capacity(self.len());
111✔
184
            let mut sequences: Vec<String> = Vec::new();
111✔
185
            for idx in &indices {
765,927✔
186
                kmers.push(self.sequences.get(*idx as usize).last_kmer());
765,927✔
187
                sequences.push(self.sequences.get(*idx as usize).to_dna_string());
765,927✔
188
            }
765,927✔
189
            println!("right kmers: {:?}", kmers);
111✔
190
            println!("right seqs: {:?}", sequences);
111✔
191
            BoomHashMap::new(kmers, indices)
111✔
192
        };
193

194
        DebruijnGraph {
111✔
195
            base: self,
111✔
196
            left_order,
111✔
197
            right_order,
111✔
198
        }
111✔
199
    }
111✔
200
}
201

202
/// A compressed DeBruijn graph carrying auxiliary data on each node of type `D`.
203
/// The struct carries sorted index arrays the allow the graph
204
/// to be walked efficiently.
205
#[derive(Serialize, Deserialize, Debug)]
206
pub struct DebruijnGraph<K: Hash, D> {
207
    pub base: BaseGraph<K, D>,
208
    left_order: BoomHashMap<K, u32>,
209
    right_order: BoomHashMap<K, u32>,
210
}
211

212
impl<K: Kmer, D: Debug> DebruijnGraph<K, D> {
213
    /// Total number of nodes in the DeBruijn graph
214
    pub fn len(&self) -> usize {
321,230✔
215
        self.base.len()
321,230✔
216
    }
321,230✔
217

218
    pub fn is_empty(&self) -> bool {
1✔
219
        self.base.is_empty()
1✔
220
    }
1✔
221

222
    /// Get a node given it's `node_id`
223
    pub fn get_node(&'_ self, node_id: usize) -> Node<'_, K, D> {
6,134,104✔
224
        Node {
6,134,104✔
225
            node_id,
6,134,104✔
226
            graph: self,
6,134,104✔
227
        }
6,134,104✔
228
    }
6,134,104✔
229

230
    /// Get a node given it's `node_id`
231
    pub fn get_node_kmer(&'_ self, node_id: usize) -> NodeKmer<'_, K, D> {
×
232
        let node = self.get_node(node_id);
×
233
        let node_seq = node.sequence();
×
234

235
        NodeKmer {
×
236
            node_id,
×
237
            node_seq_slice: node_seq,
×
238
            phantom_d: PhantomData,
×
239
            phantom_k: PhantomData,
×
240
        }
×
241
    }
×
242

243
    /// Return an iterator over all nodes in the graph
244
    pub fn iter_nodes(&'_ self) -> NodeIter<'_, K, D> {
130✔
245
        NodeIter {
130✔
246
            graph: self,
130✔
247
            node_id: 0,
130✔
248
        }
130✔
249
    }
130✔
250

251
    /// Find the edges leaving node `node_id` in direction `Dir`. Should generally be
252
    /// accessed via a Node wrapper object
253
    fn find_edges(&self, node_id: usize, dir: Dir) -> SmallVec4<(u8, usize, Dir, bool)> {
523,591✔
254
        let exts = self.base.exts[node_id];
523,591✔
255
        let sequence = self.base.sequences.get(node_id);
523,591✔
256
        let kmer: K = sequence.term_kmer(dir);
523,591✔
257
        let mut edges = SmallVec4::new();
523,591✔
258

259
        for i in 0..4 {
2,094,364✔
260
            if exts.has_ext(dir, i) {
2,094,364✔
261
                let link = self.find_link(kmer.extend(i, dir), dir).expect("missing link");
1,009,315✔
262
                edges.push((i, link.0, link.1, link.2));
1,009,315✔
263
            }
1,085,049✔
264
        }
265

266
        edges
523,591✔
267
    }
523,591✔
268

269
    /// Find the edges leaving node `node_id` in direction `Dir`. Should generally be
270
    /// accessed via a Node wrapper object
271
    /// 
272
    /// allows missing links
273
    fn _find_edges_sharded(&self, node_id: usize, dir: Dir) -> SmallVec4<(u8, usize, Dir, bool)> {
×
274
        let exts = self.base.exts[node_id];
×
275
        let sequence = self.base.sequences.get(node_id);
×
276
        let kmer: K = sequence.term_kmer(dir);
×
277
        let mut edges = SmallVec4::new();
×
278

279
        for i in 0..4 {
×
280
            if exts.has_ext(dir, i) {
×
281
                let link = self.find_link(kmer.extend(i, dir), dir); //.expect("missing link");
×
282
                if let Some(l) = link {
×
283
                    edges.push((i, l.0, l.1, l.2));
×
284
                }
×
285
                // Otherwise, this edge doesn't exist within this shard, so ignore it.
286
                // NOTE: this should be allowed in a 'complete' DBG
287
            }
×
288
        }
289

290
        edges
×
291
    }
×
292

293
    /// Seach for the kmer `kmer`, appearing at the given `side` of a node sequence.
294
    fn search_kmer(&self, kmer: K, side: Dir) -> Option<usize> {
7,113,159✔
295
        match side {
7,113,159✔
296
            Dir::Left => self.left_order.get(&kmer).map(|pos| *pos as usize),
3,556,809✔
297
            Dir::Right => self.right_order.get(&kmer).map(|pos| *pos as usize),
3,556,350✔
298
        }
299
    }
7,113,159✔
300

301
    /// Find a link in the graph, possibly handling a RC switch.
302
    pub fn find_link(&self, kmer: K, dir: Dir) -> Option<(usize, Dir, bool)> {
4,966,188✔
303
        // Only test self-consistent paths through
304
        // the edges
305
        // Avoids problems due to single kmer edges
306
        // (dir, next_side_incoming, rc)
307
        // (Dir::Left, Dir::Right, false) => true,
308
        // (Dir::Left, Dir::Left,  true) => true,
309
        // (Dir::Right, Dir::Left, false) => true,
310
        // (Dir::Right, Dir::Right, true) => true,
311

312
        let rc = kmer.rc();
4,966,188✔
313

314
        match dir {
4,966,188✔
315
            Dir::Left => {
316
                if let Some(idx) = self.search_kmer(kmer, Dir::Right) {
2,482,855✔
317
                    return Some((idx, Dir::Right, false));
1,409,377✔
318
                }
1,073,478✔
319

320
                if !self.base.stranded {
1,073,478✔
321
                    if let Some(idx) = self.search_kmer(rc, Dir::Left) {
1,073,476✔
322
                        return Some((idx, Dir::Left, true));
1,073,269✔
323
                    }
207✔
324
                }
2✔
325
            }
326

327
            Dir::Right => {
328
                if let Some(idx) = self.search_kmer(kmer, Dir::Left) {
2,483,333✔
329
                    return Some((idx, Dir::Left, false));
1,410,802✔
330
                }
1,072,531✔
331

332
                if !self.base.stranded {
1,072,531✔
333
                    if let Some(idx) = self.search_kmer(rc, Dir::Right) {
1,072,529✔
334
                        return Some((idx, Dir::Right, true));
1,072,292✔
335
                    }
237✔
336
                }
2✔
337
            }
338
        }
339

340
        None
448✔
341
    }
4,966,188✔
342

343
    /// Check whether the graph is fully compressed. Return `None` if it's compressed,
344
    /// otherwise return `Some(node1, node2)` representing a pair of node that could
345
    /// be collapsed. Probably only useful for testing.
346
    pub fn is_compressed<S: CompressionSpec<D>>(&self, spec: &S) -> Option<(usize, usize)> {
789✔
347
        for i in 0..self.len() {
149,792✔
348
            let n = self.get_node(i);
149,792✔
349

350
            for dir in &[Dir::Left, Dir::Right] {
299,584✔
351
                let dir_edges = n.edges(*dir);
299,584✔
352
                if dir_edges.len() == 1 {
299,584✔
353
                    let (_, next_id, return_dir, _) = dir_edges[0];
212,848✔
354
                    let next = self.get_node(next_id);
212,848✔
355

356
                    let ret_edges = next.edges(return_dir);
212,848✔
357
                    if ret_edges.len() == 1 {
212,848✔
358
                        // Test for us being a palindrome: this makes it OK
359
                        if n.len() == K::k() && n.sequence().first_kmer::<K>().is_palindrome() {
24✔
360
                            continue;
16✔
361
                        }
8✔
362

363
                        // Test for a neighbor being a palindrome: this makes it OK
364
                        if next.len() == K::k() && next.sequence().first_kmer::<K>().is_palindrome()
8✔
365
                        {
366
                            continue;
8✔
367
                        }
×
368

369
                        // Test for this edge representing a smooth circle (biting it's own tail):
370
                        if n.node_id == next_id {
×
371
                            continue;
×
372
                        }
×
373

374
                        if spec.join_test(n.data(), next.data()) {
×
375
                            // Found a unbranched edge that should have been eliminated
376
                            return Some((i, next_id));
×
377
                        }
×
378
                    }
212,824✔
379
                }
86,736✔
380
            }
381
        }
382

383
        None
789✔
384
    }
789✔
385

386
    /// Remove non-existent extensions that may be created due to filtered kmers
387
    /// 
388
    /// if `valid_nodes` if `None`, all nodes are valid
389
    pub fn fix_exts(&mut self, valid_nodes: Option<&BitSet>) {
360✔
390
        for i in 0..self.len() {
1,559,889✔
391
            let valid_exts = self.get_valid_exts(i, valid_nodes);
1,559,889✔
392
            self.base.exts[i] = valid_exts;
1,559,889✔
393
        }
1,559,889✔
394
    }
360✔
395

396
    pub fn get_valid_exts(&self, node_id: usize, valid_nodes: Option<&BitSet>) -> Exts {
1,559,889✔
397
        let mut new_exts = Exts::empty();
1,559,889✔
398
        let node = self.get_node(node_id);
1,559,889✔
399
        let exts = node.exts();
1,559,889✔
400
        let l_kmer: K = node.sequence().first_kmer();
1,559,889✔
401
        let r_kmer: K = node.sequence().last_kmer();
1,559,889✔
402

403
        let check_node = |id| match valid_nodes {
3,175,333✔
404
            Some(bs) => bs.contains(id),
1,562,116✔
405
            None => true,
1,613,217✔
406
        };
3,175,333✔
407

408
        for i in 0..4 {
6,239,556✔
409
            if exts.has_ext(Dir::Left, i) {
6,239,556✔
410
                match self.find_link(l_kmer.extend_left(i), Dir::Left) {
1,587,877✔
411
                    Some((target, _, _)) if check_node(target) => {
1,587,668✔
412
                        new_exts = new_exts.set(Dir::Left, i)
1,587,668✔
413
                    }
414
                    _ => (),
209✔
415
                }
416
            }
4,651,679✔
417

418
            if exts.has_ext(Dir::Right, i) {
6,239,556✔
419
                match self.find_link(r_kmer.extend_right(i), Dir::Right) {
1,587,904✔
420
                    Some((target, _, _)) if check_node(target) => {
1,587,665✔
421
                        new_exts = new_exts.set(Dir::Right, i)
1,587,665✔
422
                    }
423
                    _ => (),
239✔
424
                }
425
            }
4,651,652✔
426
        }
427

428
        new_exts
1,559,889✔
429
    }
1,559,889✔
430

431
    /// mutable reference to the auxiliary data of the node node_id
432
    pub fn mut_data(&mut self, node_id: usize) -> &mut D {
791✔
433
        &mut self.base.data[node_id]
791✔
434
    }
791✔
435

436
    /// Find the highest-scoring, unambiguous path in the graph. Each node get a score
437
    /// given by `score`. Any node where `solid_path(node) == True` are valid paths -
438
    /// paths will be terminated if there are multiple valid paths emanating from a node.
439
    pub fn max_path<F, F2>(&self, score: F, solid_path: F2) -> Vec<(usize, Dir)>
×
440
    where
×
441
        F: Fn(&D) -> f32,
×
442
        F2: Fn(&D) -> bool,
×
443
    {
444
        if self.is_empty() {
×
445
            return Vec::default();
×
446
        }
×
447

448
        let mut best_node = 0;
×
449
        let mut best_score = f32::MIN;
×
450
        for i in 0..self.len() {
×
451
            let node = self.get_node(i);
×
452
            let node_score = score(node.data());
×
453

454
            if node_score > best_score {
×
455
                best_node = i;
×
456
                best_score = node_score;
×
457
            }
×
458
        }
459

460
        let oscore = |state| match state {
×
461
            None => 0.0,
×
462
            Some((id, _)) => score(self.get_node(id).data()),
×
463
        };
×
464

465
        let osolid_path = |state| match state {
×
466
            None => false,
×
467
            Some((id, _)) => solid_path(self.get_node(id).data()),
×
468
        };
×
469

470
        // Now expand in each direction, greedily taking the best path. Stop if we hit a node we've
471
        // already put into the path
472
        let mut used_nodes = HashSet::new();
×
473
        let mut path = VecDeque::new();
×
474

475
        // Start w/ initial state
476
        used_nodes.insert(best_node);
×
477
        path.push_front((best_node, Dir::Left));
×
478

479
        for init in [(best_node, Dir::Left, false), (best_node, Dir::Right, true)].iter() {
×
480
            let &(start_node, dir, do_flip) = init;
×
481
            let mut current = (start_node, dir);
×
482

483
            loop {
484
                let mut next = None;
×
485
                let (cur_id, incoming_dir) = current;
×
486
                let node = self.get_node(cur_id);
×
487
                let edges = node.edges(incoming_dir.flip());
×
488

489
                let mut solid_paths = 0;
×
490
                for (_, id, dir, _) in edges {
×
491
                    let cand = Some((id, dir));
×
492
                    if osolid_path(cand) {
×
493
                        solid_paths += 1;
×
494
                    }
×
495

496
                    if oscore(cand) > oscore(next) {
×
497
                        next = cand;
×
498
                    }
×
499
                }
500

501
                if solid_paths > 1 {
×
502
                    break;
×
503
                }
×
504

505
                match next {
×
506
                    Some((next_id, next_incoming)) if !used_nodes.contains(&next_id) => {
×
507
                        if do_flip {
×
508
                            path.push_front((next_id, next_incoming.flip()));
×
509
                        } else {
×
510
                            path.push_back((next_id, next_incoming));
×
511
                        }
×
512

513
                        used_nodes.insert(next_id);
×
514
                        current = (next_id, next_incoming);
×
515
                    }
516
                    _ => break,
×
517
                }
518
            }
519
        }
520

521
        Vec::from_iter(path)
×
522
    }
×
523

524

525
    /// Find the highest-scoring, unambiguous path in the graph. Each node get a score
526
    /// given by `score`. Any node where `solid_path(node) == True` are valid paths -
527
    /// paths will be terminated if there are multiple valid paths emanating from a node.
528
    /// Returns vec with path for each component
529
    pub fn max_path_comp<F, F2>(&self, score: F, solid_path: F2) -> Vec<VecDeque<(usize, Dir)>>
1✔
530
    where
1✔
531
        F: Fn(&D) -> f32,
1✔
532
        F2: Fn(&D) -> bool,
1✔
533
    {
534
        if self.is_empty() {
1✔
535
            let vec: Vec<VecDeque<(usize, Dir)>> = Vec::new();
×
536
            return vec;
×
537
        }
1✔
538

539
        let components = self.iter_components();
1✔
540
        let mut paths: Vec<VecDeque<(usize, Dir)>> = Vec::new();
1✔
541

542
        for component in components {
2✔
543

544
            let current_comp = &component;
2✔
545
            
546

547
            let mut best_node = current_comp[0];
2✔
548
            let mut best_score = f32::MIN;
2✔
549
            for c in current_comp.iter() {
135✔
550
                let node = self.get_node(*c);
135✔
551
                let node_score = score(node.data());
135✔
552

553
                if node_score > best_score {
135✔
554
                    best_node = *c;
2✔
555
                    best_score = node_score;
2✔
556
                }
133✔
557
            }
558

559
            let oscore = |state| match state {
248✔
560
                None => 0.0,
124✔
561
                Some((id, _)) => score(self.get_node(id).data()),
124✔
562
            };
248✔
563

564
            let osolid_path = |state| match state {
124✔
565
                None => false,
×
566
                Some((id, _)) => solid_path(self.get_node(id).data()),
124✔
567
            };
124✔
568

569
            // Now expand in each direction, greedily taking the best path. Stop if we hit a node we've
570
            // already put into the path
571
            let mut used_nodes = HashSet::new();
2✔
572
            let mut path = VecDeque::new();
2✔
573

574
            // Start w/ initial state
575
            used_nodes.insert(best_node);
2✔
576
            path.push_front((best_node, Dir::Left));
2✔
577

578
            for init in [(best_node, Dir::Left, false), (best_node, Dir::Right, true)].iter() {
4✔
579
                let &(start_node, dir, do_flip) = init;
4✔
580
                let mut current = (start_node, dir);
4✔
581

582
                loop {
583
                    let mut next = None;
128✔
584
                    let (cur_id, incoming_dir) = current;
128✔
585
                    let node = self.get_node(cur_id);
128✔
586
                    let edges = node.edges(incoming_dir.flip());
128✔
587

588
                    let mut solid_paths = 0;
128✔
589
                    for (_, id, dir, _) in edges {
128✔
590
                        let cand = Some((id, dir));
124✔
591
                        if osolid_path(cand) {
124✔
592
                            solid_paths += 1;
124✔
593
                        }
124✔
594

595
                        if oscore(cand) > oscore(next) {
124✔
596
                            next = cand;
124✔
597
                        }
124✔
598
                    }
599

600
                    if solid_paths > 1 {
128✔
601
                        break;
×
602
                    }
128✔
603

604
                    match next {
124✔
605
                        Some((next_id, next_incoming)) if !used_nodes.contains(&next_id) => {
124✔
606
                            if do_flip {
124✔
607
                                path.push_front((next_id, next_incoming.flip()));
67✔
608
                            } else {
67✔
609
                                path.push_back((next_id, next_incoming));
57✔
610
                            }
57✔
611

612
                            used_nodes.insert(next_id);
124✔
613
                            current = (next_id, next_incoming);
124✔
614
                        }
615
                        _ => break,
4✔
616
                    }
617
                }
618
            }
619
            
620
            paths.push(path);
2✔
621
            //paths.push(Vec::from_iter(path));
622
        }
623

624
        paths
1✔
625
    
626
    }
1✔
627

628
    pub fn iter_max_path_comp<F, F2>(&'_ self, score: F, solid_path: F2) -> PathCompIter<'_, K, D, F, F2> 
3✔
629
    where 
3✔
630
    F: Fn(&D) -> f32,
3✔
631
    F2: Fn(&D) -> bool
3✔
632
    {
633
        let component_iterator = self.iter_components();
3✔
634
        PathCompIter { graph: self, component_iterator, graph_pos: 0, score, solid_path }
3✔
635
    }
3✔
636

637
    /// write the paths from `iter_max_path_comp` to a fasta file
638
    pub fn path_to_fasta<F, F2>(&self, f: &mut dyn std::io::Write, path_iter: PathCompIter<K, D, F, F2>, return_lens: bool) -> (Vec<usize>, Vec<usize>)
1✔
639
    where 
1✔
640
    F: Fn(&D) -> f32,
1✔
641
    F2: Fn(&D) -> bool
1✔
642
    {
643
        // width of fasta lines
644
        let columns = 80;
1✔
645

646
        // sizes of components and of paths
647
        let mut comp_sizes = Vec::new();
1✔
648
        let mut path_lens = Vec::new();
1✔
649

650
        for (seq_counter, (component, path)) in path_iter.enumerate() {
2✔
651
            // get dna sequence from path
652
            let seq = self.sequence_of_path(path.iter());
2✔
653

654
            //write header with length & start node
655
            writeln!(f, ">path{} len={} start_node={}", seq_counter, seq.len(), path[0].0).unwrap();
2✔
656

657
            // calculate how sequence has to be split up
658
            let slices = (seq.len() / columns) + 1;
2✔
659
            let mut ranges = Vec::with_capacity(slices);
2✔
660

661
            let mut start = 0;
2✔
662
            while start < seq.len() {
6✔
663
                ranges.push(start..start + columns);
4✔
664
                start += columns;
4✔
665
            }
4✔
666

667
            let last_start = ranges.pop().expect("no kmers in parallel ranges").start;
2✔
668
            ranges.push(last_start..seq.len());
2✔
669

670
            // split up sequence and write to file accordingly
671
            for range in ranges {
4✔
672
                writeln!(f, "{:?}", seq.slice(range.start, range.end)).unwrap();
4✔
673
            }
4✔
674

675
            if return_lens {
2✔
676
                comp_sizes.push(component.len());
×
677
                path_lens.push(path.len());
×
678
            }
2✔
679
        }    
680

681
        (comp_sizes, path_lens)
1✔
682
        
683
    }
1✔
684

685

686
    /// Get the sequence of a path through the graph. The path is given as a sequence of node_id integers
687
    pub fn sequence_of_path<'a, I: 'a + Iterator<Item = &'a (usize, Dir)>>(
22,505✔
688
        &self,
22,505✔
689
        path: I,
22,505✔
690
    ) -> DnaString {
22,505✔
691
        let mut seq = DnaString::new();
22,505✔
692

693
        for (idx, &(node_id, dir)) in path.enumerate() {
771,922✔
694
            let start = if idx == 0 { 0 } else { K::k() - 1 };
771,922✔
695

696
            let node_seq = match dir {
771,922✔
697
                Dir::Left => self.get_node(node_id).sequence(),
400,494✔
698
                Dir::Right => self.get_node(node_id).sequence().rc(),
371,428✔
699
            };
700

701
            for p in start..node_seq.len() {
1,501,554✔
702
                seq.push(node_seq.get(p))
1,501,554✔
703
            }
704
        }
705

706
        seq
22,505✔
707
    }
22,505✔
708

709
    /// map sequences from a fasta file to a **completely uncompressed** and **stranded** debruijn graph
710
    pub fn map_transcripts<P>(&self, path: P, translator: &mut Translator) -> Result<Vec<Box<[ID]>>, String> 
2✔
711
    where 
2✔
712
        P: AsRef<Path>
2✔
713
    {
714
        // return err if not stranded
715
        if !self.base.stranded { return Err("graph has to be stranded".to_string()) };
2✔
716

717
        let reader = fasta::Reader::new(BufReader::new(File::open(path).unwrap()));
2✔
718
        let mut node_transcript_ids = vec![Vec::new(); self.len()];
2✔
719

720
        // if the translator has a id translator, use it, else make a new one to use and put it into the translator
721
        let id_tr = if let Some(id_tr) = translator.mut_id_translator() {
2✔
722
            id_tr
1✔
723
        } else {
724
            let new_id_tr = BiHashMap::new();
1✔
725
            translator.mut_id_translator().replace(new_id_tr);
1✔
726
            
727
            let Some(id_tr) = translator.mut_id_translator() else { panic!("should not happen") };
1✔
728

729
            id_tr
1✔
730
        };
731

732
        // go through each transcript and map to graph
733
        for result in reader.records() {
8✔
734
            let record = result.expect("error parsing transcripts fasta");
8✔
735

736
            // get gene id or make new id
737
            let gene_id = match id_tr.get_by_left(&record.id().to_string()) {
8✔
738
                Some(id) => *id,
4✔
739
                None => {
740
                    let new_id = id_tr.len() as ID;
4✔
741
                    id_tr.insert(record.id().to_string(), new_id);
4✔
742
                    new_id
4✔
743
                }
744
            };
745

746
            // iterate over k-mers in transcript and find each one in the graph
747
            let sequence = DnaString::from_acgt_bytes(record.seq());
8✔
748
            for kmer in sequence.iter_kmers::<K>() {
966✔
749
                if let Some(node) = self.search_kmer(kmer, Dir::Right) {
966✔
750
                    node_transcript_ids[node].push(gene_id);
918✔
751
                }
918✔
752
            }
753
        }
754

755
        // change to boxed slices to reduce memory
756
        let boxed_transcripts = node_transcript_ids.into_iter().map(|vec| vec.into()).collect();
974✔
757

758
        Ok(boxed_transcripts)
2✔
759
    }
2✔
760

761
    /// write a node to a dot file
762
    /// 
763
    /// ### Arguments: 
764
    /// 
765
    /// * `node`: [`Node<K, D>`] which will be written to a dot file
766
    /// * `node_label`: closure taking [`Node<K, D>`] and returning a string containing commands for dot nodes 
767
    /// * `edge_label`: closure taking [`Node<K, D>`], the base as a [`u8`], the incoming [`Dir`] of the edge 
768
    ///   and if the neighbor is flipped - returns a string containing commands for dot edges, 
769
    /// * `f`: writer
770
    fn node_to_dot<FN: Fn(&Node<K, D>) -> String, FE: Fn(&Node<K, D>, u8, Dir, bool) -> String>(
313✔
771
        &self,
313✔
772
        node: &Node<'_, K, D>,
313✔
773
        node_label: &FN,
313✔
774
        edge_label: &FE,
313✔
775
        f: &mut dyn Write,
313✔
776
    ) {
313✔
777
        writeln!(f, "n{} {}", node.node_id, node_label(node)).unwrap();
313✔
778
        assert_eq!(node.exts().val.count_ones() as usize, node.l_edges().len() + node.r_edges().len());
313✔
779

780
        for (base, id, incoming_dir, flipped) in node.l_edges() {
313✔
781
            writeln!(f, "n{} -> n{} {}", id, node.node_id, edge_label(node, base, incoming_dir, flipped)).unwrap();
307✔
782
        }
307✔
783

784
        for (base, id, incoming_dir, flipped) in node.r_edges() {
313✔
785
            writeln!(f, "n{} -> n{} {}", node.node_id, id, edge_label(node, base, incoming_dir, flipped)).unwrap();
307✔
786
        }
307✔
787
    }
313✔
788

789
    /// Write the graph to a dot file
790
    /// 
791
    /// ### Arguments: 
792
    /// 
793
    /// * `path`: path to the output file
794
    /// * `node_label`: closure taking [`Node<K, D>`] and returning a string containing commands for dot nodes, e.g. [`Node::node_dot_default`]
795
    /// * `edge_label`: closure taking [`Node<K, D>`], the base as a [`u8`], the incoming [`Dir`] of the edge, e.g. [`Node::edge_dot_default`]
796
    ///   and if the neighbor is flipped - returns a string containing commands for dot edges, 
797
    pub fn to_dot<P, FN, FE>(&self, path: P, node_label: &FN, edge_label: &FE) 
1✔
798
    where 
1✔
799
    P: AsRef<Path>,
1✔
800
    FN: Fn(&Node<K, D>) -> String,
1✔
801
    FE: Fn(&Node<K, D>, u8, Dir, bool) -> String,
1✔
802
    {
803
        let mut f = BufWriter::with_capacity(BUF, File::create(path).expect("error creating dot file"));
1✔
804

805
        let pb = ProgressBar::new(self.len() as u64);
1✔
806
        pb.set_style(ProgressStyle::with_template(PROGRESS_STYLE).unwrap().progress_chars("#/-"));
1✔
807
        pb.set_message(format!("{:<32}", "writing graph to DOT file"));
1✔
808

809
        writeln!(&mut f, "digraph {{\nrankdir=\"LR\"\nmodel=subset").unwrap();
1✔
810
        for i in (0..self.len()).progress_with(pb) {
103✔
811
            self.node_to_dot(&self.get_node(i), node_label, edge_label, &mut f);
103✔
812
        }
103✔
813
        writeln!(&mut f, "}}").unwrap();
1✔
814
        
815
        f.flush().unwrap();
1✔
816
        debug!("large to dot loop: {}", self.len());
1✔
817
    }
1✔
818

819
    /// Write the graph to a dot file, highlight the nodes which form the 
820
    /// "best" path, according to [`PathCompIter`], with the number of occurences 
821
    /// as the score and `solid_path` always `true`.
822
    /// The nodes are formatted according to [`Node::node_dot_default`].
823
    /// 
824
    /// ### Arguments: 
825
    /// 
826
    /// * `path`: path to the output file
827
    /// * `edge_label`: closure taking [`Node<K, D>`], the base as a [`u8`], the incoming [`Dir`] of the edge, e.g. [`Node::edge_dot_default`]
828
    ///   and if the neighbor is flipped - returns a string containing commands for dot edges, 
829
    /// * `colors`: a [`Colors`] with the color settings for the graph
830
    /// * `translator`: a [`Translator`] which translates tags or IDs to strings
831
    /// * `config`: a [`SummaryConfig`] which contains settings for the graph
832
    pub fn to_dot_with_path<P, FE, DI>(&self, path: P, edge_label: &FE, colors: &Colors<'_, D, DI>, translator: &Translator, config: &SummaryConfig, translate_id_groups: bool)
1✔
833
    where 
1✔
834
    P: AsRef<Path>,
1✔
835
    D: SummaryData<DI>,
1✔
836
    FE: Fn(&Node<K, D>, u8, Dir, bool) -> String,
1✔
837
    {
838
        let mut f = BufWriter::with_capacity(BUF, File::create(path).expect("error creating dot file"));
1✔
839

840
        writeln!(&mut f, "digraph {{\nrankdir=\"LR\"\nmodel=subset").unwrap();
1✔
841

842
        // iterate over components
843
        for (component, path) in self.iter_max_path_comp(|d| d.sum().unwrap_or(1) as f32, |_| true) {
194✔
844
            let hashed_path = path.into_iter().map(|(id, _)| id).collect::<HashSet<usize>>();
3✔
845
            for node_id in component {
103✔
846
                self.node_to_dot(
103✔
847
                    &self.get_node(node_id),
103✔
848
                    &|node| node.node_dot_default(colors, config, translator, hashed_path.contains(&node_id), translate_id_groups), 
103✔
849
                    edge_label, 
103✔
850
                    &mut f
103✔
851
                );
852
            }
853
        }
854

855
        writeln!(&mut f, "}}").unwrap();
1✔
856
        
857
        f.flush().unwrap();
1✔
858
        debug!("large to dot loop: {}", self.len());
1✔
859
    }
1✔
860

861
    /// Write the graph to a dot file in parallel
862
    /// Will write in to n_threads files simultaniously,
863
    /// then go though the files and add the contents to a larger file, 
864
    /// and delete the small files.
865
    /// 
866
    /// ### Arguments: 
867
    /// 
868
    /// * `path`: path to the output file
869
    /// * `node_label`: closure taking [`Node<K, D>`] and returning a string containing commands for dot nodes 
870
    /// * `edge_label`: closure taking [`Node<K, D>`], the base as a [`u8`], the incoming [`Dir`] of the edge 
871
    ///   and if the neighbor is flipped - returns a string containing commands for dot edges, 
872
    pub fn to_dot_parallel<P, FN, FE>(&self, path: P, node_label: &FN, edge_label: &FE) 
1✔
873
    where 
1✔
874
        D: Sync,
1✔
875
        K: Sync,
1✔
876
        P: AsRef<Path> + Display + Sync,
1✔
877
        FN: Fn(&Node<K, D>) -> String + Sync,
1✔
878
        FE: Fn(&Node<K, D>, u8, Dir, bool) -> String + Sync,
1✔
879
    {        
880
        let slices = current_num_threads();
1✔
881
        let n_nodes = self.len();
1✔
882
        let sz = n_nodes / slices + 1;
1✔
883

884
        debug!("n_nodes: {}", n_nodes);
1✔
885
        debug!("sz: {}", sz);
1✔
886

887
        let mut parallel_ranges = Vec::with_capacity(slices);
1✔
888
        let mut start = 0;
1✔
889
        while start < n_nodes {
5✔
890
            parallel_ranges.push(start..start + sz);
4✔
891
            start += sz;
4✔
892
        }
4✔
893

894
        let last_start = parallel_ranges.pop().expect("no kmers in parallel ranges").start;
1✔
895
        parallel_ranges.push(last_start..n_nodes);
1✔
896
        debug!("parallel ranges: {:?}", parallel_ranges);
1✔
897

898
        let mut files = Vec::with_capacity(current_num_threads());
1✔
899

900
        for i in 0..parallel_ranges.len() {
4✔
901
            files.push(format!("{}-{}.dot", path, i));
4✔
902
        } 
4✔
903

904
        let pb = ProgressBar::new(self.len() as u64);
1✔
905
        pb.set_style(ProgressStyle::with_template(PROGRESS_STYLE).unwrap().progress_chars("#/-"));
1✔
906
        pb.set_message(format!("{:<32}", "writing graph to DOT files"));
1✔
907
    
908
        parallel_ranges.into_par_iter().enumerate().for_each(|(i, range)| {
4✔
909
            let mut f = BufWriter::with_capacity(BUF, File::create(&files[i]).expect("error creating parallel dot file"));
4✔
910

911
            for i in range {
103✔
912
                self.node_to_dot(&self.get_node(i), node_label, edge_label, &mut f);
103✔
913
                pb.inc(1);
103✔
914
            }
103✔
915

916
            f.flush().unwrap();
4✔
917
        });
4✔
918
        pb.finish_and_clear();
1✔
919

920
        let mut out_file = BufWriter::with_capacity(BUF, File::create(path).expect("error creating combined dot file"));
1✔
921

922
        writeln!(&mut out_file, "digraph {{\nrankdir=\"LR\"\nmodel=subset").unwrap();
1✔
923

924
        let pb = ProgressBar::new(files.len() as u64);
1✔
925
        pb.set_style(ProgressStyle::with_template(PROGRESS_STYLE).unwrap().progress_chars("#/-"));
1✔
926
        pb.set_message(format!("{:<32}", "combining files"));
1✔
927

928
        for file in files.iter().progress_with(pb) {
4✔
929
            let open_file = File::open(file).expect("error opening parallel dot file");
4✔
930
            let mut reader = BufReader::new(open_file);
4✔
931
            let mut buffer = [0; BUF];
4✔
932

933
            loop {
934
                let linecount = reader.read(&mut buffer).unwrap();
8✔
935
                if linecount == 0 { break }
8✔
936
                out_file.write_all(&buffer[..linecount]).unwrap();
4✔
937
            }
938

939
            remove_file(file).unwrap();
4✔
940
        }
941

942
        writeln!(&mut out_file, "}}").unwrap();
1✔
943

944
        out_file.flush().unwrap();
1✔
945

946

947
    }
1✔
948

949

950
    /// Write part of the graph to a dot file
951
    /// 
952
    /// ### Arguments: 
953
    /// 
954
    /// * `path`: path to the output file
955
    /// * `node_label`: closure taking [`Node<K, D>`] and returning a string containing commands for dot nodes 
956
    /// * `edge_label`: closure taking [`Node<K, D>`], the base as a [`u8`], the incoming [`Dir`] of the edge 
957
    ///   and if the neighbor is flipped - returns a string containing commands for dot edges, 
958
    /// * `nodes`: [`Vec<usize>`] listing all IDs of nodes which should be included
959
    pub fn to_dot_partial<P, FN, FE>(&self, path: P, node_label: &FN, edge_label: &FE, nodes: &[usize]) 
1✔
960
    where 
1✔
961
        P: AsRef<Path>,
1✔
962
        FN: Fn(&Node<K, D>) -> String,
1✔
963
        FE: Fn(&Node<K, D>, u8, Dir, bool) -> String,
1✔
964
    {
965
        let mut f = BufWriter::with_capacity(BUF, File::create(path).expect("error creating dot file"));
1✔
966

967
        let pb = ProgressBar::new(nodes.len() as u64);
1✔
968
        pb.set_style(ProgressStyle::with_template(PROGRESS_STYLE).unwrap().progress_chars("#/-"));
1✔
969
        pb.set_message(format!("{:<32}", "writing graph to DOT file"));
1✔
970

971
        writeln!(&mut f, "digraph {{\nrankdir=\"LR\"\nmodel=subset").unwrap();
1✔
972
        for i in nodes.iter().progress_with(pb) {
4✔
973
            self.node_to_dot(&self.get_node(*i), node_label, edge_label, &mut f);
4✔
974
        }
4✔
975
        writeln!(&mut f, "}}").unwrap();
1✔
976

977
        f.flush().unwrap();
1✔
978

979
        debug!("large to dot loop: {}", self.len());
1✔
980
    }
1✔
981

982
    fn node_to_gfa<F: Fn(&Node<'_, K, D>) -> String>(
313✔
983
        &self,
313✔
984
        node: &Node<'_, K, D>,
313✔
985
        w: &mut dyn Write,
313✔
986
        tag_func: Option<&F>,
313✔
987
    ) -> Result<(), Error> {
313✔
988
        match tag_func {
313✔
989
            Some(f) => {
210✔
990
                let tags = (f)(node);
210✔
991
                writeln!(
210✔
992
                    w,
210✔
993
                    "S\t{}\t{}\t{}",
994
                    node.node_id,
995
                    node.sequence().to_dna_string(),
210✔
996
                    tags
997
                )?;
×
998
            }
999
            _ => writeln!(
103✔
1000
                w,
103✔
1001
                "S\t{}\t{}",
1002
                node.node_id,
1003
                node.sequence().to_dna_string()
103✔
1004
            )?,
×
1005
        }
1006

1007
        for (_, target, dir, _) in node.l_edges() {
313✔
1008
            if target >= node.node_id {
307✔
1009
                let to_dir = match dir {
151✔
UNCOV
1010
                    Dir::Left => "+",
×
1011
                    Dir::Right => "-",
151✔
1012
                };
1013
                writeln!(
151✔
1014
                    w,
151✔
1015
                    "L\t{}\t-\t{}\t{}\t{}M",
1016
                    node.node_id,
1017
                    target,
1018
                    to_dir,
1019
                    K::k() - 1
151✔
1020
                )?;
×
1021
            }
156✔
1022
        }
1023

1024
        for (_, target, dir, _) in node.r_edges() {
313✔
1025
            if target > node.node_id {
307✔
1026
                let to_dir = match dir {
160✔
1027
                    Dir::Left => "+",
160✔
UNCOV
1028
                    Dir::Right => "-",
×
1029
                };
1030
                writeln!(
160✔
1031
                    w,
160✔
1032
                    "L\t{}\t+\t{}\t{}\t{}M",
1033
                    node.node_id,
1034
                    target,
1035
                    to_dir,
1036
                    K::k() - 1
160✔
1037
                )?;
×
1038
            }
147✔
1039
        }
1040

1041
        Ok(())
313✔
1042
    }
313✔
1043

1044
    /// Write the graph to GFA format
1045
    pub fn to_gfa<P: AsRef<Path>>(&self, gfa_out: P) -> Result<(), Error> {
1✔
1046
        let wtr = BufWriter::with_capacity(BUF, File::create(gfa_out).expect("error creating gfa file"));
1✔
1047
        self.write_gfa(&mut std::io::BufWriter::new(wtr))
1✔
1048
    }
1✔
1049

1050
    pub fn write_gfa(&self, wtr: &mut impl Write) -> Result<(), Error> {
1✔
1051
        writeln!(wtr, "H\tVN:Z:debruijn-rs")?;
1✔
1052

1053
        type DummyFn<K, D> = fn(&Node<'_, K, D>) -> String;
1054
        let dummy_opt: Option<&DummyFn<K, D>> = None;
1✔
1055

1056
        let pb = ProgressBar::new(self.len() as u64);
1✔
1057
        pb.set_style(ProgressStyle::with_template(PROGRESS_STYLE).unwrap().progress_chars("#/-"));
1✔
1058
        pb.set_message(format!("{:<32}", "writing graph to GFA file"));
1✔
1059

1060
        for i in (0..self.len()).progress_with(pb) {
103✔
1061
            let n = self.get_node(i);
103✔
1062
            self.node_to_gfa(&n, wtr, dummy_opt)?;
103✔
1063
        }
1064

1065
        wtr.flush().unwrap();
1✔
1066

1067
        Ok(())
1✔
1068
    }
1✔
1069

1070
    /// Write the graph to GFA format
1071
    pub fn to_gfa_with_tags<P: AsRef<Path>, F: Fn(&Node<'_, K, D>) -> String>(
1✔
1072
        &self,
1✔
1073
        gfa_out: P,
1✔
1074
        tag_func: F,
1✔
1075
    ) -> Result<(), Error> {
1✔
1076
        let mut wtr = BufWriter::with_capacity(BUF, File::create(gfa_out).expect("error creatinf gfa file"));
1✔
1077
        writeln!(wtr, "H\tVN:Z:debruijn-rs")?;
1✔
1078

1079
        let pb = ProgressBar::new(self.len() as u64);
1✔
1080
        pb.set_style(ProgressStyle::with_template(PROGRESS_STYLE).unwrap().progress_chars("#/-"));
1✔
1081
        pb.set_message(format!("{:<32}", "writing graph to GFA file"));
1✔
1082

1083
        for i in (0..self.len()).progress_with(pb) {
103✔
1084
            let n = self.get_node(i);
103✔
1085
            self.node_to_gfa(&n, &mut wtr, Some(&tag_func))?;
103✔
1086
        }
1087

1088
        wtr.flush().unwrap();
1✔
1089

1090
        Ok(())
1✔
1091
    }
1✔
1092

1093
    /// Write the graph to GFA format, with multithreading, 
1094
    /// pass `tag_func=None` to write without tags
1095
    pub fn to_gfa_otags_parallel<P: AsRef<Path> + Display + Sync, F: Fn(&Node<'_, K, D>) -> String + Sync>(
1✔
1096
        &self,
1✔
1097
        gfa_out: P,
1✔
1098
        tag_func: Option<&F>,
1✔
1099
    ) -> Result<(), Error> 
1✔
1100
    where 
1✔
1101
    K: Sync,
1✔
1102
    D: Sync,
1✔
1103
    {
1104
        // split into ranges according to thread count
1105
        let slices = current_num_threads();
1✔
1106
        let n_nodes = self.len();
1✔
1107
        let sz = n_nodes / slices + 1;
1✔
1108

1109
        debug!("n_nodes: {}", n_nodes);
1✔
1110
        debug!("sz: {}", sz);
1✔
1111

1112
        let mut parallel_ranges = Vec::with_capacity(slices);
1✔
1113
        let mut start = 0;
1✔
1114
        while start < n_nodes {
5✔
1115
            parallel_ranges.push(start..start + sz);
4✔
1116
            start += sz;
4✔
1117
        }
4✔
1118

1119
        let last_start = parallel_ranges.pop().expect("no kmers in parallel ranges").start;
1✔
1120
        parallel_ranges.push(last_start..n_nodes);
1✔
1121
        debug!("parallel ranges: {:?}", parallel_ranges);
1✔
1122

1123
        let mut files = Vec::with_capacity(current_num_threads());
1✔
1124

1125
        for i in 0..parallel_ranges.len() {
4✔
1126
            files.push(format!("{}-{}.gfa", gfa_out, i));
4✔
1127
        } 
4✔
1128

1129
        let pb = ProgressBar::new(self.len() as u64);
1✔
1130
        pb.set_style(ProgressStyle::with_template(PROGRESS_STYLE).unwrap().progress_chars("#/-"));
1✔
1131
        pb.set_message(format!("{:<32}", "writing graph to GFA file"));
1✔
1132
        
1133
        
1134
        parallel_ranges.into_par_iter().enumerate().for_each(|(i, range)| {
4✔
1135
            let mut wtr = BufWriter::with_capacity(BUF, File::create(&files[i]).expect("error creating parallel gfa file"));
4✔
1136

1137
            for i in range {
103✔
1138
                let n = self.get_node(i);
103✔
1139
                self.node_to_gfa(&n, &mut wtr, tag_func).unwrap();
103✔
1140
                pb.inc(1);
103✔
1141
            }
103✔
1142

1143
            wtr.flush().unwrap();
4✔
1144
        });
4✔
1145

1146
        pb.finish_and_clear();
1✔
1147

1148
        // combine files
1149
        let mut out_file = BufWriter::with_capacity(BUF, File::create(format!("{}.gfa", gfa_out)).expect("error creating combined gfa file"));
1✔
1150
        writeln!(out_file, "H\tVN:Z:debruijn-rs")?;
1✔
1151

1152
        let pb = ProgressBar::new(files.len() as u64);
1✔
1153
        pb.set_style(ProgressStyle::with_template(PROGRESS_STYLE).unwrap().progress_chars("#/-"));
1✔
1154
        pb.set_message(format!("{:<32}", "combining files"));
1✔
1155

1156
        for file in files.iter() {
4✔
1157
            let open_file = File::open(file).expect("error opening parallel gfa file");
4✔
1158
            let mut reader = BufReader::new(open_file);
4✔
1159
            let mut buffer = [0; BUF];
4✔
1160

1161
            loop {
1162
                let linecount = reader.read(&mut buffer).unwrap();
8✔
1163
                if linecount == 0 { break }
8✔
1164
                out_file.write_all(&buffer[..linecount]).unwrap();
4✔
1165
            }
1166

1167
            remove_file(file).unwrap();
4✔
1168
        }
1169

1170
        out_file.flush().unwrap();
1✔
1171

1172
        Ok(())
1✔
1173
    }
1✔
1174

1175
    /// Write the graph to GFA format
1176
    pub fn to_gfa_partial<P: AsRef<Path>, F: Fn(&Node<'_, K, D>) -> String>(&self, gfa_out: P, tag_func: Option<&F>, nodes: Vec<usize>) -> Result<(), Error> {
1✔
1177
        let mut wtr = BufWriter::with_capacity(BUF, File::create(gfa_out).expect("error creating gfa file"));
1✔
1178
        writeln!(wtr, "H\tVN:Z:debruijn-rs")?;
1✔
1179

1180
        let pb = ProgressBar::new(self.len() as u64);
1✔
1181
        pb.set_style(ProgressStyle::with_template(PROGRESS_STYLE).unwrap().progress_chars("#/-"));
1✔
1182
        pb.set_message(format!("{:<32}", "writing graph to GFA file"));
1✔
1183

1184
        for i in nodes.into_iter().progress_with(pb) {
4✔
1185
            let n = self.get_node(i);
4✔
1186
            self.node_to_gfa(&n, &mut wtr, tag_func)?;
4✔
1187
        }
1188

1189
        wtr.flush().unwrap();
1✔
1190

1191
        Ok(())    
1✔
1192
    }
1✔
1193

1194
    fn node_to_tsv<W: Write, F>(&self, writer: &mut W, node_id: usize, data_format: F) -> Result<(), Box<dyn std::error::Error>> 
557✔
1195
    where 
557✔
1196
        F: Fn(&Node<'_, K, D>) -> String,
557✔
1197
    {
1198

1199
        let node = self.get_node(node_id);
557✔
1200
        let l_e = node.l_edges();
557✔
1201
        let r_e = node.r_edges();
557✔
1202

1203
        
1204
        if self.base.stranded {
557✔
1205
            // format: node id    l nb    r nb    seq    data
1206
            let l_nb = l_e.iter().map(|(_b, nb, _d, _f)| *nb).collect::<Vec<_>>();
230✔
1207
            let r_nb = r_e.iter().map(|(_b, nb, _d, _f)| *nb).collect::<Vec<_>>();
230✔
1208
            writeln!(writer, "{node_id}\t{:?}\t{:?}\t{}\t{}", l_nb, r_nb, node.sequence(), data_format(&node))?
230✔
1209
        } else {
1210
            // format: node id    l nb    l inc dir    r nb    r inc dir    seq    data
1211
            let l_nb = l_e.iter().map(|(_b, nb, _d, _f)| *nb).collect::<Vec<_>>();
327✔
1212
            let r_nb = r_e.iter().map(|(_b, nb, _d, _f)| *nb).collect::<Vec<_>>();
327✔
1213
            let l_nb_dirs = l_e.iter().map(|(_b, _nb, dir, _f)| *dir).collect::<Vec<_>>();
327✔
1214
            let r_nb_dirs = r_e.iter().map(|(_b, _nb, dir, _f)| *dir).collect::<Vec<_>>();
327✔
1215
            writeln!(writer, "{node_id}\t{:?}\t{:?}\t{:?}\t{:?}\t{}\t{}", l_nb, l_nb_dirs, r_nb, r_nb_dirs, node.sequence(), data_format(&node))?
327✔
1216
        }
1217

1218
        Ok(())
557✔
1219
    }
557✔
1220

1221
    /// save the graph as a tsv file with custom formatting for the node data
1222
    pub fn to_tsv<P, F>(&self, path: P, data_format: F) -> Result<(), Box<dyn std::error::Error>> 
2✔
1223
    where 
2✔
1224
        F: Fn(&Node<'_, K, D>) -> String,
2✔
1225
        P: AsRef<Path> + Display,
2✔
1226
    { 
1227
        let mut writer = BufWriter::new(File::create(path)?);
2✔
1228

1229
        // different format if stranded vs unstranded
1230
        if self.base.stranded {
2✔
1231
            writeln!(writer, "node id\tleft neighbors\tright neighbors\tsequence\tdata")?;
1✔
1232
        } else {
1233
            writeln!(writer, "node id\tleft neighbors\tleft nb incoming dirs\tright neighbors\tright nb incoming dirs\tsequence\tdata")?;
1✔
1234
        }
1235
            
1236
        for i in 0..self.len() {
557✔
1237
            self.node_to_tsv(&mut writer, i, &data_format)?
557✔
1238
        }
1239
        
1240
        Ok(())
2✔
1241
    }
2✔
1242

1243
    pub fn to_json_rest<W: Write, F: Fn(&D) -> Value>(
×
1244
        &self,
×
1245
        fmt_func: F,
×
1246
        mut writer: &mut W,
×
1247
        rest: Option<Value>,
×
1248
    ) {
×
1249
        writeln!(writer, "{{\n\"nodes\": [").unwrap();
×
1250
        for i in 0..self.len() {
×
1251
            let node = self.get_node(i);
×
1252
            node.to_json(&fmt_func, writer);
×
1253
            if i == self.len() - 1 {
×
1254
                writeln!(writer).unwrap();
×
1255
            } else {
×
1256
                writeln!(writer, ",").unwrap();
×
1257
            }
×
1258
        }
1259
        writeln!(writer, "],").unwrap();
×
1260

1261
        writeln!(writer, "\"links\": [").unwrap();
×
1262
        for i in 0..self.len() {
×
1263
            let node = self.get_node(i);
×
1264
            match node.edges_to_json(writer) {
×
1265
                true => {
1266
                    if i == self.len() - 1 {
×
1267
                        writeln!(writer).unwrap();
×
1268
                    } else {
×
1269
                        writeln!(writer, ",").unwrap();
×
1270
                    }
×
1271
                }
1272
                _ => continue,
×
1273
            }
1274
        }
1275
        writeln!(writer, "]").unwrap();
×
1276

1277
        match rest {
×
1278
            Some(Value::Object(v)) => {
×
1279
                for (k, v) in v.iter() {
×
1280
                    writeln!(writer, ",").expect("io error");
×
1281
                    write!(writer, "\"{}\": ", k).expect("io error");
×
1282
                    serde_json::to_writer(&mut writer, v).expect("io error");
×
1283
                    writeln!(writer).expect("io error");
×
1284
                }
×
1285
            }
1286
            _ => {
×
1287
                writeln!(writer).expect("io error");
×
1288
            }
×
1289
        }
1290

1291
        writeln!(writer, "}}").expect("io error");
×
1292
    }
×
1293

1294
    /// Write the graph to JSON
1295
    pub fn to_json<W: Write, F: Fn(&D) -> Value, RF: Fn(&mut W)>(
×
1296
        &self,
×
1297
        fmt_func: F,
×
1298
        writer: &mut W,
×
1299
    ) {
×
1300
        self.to_json_rest(fmt_func, writer, None);
×
1301
    }
×
1302

1303
    // iterate over graph or parial node IDs while leaving out the last node
1304
    fn iter_optional_partial<'a>(&self, partial_nodes: Option<&'a Vec<usize>>) -> Box<dyn Iterator<Item = usize> + 'a> {
4✔
1305
        if let Some(partial) = partial_nodes {
4✔
1306
            Box::new(partial[..(partial.len()-1)].iter().copied())
2✔
1307
        } else {
1308
            Box::new(0..(self.len()-1))
2✔
1309
        }
1310
    }
4✔
1311

1312
    /// write the graph or parts of the graph to a json file to view in 3d
1313
    pub fn to_json_3d<P, FN, FE>(&self, 
2✔
1314
        path: P, 
2✔
1315
        node_properties: &FN, 
2✔
1316
        edge_properties: &FE, 
2✔
1317
        partial_nodes: Option<&'_ Vec<usize>>
2✔
1318
    ) -> Result<(), Box<dyn std::error::Error>> 
2✔
1319
        where 
2✔
1320
        P: AsRef<Path>,
2✔
1321
        FN: Fn(&Node<K, D>) -> String,
2✔
1322
        FE: Fn(&Node<K, D>, usize, u8, Dir, bool) -> String,
2✔
1323
    {
1324
        let mut writer = BufWriter::new(File::create(path)?);
2✔
1325

1326
        writeln!(writer, "{{")?;
2✔
1327
        writeln!(writer, "\t\"nodes\": [")?;
2✔
1328

1329
        // write nodes to json
1330

1331
        for node_id in self.iter_optional_partial(partial_nodes) {
105✔
1332
            let node = self.get_node(node_id);
105✔
1333
            let node_fmt = node_properties(&node);
105✔
1334

1335
            writeln!(writer, "\t\t{{ {node_fmt} }},")?;
105✔
1336
        }
1337

1338
        // do last node separately because of comma
1339
        let last_node_id = match partial_nodes {
2✔
1340
            Some(partial) => *partial.last().expect("empty parial nodes vector"),
1✔
1341
            None => self.len() - 1
1✔
1342
        };
1343

1344
        let last_node = self.get_node(last_node_id);
2✔
1345
        let last_node_fmt = node_properties(&last_node);
2✔
1346

1347
        writeln!(writer, "\t\t{{ {last_node_fmt} }}")?;
2✔
1348

1349
        writeln!(writer, "\t],")?;
2✔
1350
        writeln!(writer, "\t\"links\": [")?;
2✔
1351

1352
        // write links to json
1353

1354
        for node_id in self.iter_optional_partial(partial_nodes) {
105✔
1355
            let node = self.get_node(node_id);
105✔
1356
            // write edges to the right
1357
            for (base, target_id, dir, flipped) in node.r_edges() {
105✔
1358
                let edge_fmt = edge_properties(&node, target_id, base, dir, flipped);
103✔
1359
                writeln!(writer, "\t\t{{ {edge_fmt} }},")?;
103✔
1360
            }
1361

1362
            // if stranded, continue, else also look at left edges
1363
            if self.base.stranded { continue; }
105✔
1364

1365
            // write edges to the right
NEW
1366
            for (base, target_id, dir, flipped) in node.l_edges() {
×
NEW
1367
                let edge_fmt = edge_properties(&node, target_id, base, dir, flipped);
×
NEW
1368
                writeln!(writer, "\t\t{{ {edge_fmt} }},")?;
×
1369
            }
1370
        }
1371

1372
        // eges for last node without comma
1373
        // FIXME only last edge should be without comma, not all edges from last node
1374
        // write edges to the right
1375
        for (base, target_id, dir, flipped) in last_node.r_edges() {
2✔
1376
            let edge_fmt = edge_properties(&last_node, target_id, base, dir, flipped);
2✔
1377
            writeln!(writer, "\t\t{{ {edge_fmt} }}")?;
2✔
1378
        }
1379

1380
        // if not stranded also look at left edges
1381
        if !self.base.stranded { 
2✔
1382
            // write edges to the right
NEW
1383
            for (base, target_id, dir, flipped) in last_node.l_edges() {
×
NEW
1384
                let edge_fmt = edge_properties(&last_node, target_id, base, dir, flipped);
×
NEW
1385
                writeln!(writer, "\t\t{{ {edge_fmt} }}")?;
×
1386
            }
1387
        }
2✔
1388

1389
        writeln!(writer, "\t]")?;
2✔
1390
        writeln!(writer, "}}")?;
2✔
1391

1392
        Ok(())
2✔
1393
    }
2✔
1394

1395
    /// Print a text representation of the graph.
1396
    pub fn print(&self) {
3✔
1397
        println!("DebruijnGraph {{ len: {}, K: {} }} :", self.len(), K::k());
3✔
1398
        for node in self.iter_nodes() {
139✔
1399
            println!("{:?}", node);
139✔
1400
        }
139✔
1401
    }
3✔
1402

1403
    pub fn print_with_data(&self) {
×
1404
        println!("DebruijnGraph {{ len: {}, K: {} }} :", self.len(), K::k());
×
1405
        for node in self.iter_nodes() {
×
1406
            println!("{:?} ({:?})", node, node.data());
×
1407
        }
×
1408
    }
×
1409

1410
    pub fn max_path_beam<F, F2>(&self, beam: usize, score: F, _solid_path: F2) -> Vec<(usize, Dir)>
×
1411
    where
×
1412
        F: Fn(&D) -> f32,
×
1413
        F2: Fn(&D) -> bool,
×
1414
    {
1415
        if self.is_empty() {
×
1416
            return Vec::default();
×
1417
        }
×
1418

1419
        let mut states = Vec::new();
×
1420

1421
        for i in 0..self.len() {
×
1422
            let node = self.get_node(i);
×
1423

1424
            // Initialize beam search on terminal nodes
1425
            if node.exts().num_exts_l() == 0 || node.exts().num_exts_r() == 0 {
×
1426
                let dir = if node.exts().num_exts_l() > 0 {
×
1427
                    Dir::Right
×
1428
                } else {
1429
                    Dir::Left
×
1430
                };
1431

1432
                let status = if node.exts().num_exts_l() == 0 && node.exts().num_exts_r() == 0 {
×
1433
                    Status::End
×
1434
                } else {
1435
                    Status::Active
×
1436
                };
1437

1438
                let mut path = SmallVec8::new();
×
1439
                path.push((i as u32, dir));
×
1440

1441
                let s = State {
×
1442
                    path,
×
1443
                    status,
×
1444
                    score: score(node.data()),
×
1445
                };
×
1446
                states.push(s);
×
1447
            }
×
1448
        }
1449

1450
        // No end nodes -- just start on the first node
1451
        if states.is_empty() {
×
1452
            // Make a start
×
1453
            let node = self.get_node(0);
×
1454
            let mut path = SmallVec8::new();
×
1455
            path.push((0, Dir::Left));
×
1456
            states.push(State {
×
1457
                path,
×
1458
                status: Status::Active,
×
1459
                score: score(node.data()),
×
1460
            });
×
1461
        }
×
1462

1463
        // Beam search until we can't find any more expansions
1464
        let mut active = true;
×
1465
        while active {
×
1466
            let mut new_states = Vec::with_capacity(states.len());
×
1467
            active = false;
×
1468

1469
            for s in states {
×
1470
                if s.status == Status::Active {
×
1471
                    active = true;
×
1472
                    let expanded = self.expand_state(&s, &score);
×
1473
                    new_states.extend(expanded);
×
1474
                } else {
×
1475
                    new_states.push(s)
×
1476
                }
1477
            }
1478

1479
            // workaround to sort by descending score - will panic if there are NaN scores
1480
            new_states.sort_by(|a, b| (-(a.score)).partial_cmp(&-(b.score)).unwrap());
×
1481
            new_states.truncate(beam);
×
1482
            states = new_states;
×
1483
        }
1484

1485
        for (i, state) in states.iter().take(5).enumerate() {
×
1486
            trace!("i:{}  -- {:?}", i, state);
×
1487
        }
1488

1489
        // convert back to using usize for node_id
1490
        states[0]
×
1491
            .path
×
1492
            .iter()
×
1493
            .map(|&(node, dir)| (node as usize, dir))
×
1494
            .collect()
×
1495
    }
×
1496

1497
    fn expand_state<F>(&self, state: &State, score: &F) -> SmallVec4<State>
×
1498
    where
×
1499
        F: Fn(&D) -> f32,
×
1500
    {
1501
        if state.status != Status::Active {
×
1502
            panic!("only attempt to expand active states")
×
1503
        }
×
1504

1505
        let (node_id, dir) = state.path[state.path.len() - 1];
×
1506
        let node = self.get_node(node_id as usize);
×
1507
        let mut new_states = SmallVec4::new();
×
1508

1509
        for (_, next_node_id, incoming_dir, _) in node.edges(dir.flip()) {
×
1510
            let next_node = self.get_node(next_node_id);
×
1511
            let new_score = state.score + score(next_node.data());
×
1512

1513
            let cycle = state
×
1514
                .path
×
1515
                .iter()
×
1516
                .any(|&(prev_node, _)| prev_node == (next_node_id as u32));
×
1517

1518
            let status = if cycle {
×
1519
                Status::Cycle
×
1520
            } else if next_node.edges(incoming_dir.flip()).is_empty() {
×
1521
                Status::End
×
1522
            } else {
1523
                Status::Active
×
1524
            };
1525

1526
            let mut new_path = state.path.clone();
×
1527
            new_path.push((next_node_id as u32, incoming_dir));
×
1528

1529
            let next_state = State {
×
1530
                path: new_path,
×
1531
                score: new_score,
×
1532
                status,
×
1533
            };
×
1534

1535
            new_states.push(next_state);
×
1536
        }
1537

1538
        new_states
×
1539
    }
×
1540

1541

1542
    pub fn iter_components(&'_ self) -> IterComponents<'_, K, D> {
7✔
1543
        let mut visited: Vec<bool> = Vec::with_capacity(self.len());
7✔
1544
        let pos = 0;
7✔
1545

1546
        for _i in 0..self.len() {
649✔
1547
            visited.push(false);
649✔
1548
        }
649✔
1549

1550
        IterComponents { 
7✔
1551
            graph: self, 
7✔
1552
            visited, 
7✔
1553
            pos }
7✔
1554
    }
7✔
1555

1556

1557
    /// iteratively returns 2D Vec with node_ids grouped according to the connected components they form
1558
    pub fn components_i(&self) -> Vec<Vec<usize>> {
1✔
1559
        let mut components: Vec<Vec<usize>> = Vec::with_capacity(self.len());
1✔
1560
        let mut visited: Vec<bool> = Vec::with_capacity(self.len());
1✔
1561

1562
        for _i in 0..self.len() {
135✔
1563
            visited.push(false);
135✔
1564
        }
135✔
1565

1566
        for i in 0..self.len() {
135✔
1567
            if !visited[i] {
135✔
1568
                let comp = self.component_i(&mut visited, i);
2✔
1569
                components.push(comp);
2✔
1570
            }
133✔
1571
        }
1572

1573
        components
1✔
1574
    }
1✔
1575

1576
    /// recursively detects which nodes form separate graph components
1577
    /// returns 2D vector with node ids per component
1578
    /// (may lead to stack overflow)
1579
    pub fn components_r(&self) -> Vec<Vec<usize>> {
2✔
1580
        let mut components: Vec<Vec<usize>> = Vec::with_capacity(self.len());
2✔
1581
        let mut visited: Vec<bool> = Vec::with_capacity(self.len());
2✔
1582

1583
        for _i in 0..self.len() {
137✔
1584
            visited.push(false);
137✔
1585
        }
137✔
1586

1587
        for i in 0..self.len() {
137✔
1588
            if !visited[i] {
137✔
1589
                let mut comp: Vec<usize> = Vec::new();
4✔
1590
                self.component_r(&mut visited, i, &mut comp);
4✔
1591
                components.push(comp);
4✔
1592
            }
133✔
1593
        }
1594

1595
        components
2✔
1596

1597
    }
2✔
1598

1599
    fn component_r<'a>(&'a self, visited: &'a mut Vec<bool>, i: usize, comp: &'a mut Vec<usize>) {
137✔
1600
        
1601
        visited[i] = true;
137✔
1602
        comp.push(i);
137✔
1603
        let mut edges = self.find_edges(i, Dir::Left);
137✔
1604
        let mut r_edges = self.find_edges(i, Dir::Right);
137✔
1605

1606
        edges.append(&mut r_edges);
137✔
1607

1608
        for (_, edge, _, _) in edges.iter() {
266✔
1609
            if !visited[*edge] {
266✔
1610
                self.component_r(visited, *edge, comp);
133✔
1611
            }
133✔
1612
        }
1613
    }
137✔
1614

1615
    fn component_i<'a>(&'a self, visited: &'a mut [bool], i: usize) -> Vec<usize> {
19✔
1616
        let mut edges: Vec<usize> = Vec::new();
19✔
1617
        let mut comp: Vec<usize> = Vec::new();
19✔
1618

1619
        edges.push(i);
19✔
1620

1621
        while let Some(current_edge) = edges.pop() {
804✔
1622
            if !visited[current_edge] { 
785✔
1623
                comp.push(current_edge);
784✔
1624
                visited[current_edge] = true;
784✔
1625

1626
                let mut l_edges = self.find_edges(current_edge, Dir::Left);
784✔
1627
                let mut r_edges = self.find_edges(current_edge, Dir::Right);
784✔
1628

1629
                l_edges.append(&mut r_edges);
784✔
1630

1631
                for (_, new_edge, _, _) in l_edges.into_iter() {
1,532✔
1632
                    if !visited[new_edge] {
1,532✔
1633
                        edges.push(new_edge);
766✔
1634
                    }
766✔
1635
                }
1636
            }
1✔
1637
        }
1638
        comp
19✔
1639
    }
19✔
1640

1641
    /// iterate over all edges of the graph, item: (node, ext base, ext dir, target node)
1642
    pub fn iter_edges(&self) -> EdgeIter<'_, K, D> {
17✔
1643
        EdgeIter::new(self)
17✔
1644
    }
17✔
1645

1646
    pub fn find_bad_nodes<F: Fn(&Node<'_, K, D>) -> bool>(&self, valid: F) -> Vec<usize> {
×
1647
        let mut bad_nodes = Vec::new();
×
1648

1649
        for (i, node) in enumerate(self.iter_nodes()) {
×
1650
            if !valid(&node) { bad_nodes.push(i); }
×
1651
        }
1652
        
1653
        bad_nodes
×
1654
    }
×
1655
}
1656

1657
impl<K: Kmer, SD: Debug> DebruijnGraph<K, SD> {
1658
    pub fn create_colors<'a, 'b: 'a, DI>(&'a self, config: &SummaryConfig, color_mode: ColorMode<'b>) -> Colors<'b, SD, DI> 
×
1659
    where 
×
1660
    SD: SummaryData<DI>,
×
1661
    {
1662
        Colors::new(self, config, color_mode)
×
1663
    }
×
1664
    
1665
    /// [`crate::EdgeMult`] will contain hanging edges if the nodes were filtered
1666
    pub fn fix_edge_mults<DI>(&mut self) 
1✔
1667
    where 
1✔
1668
        SD: SummaryData<DI>
1✔
1669
    {
1670
        if self.get_node(0).data().edge_mults().is_some() {
1✔
1671
            for i in 0..self.len() {
135✔
1672
                self.base.data[i].fix_edge_mults(self.base.exts[i]);
135✔
1673
            }
135✔
1674
        }
×
1675
    }
1✔
1676

1677
    /// if there are [`crate::EdgeMult`]s in the data, prune the graph by removing edges that have a low coverage
1678
    pub fn filter_edges<DI>(&mut self, min: u32) -> Result<(), String>
1✔
1679
    where 
1✔
1680
        SD: SummaryData<DI>
1✔
1681
    {
1682
        // return if there is no edge coverage available
1683
        if self.get_node(0).data().edge_mults().is_none() { return Err(String::from("no edge mults available")) };
1✔
1684

1685
        for i in 0..self.len() {
135✔
1686
            let em = self.get_node(i).data().edge_mults().expect("shold have em").clone();
135✔
1687
            let edges = [(Dir::Left, 0), (Dir::Left, 1), (Dir::Left, 2), (Dir::Left, 3), 
135✔
1688
                (Dir::Right, 0), (Dir::Right, 1), (Dir::Right, 2), (Dir::Right, 3)];
135✔
1689
            
1690
            for (dir, base) in edges {
1,080✔
1691
                if min > em.edge_mult(base, dir) {
1,080✔
1692
                    // remove invalid ext from node
814✔
1693
                    let ext = self.base.exts[i].remove(dir, base);
814✔
1694
                    self.base.exts[i] = ext;
814✔
1695
                }
814✔
1696
            }
1697
        }
1698

1699
        Ok(())
1✔
1700
    }
1✔
1701

1702
    /// if a node has a connection to a high quality node and low quality nodes 
1703
    /// in the same direction, remove the connections to the low quality ndoes
1704
    pub fn remove_lq_splits<DI>(&mut self, min_quality: BaseQuality) -> Result<(), String>
2✔
1705
    where 
2✔
1706
        SD: SummaryData<DI>
2✔
1707
    {
1708
        // check we do indeed have quality and graph is stranded
1709
        if self.get_node(0).data().quality().is_none() { return Err(String::from("no quality scores available")); }
2✔
1710
        if !self.base.stranded { return Err(String::from("graph must be stranded to remove ladders")) };
2✔
1711

1712
        // iterate over all nodes and look in both directions if there is a low quality node splitting off
1713
        for (node_id, out_dir) in (0..self.len()).flat_map(|id| [(id, Dir::Right), (id, Dir::Left)]) {
582✔
1714
            let out_edges = self.get_node(node_id).edges(out_dir);
582✔
1715

1716
            // check for split
1717
            if out_edges.len() < 2 {
582✔
1718
                // no split, move on
1719
                continue;
556✔
1720
            }
26✔
1721

1722
            // get qualities 
1723
            let nb_qualities = out_edges
26✔
1724
                .iter()
26✔
1725
                .map(|(_, nb_id, _, _)| (*nb_id, self
52✔
1726
                    .get_node(*nb_id)
52✔
1727
                    .data()
52✔
1728
                    .quality()
52✔
1729
                    .unwrap()
52✔
1730
                )).collect::<Vec<_>>();
26✔
1731

1732
            // check for good neighbor
1733
            let has_good_nb = nb_qualities.iter()
26✔
1734
                .any(|(_, quality)| *quality >= min_quality);
40✔
1735

1736
            if !has_good_nb {
26✔
1737
                // split but no good path, move on
1738
                continue;
4✔
1739
            }
22✔
1740

1741
            // remove split with low quality
1742
            for (nb_id, _) in nb_qualities.iter().filter(|(_, quality)| *quality < min_quality ) {
44✔
1743
                let path = vec![node_id, *nb_id];
22✔
1744
                if self.remove_path(path, out_dir).is_err() {
22✔
NEW
1745
                    warn!("lq tip path could not be removed")
×
1746
                }
22✔
1747
            }
1748
        }
1749

1750
        Ok(())
2✔
1751
    }
2✔
1752

1753
    /// remove bubbles/ladders and tips in which one path has a quality lower than the given `min_quality`
1754
    pub fn remove_lq_ladders_tips<DI>(&mut self, min_quality: BaseQuality, max_path_fac: usize) -> Result<(), String>
2✔
1755
    where
2✔
1756
        SD: SummaryData<DI>
2✔
1757
    {
1758
        // check we do indeed have quality and graph is stranded
1759
        if self.get_node(0).data().quality().is_none() { return Err(String::from("no quality scores available")); }
2✔
1760
        if !self.base.stranded { return Err(String::from("graph must be stranded to remove ladders")) };
2✔
1761

1762
        let min_path = 2 * K::k() - 1;
2✔
1763
        let max_path = max_path_fac * K::k() - 1;
2✔
1764

1765
        // iterate over nodes
1766
        for (node_id, out_dir) in (0..self.len()).flat_map(|id| [(id, Dir::Right), (id, Dir::Left)]) {
582✔
1767
            let in_dir = out_dir.flip();
582✔
1768

1769
            // check if node has multile outs to the right, at least one with bad quality and one with good quality
1770
            let node_out_edges = self.get_node(node_id).edges(out_dir);
582✔
1771

1772
            let good_neighbors = node_out_edges.iter()
582✔
1773
                .map(|(_, target_id, _, _)| *target_id)
582✔
1774
                .filter(|target_id| self.get_node(*target_id)
582✔
1775
                    .data()
481✔
1776
                    .quality()
481✔
1777
                    .unwrap() >= min_quality
481✔
1778
                ).collect::<Vec<_>>();
582✔
1779

1780
            let bad_neighbors = node_out_edges.iter()
582✔
1781
                .map(|(_, target_id, _, _)| *target_id)
582✔
1782
                .filter(|target_id| self.get_node(*target_id)
582✔
1783
                    .data()
481✔
1784
                    .quality()
481✔
1785
                    .unwrap() < min_quality
481✔
1786
                ).collect::<Vec<_>>();
582✔
1787

1788
            if good_neighbors.is_empty() | bad_neighbors.is_empty() { continue; }
582✔
1789

1790
            // follow the bad quality paths until we reach nodes with high quality again (or max search radius)
1791
            let mut possible_paths = Vec::new();
14✔
1792
            let mut tips = Vec::new();
14✔
1793

1794
            for bn in bad_neighbors {
14✔
1795
                let mut current_node_id = bn;
14✔
1796
                let mut path_groups = vec![vec![node_id]];
14✔
1797
                let mut path_length = K::k() - 1;
14✔
1798

1799
                let mut state = LadderState::Singular;
14✔
1800
                let mut q_state_high = false;
14✔
1801

1802
                loop {
1803
                    let current_node = self.get_node(current_node_id);
175✔
1804
                    let out_edges = current_node.edges(out_dir);
175✔
1805

1806
                    // add current node length to path
1807
                    path_length += current_node.len() - K::k() + 1;
175✔
1808

1809
                    // check if path has reached max length -> interrupt
1810
                    if path_length > max_path {
175✔
NEW
1811
                        break;
×
1812
                    }
175✔
1813
                    
1814
                    // check state and add current node to path
1815
                    let path_index = path_groups.len() - 1;
175✔
1816

1817
                    // if in singular state now or before, add node to path
1818
                    if matches!(state, LadderState::Singular) { 
175✔
1819
                        path_groups[path_index].push(current_node_id);
149✔
1820
                    }
149✔
1821

1822
                    // check if we increase ladder state
1823
                    let q_increase = (current_node.data().quality().unwrap() >= min_quality) & !q_state_high;
175✔
1824
                    let mult_increase = current_node.edges(in_dir).len() > 1;
175✔
1825
                    
1826
                    if q_increase {
175✔
1827
                        q_state_high = true
15✔
1828
                    }
160✔
1829

1830
                    if q_increase | mult_increase {
175✔
1831
                        match state {
17✔
1832
                            LadderState::Singular => {
17✔
1833
                                state = LadderState::Double;
17✔
1834
                            }
17✔
NEW
1835
                            LadderState::Double => () // ignore
×
1836
                        };
1837
                    }
158✔
1838

1839
                    // check if we decrease ladder state
1840
                    let q_decrease = (current_node.data().quality().unwrap() < min_quality) & q_state_high;
175✔
1841
                    let mult_decrease = out_edges.len() > 1;
175✔
1842

1843
                    if q_decrease {
175✔
1844
                        q_state_high = false;
5✔
1845
                    }
170✔
1846

1847
                    if q_decrease | mult_decrease {
175✔
1848
                        match state {
13✔
1849
                            LadderState::Singular => (), // ignore
5✔
1850
                            LadderState::Double => {
8✔
1851
                                state = LadderState::Singular;
8✔
1852
                                path_groups.push(vec![current_node_id]); // start new path group
8✔
1853
                            }
8✔
1854
                        }
1855
                    }
162✔
1856

1857
                    // check if we have met end criterium -> save path
1858
                    let quality_req =  current_node.data().quality().unwrap() >= min_quality;
175✔
1859
                    let len_req = path_length >= min_path;
175✔
1860
                    // path is a simple tip -> save as tip
1861
                    let is_tip = out_edges.is_empty() & (path_groups.len() == 1); // TODO maybe remove req 2 in future
175✔
1862

1863
                    if quality_req & len_req {
175✔
1864
                        possible_paths.push(path_groups);
10✔
1865
                        break;
10✔
1866
                    } else if is_tip {
165✔
1867
                        tips.push(path_groups);
2✔
1868
                        break;
2✔
1869
                    }
163✔
1870

1871
                    // we have not met the conditions and keep moving
1872

1873
                    // find next node
1874
                    let next_node_id = if out_edges.len() == 1 {
163✔
1875
                        let (_, next_node_id, _, _) = out_edges[0];
154✔
1876
                        next_node_id
154✔
1877
                    } else if out_edges.is_empty() {
9✔
1878
                        // dead end which has not qualified as tip
1879
                        break;
2✔
1880
                    } else {
1881
                        // choose the lowest quality path
1882

1883
                        let worst_neighbor = out_edges.iter()
7✔
1884
                            .map(|(_, target_id, _, _)| (*target_id, self.get_node(*target_id)
14✔
1885
                                .data()
14✔
1886
                                .quality()
14✔
1887
                                .unwrap()))
14✔
1888
                            .min_by(|(_, q_a), (_, q_b)| q_a.cmp(q_b));
7✔
1889

1890
                        if let Some((worst_nb_id, _)) = worst_neighbor {
7✔
1891
                            worst_nb_id
7✔
1892
                        } else {
NEW
1893
                            break;
×
1894
                        }
1895
                    };
1896

1897
                    current_node_id = next_node_id;
161✔
1898
                }
1899
            }
1900

1901
            let possible_targets = possible_paths.iter().map(|p| p.last().unwrap().last().unwrap()).collect::<Vec<_>>();
14✔
1902
            let mut confirmed_targets = Vec::new();
14✔
1903
            // follow the good quality paths until we reach a possible target node (or max search radius)
1904
            // if we reached a target node, send bad path to be removed from graph
1905
            for gn in good_neighbors {
14✔
1906
                let mut path_length = K::k() - 1;
14✔
1907
                let mut current_node_id = gn;
14✔
1908

1909
                loop {
1910
                    // check if we have exceeded the search radius
1911
                    if path_length > max_path {
180✔
1912
                        break;
1✔
1913
                    }
179✔
1914

1915
                    // add current node length to path length
1916
                    let current_node = self.get_node(current_node_id);
179✔
1917
                    path_length += current_node.len() - K::k() + 1;
179✔
1918

1919
                    // check if we have found a target
1920
                    if possible_targets.contains(&&current_node_id) {
179✔
1921
                        confirmed_targets.push(current_node_id);
8✔
1922
                        break;
8✔
1923
                    }
171✔
1924

1925
                    // look for next node
1926
                    let out_edges = current_node.edges(out_dir);
171✔
1927

1928
                    let next_node_id = if out_edges.len() == 1 {
171✔
1929
                        let (_, next_node_id, _, _) = out_edges[0];
166✔
1930
                        next_node_id
166✔
1931
                    } else if out_edges.is_empty() {
5✔
1932
                        // dead end, break
1933
                        break;
5✔
1934
                    } else {
1935
                        // use node with highest quality
1936
                        // TODO future: use read mapping?
NEW
1937
                        let good_neighbors = out_edges.iter()
×
NEW
1938
                            .map(|(_, target_id, _, _)| (*target_id, self.get_node(*target_id)
×
NEW
1939
                                .data()
×
NEW
1940
                                .quality()
×
NEW
1941
                                .unwrap()))
×
NEW
1942
                            .max_by(|(_, q_a), (_, q_b)| q_a.cmp(q_b));
×
NEW
1943
                        if let Some((best_nb_id, _)) = good_neighbors {
×
NEW
1944
                            best_nb_id
×
1945
                        } else {
NEW
1946
                            break;
×
1947
                        }
1948
                    };
1949

1950
                    current_node_id = next_node_id;
166✔
1951
                }
1952
            }
1953

1954
            // check if we have found end nodes of possible paths by following good quality paths
1955
            // if so, remove path
1956
            for path_group in possible_paths {
14✔
1957
                let target = path_group.last().unwrap().last().unwrap();
10✔
1958

1959
                if confirmed_targets.contains(target) {
10✔
1960
                    for path in path_group {
13✔
1961
                        if let Err(err) = self.remove_path(path.clone(), out_dir) {
13✔
NEW
1962
                            warn!("lq ladder partial path could not be removed, likely cause: loop, edges were already removed. parital path: {:?}", path)
×
1963
                        }
13✔
1964
                    }
1965
                }
2✔
1966
            }
1967

1968
            // remove tip paths
1969
            for path_group in tips {
14✔
1970
                let path = path_group.into_iter().next().expect("empty tip path found");
2✔
1971
                if let Err(err) = self.remove_path(path.clone(), out_dir) {
2✔
NEW
1972
                    warn!("lq tip partial path could not be removed, likely cause: loop, edges were already removed. parital path: {:?}", path)
×
1973
                }
2✔
1974
            }
1975

1976

1977
        }
1978

1979

1980
        Ok(())
2✔
1981
    }
2✔
1982

1983
    /// remove simple ladder structures (bubbles) caused by 1-base sequencing errors from the graph
1984
    /// 
1985
    /// graph must contain edge mults and be stranded
1986
    /// this function will likely leave tips on the graph, so it is recommended to run
1987
    /// [`DebruijnGraph::remove_tips`] afterwards
1988
    /// 
1989
    /// ladder structure refers to bubbles where one side has been compressed into one node
1990
    /// but the other has low compression, due to differences in coverage and thus data variance,
1991
    /// leading to a ladder-like appearance
1992
    pub fn remove_ladders<DI, P>(&mut self, min_diff_factor: u32, max_avg_low_cov: f32, out_path: Option<P>) -> Result<(), String> 
2✔
1993
    where 
2✔
1994
        SD: SummaryData<DI>,
2✔
1995
        P: AsRef<Path>
2✔
1996
    {
1997
        // interrupt if we dont have edge mults
1998
        if self.get_node(0).data().edge_mults().is_none() { return Err(String::from("no edge mults available")) };
2✔
1999
        if !self.base.stranded { return Err(String::from("must be stranded")) };
2✔
2000

2001
        let mut writer = out_path.map(|path| BufWriter::new(File::create(path).expect("error creating ladder stats file")));
2✔
2002
        if let Some(wtr) = writer.as_mut() {
2✔
2003
            writeln!(wtr, "avg high cov,avg low cov,high truth ratio,low truth ratio").unwrap();
×
2004
        }
2✔
2005

2006
        // iterate over nodes
2007
        for node_id in 0..self.len() {
142✔
2008
            // check if node has outgoing edge(s) with both high and low coverage
2009
            let outs = self.get_node(node_id).data().edge_mults().expect("should have em").right();
142✔
2010

2011
            let Some((out_max_base, &out_max_cov)) = outs.iter().rev().enumerate().filter(|&(_, &c)| c  > 0).max_by(|&(_b1, &c1), &(_b2, c2)| c1.cmp(c2)) else { continue };
568✔
2012
            let smaller_outs = outs.iter().copied().rev().enumerate().filter(|&(_b, c)| (c > 0) & (out_max_cov > c)).collect::<Vec<_>>();
544✔
2013

2014
            // TODO add factor back in, remove writer
2015

2016
            if smaller_outs.is_empty() { continue; }
136✔
2017

2018
            // follow path with highest coverage until target length is reached
2019
            let Some((target_node, avg_high_cov, high_cc)) = self.follow_ladder_path_high(node_id, out_max_base as u8, out_max_cov) else { continue; };
8✔
2020
            
2021
            // check all small outs
2022
            for (s_base, s_cov) in smaller_outs {
5✔
2023
                let Some((target_paths, avg_low_cov, low_cc)) = self.follow_ladder_path_low(node_id, s_base as u8, s_cov) else { continue; };
5✔
2024
                let other_target_node = *target_paths.last().expect("should have at least one element").last().expect("should have at least two elements");
5✔
2025

2026
                if target_node == other_target_node {
5✔
2027
                    // the two paths landed on the same node -> remove all edges in the low coverage path
2028
                    if (s_cov * min_diff_factor <= out_max_cov) & (avg_low_cov <= max_avg_low_cov) {
2✔
2029
                        for path in target_paths.iter() {
7✔
2030
                            if self.remove_path(path.clone(), Dir::Right).is_err() {
7✔
2031
                                warn!("removing ladders: partial path could not be removed, likely cause: loop, edges were already removed. parital path: {:?}", path)
×
2032
                            }
7✔
2033
                        } 
2034
                    }
×
2035
                    
2036
                    if let Some(wtr) = writer.as_mut() {
2✔
2037
                        writeln!(wtr, "{},{},{},{}", avg_high_cov, avg_low_cov, high_cc, low_cc).unwrap();
×
2038
                    }
2✔
2039
                }
3✔
2040
            }
2041
        }
2042

2043
        Ok(())
2✔
2044
    }
2✔
2045

2046
    /// follow the presumably erroneous ladder path with low coverage
2047
    fn follow_ladder_path_low<DI>(&self, start_node_id: usize, start_ext: u8, start_cov: u32) -> Option<(Vec<Vec<usize>>, f32, f32)> 
5✔
2048
    where SD: SummaryData<DI>
5✔
2049
    {
2050
        // state is switched if coverage rises by 20% + 2 (so it's at least 2 more)
2051
        // nodes with higher state are kept connected together
2052
        // tests have shown that increase in coverage does not necessarily indicate 
2053
        // a multiplicity change
2054
        // however, we are trying to avoid false positives - false negatives will 
2055
        // most likely be cut off from the component or would be removed by remove_tips
2056
        // as a next step
2057
        // increasing these values will increase the number of removed edges
2058
        const COV_STATE_FACTOR: f32 = 0.2; 
2059
        const COV_STATE_ADD: f32 = 2.;
2060

2061
        // path, including start and target node
2062
        let mut paths = Vec::new();
5✔
2063
        paths.push(Vec::new());
5✔
2064
        paths[0].push(start_node_id);
5✔
2065

2066
        let target_length = K::k();
5✔
2067
        let mut path_length = 0;
5✔
2068
        let mut n_correct_edges = 0;
5✔
2069

2070
        // get fist next node
2071
        let sequence = self.base.sequences.get(start_node_id);
5✔
2072
        let term_kmer: K = sequence.term_kmer(Dir::Right);
5✔
2073
        let next_kmer = term_kmer.extend(start_ext, Dir::Right);
5✔
2074
        let (mut current_node_id, _, _) = self.find_link(next_kmer, Dir::Right).expect("link should exist"); 
5✔
2075
        paths[0].push(current_node_id);
5✔
2076

2077
        if self.check_edge_truth(start_node_id, current_node_id) {
5✔
2078
            n_correct_edges += 1;
×
2079
        }
5✔
2080

2081
        let mut current_cov = start_cov as f32;
5✔
2082
        let mut sum_path_cov = start_cov as f32;
5✔
2083
        let mut coverage_counter = 1; 
5✔
2084

2085
        // state to track if nodes should be removed or we're suspecting there's another path here
2086
        // only track edges for removal while state is Singular
2087
        // only track one additional path, else too complicated, return None
2088
        // increase state if there is another incoming edge or if the coverage suddenly increases drastically
2089
        // decrease state if there is another outgoing edge ot if the coverage suddenly decreases drastically
2090
        // if state cannot be increased or decreased further, return None
2091
        let mut state = LadderState::Singular;
5✔
2092

2093
        loop {
2094
            // check if current node is "target node", i.e., the path has reached the desired length
2095
            match path_length {
59✔
2096
                len if len == target_length => return Some((paths, (sum_path_cov / coverage_counter as f32), (n_correct_edges as f32 / (path_length + 1) as f32))), // path has target length
59✔
2097
                len if len > target_length => return None, // path has surpassed desired length => invalid
54✔
2098
                _ => () // path has not yet reached desired length, continue
54✔
2099
            }
2100

2101
            // since current node is not target node, add length to path
2102
            let len = self.get_node(current_node_id).len() - (K::k() - 1);
54✔
2103
            path_length += len;
54✔
2104

2105
            // get current node
2106
            let current_node = self.get_node(current_node_id);
54✔
2107
            
2108
            // get edges
2109
            let in_edges = current_node.l_edges();
54✔
2110
            let out_edges = current_node.r_edges();
54✔
2111

2112
            // get coverage
2113
            // edge cov must be available
2114
            let edge_coverages = current_node.data().edge_mults().expect("must have edge mults");
54✔
2115

2116
            // choose next edge by choosing coverage closest to current coverage
2117
            let mut out_ext = None;
54✔
2118
            let mut out_node_id = None;
54✔
2119
            let mut cov_diff = i32::MAX;
54✔
2120
            for (e, id, _, _) in out_edges.iter() {
59✔
2121
                let cov = edge_coverages.edge_mult(*e, Dir::Right) as i32;
59✔
2122
                let new_cov_diff = (current_cov as i32 - cov).abs();
59✔
2123
                if new_cov_diff < cov_diff {
59✔
2124
                    (out_ext, out_node_id, cov_diff) = (Some(*e), Some(*id), new_cov_diff);
59✔
2125
                }
59✔
2126
            }
2127
            let (Some(out_ext), Some(out_node_id)) = (out_ext, out_node_id) else { return None; };
54✔
2128

2129
            // ideally, low path nodes should have one incoming and one outgoing edge
2130
            // if two incoming edges, increase state from singular to double if possible
2131
            match in_edges.len() {
54✔
2132
                1 => (),
49✔
2133
                2 => match state {
5✔
2134
                    LadderState::Singular => state = LadderState::Double,
5✔
2135
                    LadderState::Double => return None // getting too complicated
×
2136
                }
2137
                _ => return None
×
2138
            }
2139

2140
            // if two outgoing edges, decrease state from double to singular if possible
2141
            // if already singular, 
2142
            match out_edges.len() {
54✔
2143
                1 => (),
49✔
2144
                2 => match state {
5✔
2145
                    LadderState::Singular => (), // could return None, but would kill if overlap is only one node long
1✔
2146
                    LadderState::Double => {
4✔
2147
                        state = LadderState::Singular;
4✔
2148
                        paths.push(vec![current_node_id]); // start new path
4✔
2149
                    }
4✔
2150
                }
2151
                _ => return None
×
2152
            }
2153

2154
            // check coverage, in theoretical ladder, coverage should be uniform
2155
            let coverage = edge_coverages.edge_mult(out_ext, Dir::Right) as f32;
54✔
2156
            match state {
54✔
2157
                LadderState::Singular => {
2158
                    // single state: check if next coverage is similar enough to current coverage
2159
                    // if way bigger, increase state, else adapt current coverage
2160
                    if  coverage > current_cov + current_cov * COV_STATE_FACTOR + COV_STATE_ADD {
34✔
2161
                        // higher by too much, change state
2✔
2162
                        state = LadderState::Double;
2✔
2163
                    } else {
32✔
2164
                        // in acceptable frame
32✔
2165
                        current_cov = coverage;
32✔
2166
                    }
32✔
2167
                }
2168
                LadderState::Double => {
2169
                    // double state: if way smaller (back in acceptable frame), decrease state, else dont treat as current coverage
2170
                    if coverage < current_cov + current_cov * COV_STATE_FACTOR + COV_STATE_ADD {
20✔
2171
                        // back in acceptable range
2✔
2172
                        state = LadderState::Singular;
2✔
2173
                        paths.push(vec![current_node_id]); // start new path
2✔
2174
                        current_cov = coverage;
2✔
2175
                    } // else continue on 
18✔
2176
                    // TODO check if better to also interrupt if coverage increases further
2177
                }
2178
            }
2179

2180
            // if state singular try to check if edge is "correct", add extra length of current node to it to account for compression
2181
            match state {
54✔
2182
                LadderState::Singular => {
2183
                    if self.check_edge_truth(current_node_id, out_node_id) {
34✔
2184
                        n_correct_edges += len;
×
2185
                    }
34✔
2186
                }
2187
                LadderState::Double => ()
20✔
2188
            }
2189

2190
            // set current node id to next node to be visited
2191
            current_node_id = out_node_id;
54✔
2192

2193
            // add current node to path, unless state is double
2194
            match state {
54✔
2195
                LadderState::Double => (),
20✔
2196
                LadderState::Singular => {
34✔
2197
                    sum_path_cov += current_cov;
34✔
2198
                    coverage_counter += 1;
34✔
2199
                    let last_path = paths.len() - 1;
34✔
2200
                    paths[last_path].push(current_node_id); // add node to latest path
34✔
2201
                }
34✔
2202
            }
2203
        }
2204
    }
5✔
2205

2206
    /// follow a path of the length 2*k - 1 by choosing the edges with the hightest coverage
2207
    /// requres the graph to have edge mults
2208
    fn follow_ladder_path_high<DI>(&self, start_node_id: usize, start_ext: u8, start_cov: u32) -> Option<(usize, f32, f32)> 
8✔
2209
    where SD: SummaryData<DI>
8✔
2210
    {
2211

2212
        let target_length = K::k();
8✔
2213
        let mut path_length = 0;
8✔
2214
        let mut sum_path_cov = start_cov; 
8✔
2215
        let mut coverage_counter = 1;
8✔
2216
        let mut n_correct_edges = 0; 
8✔
2217

2218
        // get fist next node
2219
        let sequence = self.base.sequences.get(start_node_id);
8✔
2220
        let term_kmer: K = sequence.term_kmer(Dir::Right);
8✔
2221
        let next_kmer = term_kmer.extend(start_ext, Dir::Right);
8✔
2222
        let (mut current_node_id, _, _) = self.find_link(next_kmer, Dir::Right).expect("link should exist"); 
8✔
2223

2224
        if self.check_edge_truth(start_node_id, current_node_id) {
8✔
2225
            n_correct_edges += 1;
4✔
2226
        }
4✔
2227

2228
        loop {
2229
            // check if current node is "target node", i.e., the path has reached the desired length
2230
            match path_length {
68✔
2231
                pl if pl == target_length => return Some((current_node_id, (sum_path_cov as f32 / coverage_counter as f32), (n_correct_edges as f32 / (path_length + 1) as f32))), // path has target length
68✔
2232
                pl if pl > target_length => return None, // path has surpassed desired length => invalid
63✔
2233
                _ => () // path has not yet reached desired length, continue
63✔
2234
            }
2235

2236
            // since current node is not target node, add length to path
2237
            let len = self.get_node(current_node_id).len() - (K::k() - 1);
63✔
2238
            path_length += len;
63✔
2239

2240
            // get current node
2241
            let current_node = self.get_node(current_node_id);
63✔
2242
        
2243
            // find outgoing edge with highest coverage
2244
            let em = current_node.data().edge_mults().expect("must have edge mults").right();
63✔
2245
            let (max_cov_base, &max_cov) = em.iter().rev().enumerate().filter(|&(_, &c)| c > 0).max_by(|&(_b1, &c1), &(_b2, c2)| c1.cmp(c2))?;
252✔
2246

2247
            // get next node id
2248
            let sequence = self.base.sequences.get(current_node_id);
60✔
2249
            let term_kmer: K = sequence.term_kmer(Dir::Right);
60✔
2250
            let next_kmer = term_kmer.extend(max_cov_base as u8, Dir::Right);
60✔
2251
            let (out_node_id, _, _) = self.find_link(next_kmer, Dir::Right).expect("link should exist"); 
60✔
2252

2253
            // try to check if edge is "correct", add extra length of current node to it to account for compression
2254
            if self.check_edge_truth(current_node_id, out_node_id) {
60✔
2255
                n_correct_edges += len;
35✔
2256
            }
35✔
2257

2258
            // set current node id to next node to be visited
2259
            current_node_id = out_node_id;
60✔
2260
            sum_path_cov += max_cov;
60✔
2261
            coverage_counter += 1;
60✔
2262
        }
2263
    }
8✔
2264

2265
    /// remove tips from the graph, reqires the graoh to have edge mults and be stranded
2266
    /// it is recommended to use this function after [`DebruijnGraph::remove_ladders`], since 
2267
    /// the latter will likely leave tips in the graph
2268
    pub fn remove_tips<DI, P>(&mut self, min_diff_factor: u32, max_avg_tip_cov: f32, out_path: Option<P>) -> Result<(), String> 
2✔
2269
    where 
2✔
2270
        SD: SummaryData<DI>,
2✔
2271
        P: AsRef<Path>
2✔
2272
    {
2273
        // interrupt if we dont have edge mults
2274
        if self.get_node(0).data().edge_mults().is_none() { return Err(String::from("no edge mults available")) };
2✔
2275
        if !self.base.stranded { return Err(String::from("must be stranded")) };
2✔
2276

2277
        let mut writer = out_path.map(|path| BufWriter::new(File::create(path).expect("error creating ladder stats file")));
2✔
2278
        if let Some(wtr) = writer.as_mut() {
2✔
2279
            writeln!(wtr, "high cov,avg low cov,max true,tip truth ratio,tip len,dir").unwrap();
×
2280
        }
2✔
2281

2282
        let max_len = K::k();
2✔
2283

2284
        // iterate over nodes
2285
        for node_id in 0..self.len() {
66✔
2286
            // do for both left and right as "outgoing" direction 
2287
            // right for regular outgoing and tips from the read-end
2288
            // left for the reverse -> tips from the read-start
2289
            for dir in [Dir::Left, Dir::Right] {
132✔
2290
                let current_node = self.get_node(node_id);
132✔
2291
                // check if node has outgoing edge(s) with both high and low coverage
2292
                let outs = current_node.data().edge_mults().expect("should have em").single_dir(dir).edge_mults;
132✔
2293

2294
                let Some((out_max_base, &out_max_cov)) = outs.iter().rev().enumerate().filter(|&(_, &c)| c  > 0).max_by(|&(_b1, &c1), &(_b2, c2)| c1.cmp(c2)) else { continue };
528✔
2295
                let smaller_outs = outs.iter().copied().rev().enumerate().filter(|&(_b, c)| (c > 0) & (out_max_cov > c)).collect::<Vec<_>>();
444✔
2296
                
2297
                if smaller_outs.is_empty() { continue; }
111✔
2298

2299
                // try and check if max cov edge is correct
2300
                let max_connection_correct = {
4✔
2301
                    let sequence = self.base.sequences.get(node_id);
4✔
2302
                    let term_kmer: K = sequence.term_kmer(dir);
4✔
2303
                    let next_kmer = term_kmer.extend(out_max_base as u8, dir);
4✔
2304
                    let (next_node_id, _, _) = self.find_link(next_kmer, dir).expect("missing link");
4✔
2305

2306
                    self.check_edge_truth(node_id, next_node_id)
4✔
2307
                }; // if current node is incorrect, connection is incorrect in any case
2308
                
2309
                // check all small outs
2310
                for (s_base, s_cov) in smaller_outs {
4✔
2311
                    // path has to be short + unambiguous + consistently low coverage
2312
                    let Some((tip_path, avg_tip_coverage, truth_ratio, tip_len)) = self.follow_tip_path(node_id, s_base as u8, s_cov, dir) else { continue; };
4✔
2313

2314
                    // remove path if coverage below threshold
2315
                    if (s_cov * min_diff_factor <= out_max_cov) & (avg_tip_coverage <= max_avg_tip_cov) & (tip_len <= max_len) {
4✔
2316
                        self.remove_path(tip_path, dir)?;
4✔
2317
                    }
×
2318
                        
2319
                    if let Some(wtr) = writer.as_mut() {
4✔
2320
                        writeln!(wtr, "{},{},{},{},{},{:?}", out_max_cov, avg_tip_coverage, max_connection_correct, truth_ratio, tip_len, dir).unwrap();
×
2321
                    }
4✔
2322
                }
2323
            }
2324
        }
2325
        
2326
        Ok(())
2✔
2327
    }
2✔
2328

2329
    /// follow a tip path, returns path, average coverage and ration of correct to incorrect connections
2330
    fn follow_tip_path<DI>(&self, start_node_id: usize, start_ext: u8, start_cov: u32, dir: Dir) -> Option<(Vec<usize>, f32, f32, usize)> 
4✔
2331
    where 
4✔
2332
        SD: SummaryData<DI>
4✔
2333
    {
2334
        const COV_MARGIN: f32 = 0.3;
2335
        const COV_ADD_MARGIN: f32 = 4.;
2336

2337
        // path, including start and target node
2338
        let mut path = Vec::new();
4✔
2339
        path.push(start_node_id);
4✔
2340

2341
        let mut path_length = 0;
4✔
2342
        let mut n_correct_edges = 0;
4✔
2343

2344
        // get fist next node
2345
        let sequence = self.base.sequences.get(start_node_id);
4✔
2346
        let term_kmer: K = sequence.term_kmer(dir);
4✔
2347
        let next_kmer = term_kmer.extend(start_ext, dir);
4✔
2348
        let (mut current_node_id, _, _) = self.find_link(next_kmer, dir).expect("link should exist"); 
4✔
2349
        path.push(current_node_id);
4✔
2350

2351
        if self.check_edge_truth(start_node_id, current_node_id) {
4✔
2352
            n_correct_edges += 1;
×
2353
        }
4✔
2354

2355
        let mut current_cov = start_cov as f32;
4✔
2356
        let mut sum_path_cov = start_cov as f32;
4✔
2357
        let mut coverage_counter = 1;
4✔
2358

2359
        loop {     
2360
            // get current node
2361
            let current_node = self.get_node(current_node_id);
13✔
2362

2363
            // add length to path
2364
            let len = current_node.len() - (K::k() - 1);
13✔
2365
            path_length += len;
13✔
2366

2367
            // low path nodes should have only one incoming and one outgoing edge
2368
            let in_edges = current_node.edges(dir.flip());
13✔
2369
            if in_edges.len() != 1 { return None; }
13✔
2370

2371
            let out_edges = current_node.edges(dir);
13✔
2372
            match out_edges.len() {
13✔
2373
                oe if oe > 1 => return None,
13✔
2374
                0 => return Some((path, (sum_path_cov / coverage_counter as f32), (n_correct_edges as f32 / path_length as f32), path_length)),
4✔
2375
                _ => ()
9✔
2376
            }
2377

2378
            // if edge cov avalable, check for similarity
2379
            let (out_ext, out_node_id, _, _) = out_edges[0];
9✔
2380
            if let Some(em) = current_node.data().edge_mults() {
9✔
2381
                let coverage = em.edge_mult(out_ext, dir) as f32;
9✔
2382
                if coverage < current_cov - current_cov * COV_MARGIN - COV_ADD_MARGIN // extra two for small values
9✔
2383
                    || coverage > current_cov + current_cov * COV_MARGIN + COV_ADD_MARGIN {
9✔
2384
                    return None;
×
2385
                } else {
9✔
2386
                    current_cov = coverage;
9✔
2387
                }
9✔
2388
            }
×
2389

2390
            // try to check if edge is "correct", add extra length of current node to it to account for compression
2391
            if self.check_edge_truth(current_node_id, out_node_id) {
9✔
2392
                n_correct_edges += len;
×
2393
            }
9✔
2394

2395
            // set current node id to next node to be visited
2396
            current_node_id = out_node_id;
9✔
2397
            sum_path_cov += current_cov;
9✔
2398
            coverage_counter += 1;
9✔
2399
            // add current node to path
2400
            path.push(current_node_id);
9✔
2401

2402
        }
2403
    }
4✔
2404

2405
    /// remove the edges (not the nodes) of a path in the graph
2406
    fn remove_path<DI>(&mut self, path: Vec<usize>, base_dir: Dir) -> Result<(), String> 
48✔
2407
    where SD: SummaryData<DI>
48✔
2408
    {
2409
        let mut path_iter = path.clone().into_iter();
48✔
2410
        let Some(mut current_node_id) = path_iter.next() else { return Ok(()); };
48✔
2411

2412
        loop {
2413
            // get next node id
2414
            let Some(next_node_id) = path_iter.next() else { return Ok(()); }; // whole path has been covered
204✔
2415
            // remove ext to the right of current node
2416
            let Some((out_base, _, _, _)) = self.get_node(current_node_id).edges(base_dir).iter().find(|&(_, id, _, _)| *id == next_node_id).copied()
181✔
2417
                else { return Err(format!(
×
2418
"no edge to remove (base dir)
×
2419
path: {:?}
×
2420
dir: {:?}
×
2421
node1:
×
2422
    id: {current_node_id}
×
2423
    left e: {:?}
×
2424
    right e: {:?}
×
2425
    exts: {:?}
×
2426
    data: {:?}
×
2427
node2:
×
2428
    id: {next_node_id}
×
2429
    left e: {:?}
×
2430
    right e: {:?}
×
2431
    exts: {:?}
×
2432
    data: {:?}",
×
2433
                path,
×
2434
                base_dir,
×
2435
                self.get_node(current_node_id).l_edges(),
×
2436
                self.get_node(current_node_id).r_edges(),
×
2437
                self.get_node(current_node_id).exts(),
×
2438
                self.get_node(current_node_id).data(),
×
2439
                self.get_node(next_node_id).l_edges(),
×
2440
                self.get_node(next_node_id).r_edges(),
×
2441
                self.get_node(next_node_id).exts(),
×
2442
                self.get_node(next_node_id).data(),
×
2443
                ))
×
2444
            };
2445

2446
            // remove ext 
2447
            self.base.exts[current_node_id] = self.base.exts[current_node_id].remove(base_dir, out_base);
156✔
2448
        
2449

2450
            // remove ext to the left of the next node
2451
            let Some((in_base, _, _, _)) = self.get_node(next_node_id).edges(base_dir.flip()).iter().find(|&(_, id, _, _)| *id == current_node_id).copied() 
165✔
2452
                else { return Err( format!(
×
2453
"no edge to remove (other dir)
×
2454
path: {:?}
×
2455
dir: {:?}
×
2456
node1:
×
2457
    id: {current_node_id}
×
2458
    left e: {:?}
×
2459
    right e: {:?}
×
2460
    exts: {:?}
×
2461
    data: {:?}
×
2462
node2:
×
2463
    id: {next_node_id}
×
2464
    left e: {:?}
×
2465
    right e: {:?}
×
2466
    exts: {:?}
×
2467
    data: {:?}",
×
2468
                path,
×
2469
                base_dir,
×
2470
                self.get_node(current_node_id).l_edges(),
×
2471
                self.get_node(current_node_id).r_edges(),
×
2472
                self.get_node(current_node_id).exts(),
×
2473
                self.get_node(current_node_id).data(),
×
2474
                self.get_node(next_node_id).l_edges(),
×
2475
                self.get_node(next_node_id).r_edges(),
×
2476
                self.get_node(next_node_id).exts(),
×
2477
                self.get_node(next_node_id).data(),
×
2478
                ))
×
2479
            };
2480

2481
            // remove ext
2482
            self.base.exts[next_node_id] = self.base.exts[next_node_id].remove(base_dir.flip(), in_base); 
156✔
2483

2484
            // use new exts to fix edge mults
2485
            self.base.data[current_node_id].fix_edge_mults(self.base.exts[current_node_id]);
156✔
2486
            self.base.data[next_node_id].fix_edge_mults(self.base.exts[next_node_id]);
156✔
2487

2488

2489
            current_node_id = next_node_id;
156✔
2490
        }
2491
    }
48✔
2492

2493
    /// use mapped ids to check if the nodes of an edge were mapped to the same id
2494
    /// returns false if mapped ids are not available
2495
    pub fn check_edge_truth<DI>(&self, node_id_1: usize, node_id_2: usize) -> bool
124✔
2496
    where 
124✔
2497
        SD: SummaryData<DI>
124✔
2498
    {
2499
        let mapped_ids_1 = self.get_node(node_id_1).data().mapped_ids();
124✔
2500
        let mapped_ids_2 = self.get_node(node_id_2).data().mapped_ids();
124✔
2501

2502
        if let (Some(mids1), Some(mids2)) = (mapped_ids_1, mapped_ids_2) {
124✔
2503
            let hashed_mi2 = mids2.iter().copied().collect::<HashSet<_>>();
124✔
2504
            for mid in mids1 {
124✔
2505
                if hashed_mi2.contains(mid) {
51✔
2506
                    return true;
43✔
2507
                }
8✔
2508
            }
2509
        }
×
2510

2511
        false
81✔
2512
    }
124✔
2513
}
2514

2515

2516
#[derive(Debug, Eq, PartialEq)]
2517
enum Status {
2518
    Active,
2519
    End,
2520
    Cycle,
2521
}
2522

2523
#[derive(Debug)]
2524
struct State {
2525
    path: SmallVec8<(u32, Dir)>,
2526
    score: f32,
2527
    status: Status,
2528
}
2529

2530
impl State {}
2531

2532
#[derive(Debug, PartialEq, Eq)]
2533
pub enum LadderState {
2534
    Singular,
2535
    Double
2536
}
2537

2538
/// Iterator over nodes in a `DeBruijnGraph`
2539
pub struct NodeIter<'a, K: Kmer + 'a, D: Debug + 'a> {
2540
    graph: &'a DebruijnGraph<K, D>,
2541
    node_id: usize,
2542
}
2543

2544
impl<'a, K: Kmer + 'a, D: Debug + 'a> Iterator for NodeIter<'a, K, D> {
2545
    type Item = Node<'a, K, D>;
2546

2547
    fn next(&mut self) -> Option<Node<'a, K, D>> {
23,127✔
2548
        if self.node_id < self.graph.len() {
23,127✔
2549
            let node = self.graph.get_node(self.node_id);
22,997✔
2550
            self.node_id += 1;
22,997✔
2551
            Some(node)
22,997✔
2552
        } else {
2553
            None
130✔
2554
        }
2555
    }
23,127✔
2556
}
2557

2558
impl<'a, K: Kmer + 'a, D: Debug + 'a> IntoIterator for &'a DebruijnGraph<K, D> {
2559
    type Item = NodeKmer<'a, K, D>;
2560
    type IntoIter = NodeIntoIter<'a, K, D>;
2561

2562
    fn into_iter(self) -> Self::IntoIter {
2,229✔
2563
        NodeIntoIter {
2,229✔
2564
            graph: self,
2,229✔
2565
            node_id: 0,
2,229✔
2566
        }
2,229✔
2567
    }
2,229✔
2568
}
2569

2570
/// Iterator over nodes in a `DeBruijnGraph`
2571
pub struct NodeIntoIter<'a, K: Kmer + 'a, D: Debug + 'a> {
2572
    graph: &'a DebruijnGraph<K, D>,
2573
    node_id: usize,
2574
}
2575

2576
impl<'a, K: Kmer + 'a, D: Debug + 'a> Iterator for NodeIntoIter<'a, K, D> {
2577
    type Item = NodeKmer<'a, K, D>;
2578

2579
    fn next(&mut self) -> Option<Self::Item> {
293,934✔
2580
        if self.node_id < self.graph.len() {
293,934✔
2581
            let node_id = self.node_id;
293,934✔
2582
            let node = self.graph.get_node(node_id);
293,934✔
2583
            let node_seq = node.sequence();
293,934✔
2584

2585
            self.node_id += 1;
293,934✔
2586
            Some(NodeKmer {
293,934✔
2587
                node_id,
293,934✔
2588
                node_seq_slice: node_seq,
293,934✔
2589
                phantom_d: PhantomData,
293,934✔
2590
                phantom_k: PhantomData,
293,934✔
2591
            })
293,934✔
2592
        } else {
2593
            None
×
2594
        }
2595
    }
293,934✔
2596
}
2597

2598
/// A `DebruijnGraph` node with a reference to the sequence of the node.
2599
#[derive(Clone)]
2600
pub struct NodeKmer<'a, K: Kmer + 'a, D: Debug + 'a> {
2601
    pub node_id: usize,
2602
    node_seq_slice: DnaStringSlice<'a>,
2603
    phantom_k: PhantomData<K>,
2604
    phantom_d: PhantomData<D>,
2605
}
2606

2607
/// An iterator over the kmers in a `DeBruijn graph node`
2608
pub struct NodeKmerIter<'a, K: Kmer + 'a, D: Debug + 'a> {
2609
    kmer_id: usize,
2610
    kmer: K,
2611
    num_kmers: usize,
2612
    node_seq_slice: DnaStringSlice<'a>,
2613
    phantom_k: PhantomData<K>,
2614
    phantom_d: PhantomData<D>,
2615
}
2616

2617
impl<'a, K: Kmer + 'a, D: Debug + 'a> IntoIterator for NodeKmer<'a, K, D> {
2618
    type Item = K;
2619
    type IntoIter = NodeKmerIter<'a, K, D>;
2620

2621
    fn into_iter(self) -> Self::IntoIter {
587,868✔
2622
        let num_kmers = self.node_seq_slice.len() - K::k() + 1;
587,868✔
2623

2624
        let kmer = if num_kmers > 0 {
587,868✔
2625
            self.node_seq_slice.get_kmer::<K>(0)
587,868✔
2626
        } else {
2627
            K::empty()
×
2628
        };
2629

2630
        NodeKmerIter {
587,868✔
2631
            kmer_id: 0,
587,868✔
2632
            kmer,
587,868✔
2633
            num_kmers,
587,868✔
2634
            node_seq_slice: self.node_seq_slice,
587,868✔
2635
            phantom_k: PhantomData,
587,868✔
2636
            phantom_d: PhantomData,
587,868✔
2637
        }
587,868✔
2638
    }
587,868✔
2639
}
2640

2641
impl<'a, K: Kmer + 'a, D: Debug + 'a> Iterator for NodeKmerIter<'a, K, D> {
2642
    type Item = K;
2643

2644
    fn next(&mut self) -> Option<Self::Item> {
3,804,348✔
2645
        if self.num_kmers == self.kmer_id {
3,804,348✔
2646
            None
×
2647
        } else {
2648
            let current_kmer = self.kmer;
3,804,348✔
2649

2650
            self.kmer_id += 1;
3,804,348✔
2651
            if self.kmer_id < self.num_kmers {
3,804,348✔
2652
                let next_base = self.node_seq_slice.get(self.kmer_id + K::k() - 1);
3,729,106✔
2653
                let new_kmer = self.kmer.extend_right(next_base);
3,729,106✔
2654
                self.kmer = new_kmer;
3,729,106✔
2655
            }
3,729,106✔
2656

2657
            Some(current_kmer)
3,804,348✔
2658
        }
2659
    }
3,804,348✔
2660

2661
    fn size_hint(&self) -> (usize, Option<usize>) {
293,934✔
2662
        (self.num_kmers, Some(self.num_kmers))
293,934✔
2663
    }
293,934✔
2664

2665
    /// Provide a 'fast-forward' capability for this iterator
2666
    /// MPHF will use this to reduce the number of kmers that
2667
    /// need to be produced.
2668
    fn nth(&mut self, n: usize) -> Option<Self::Item> {
2,741,452✔
2669
        if n <= 4 {
2,741,452✔
2670
            // for small skips forward, shift one base at a time
2671
            for _ in 0..n {
2,466,852✔
2672
                self.next();
1,062,896✔
2673
            }
1,062,896✔
2674
        } else {
274,600✔
2675
            self.kmer_id += n;
274,600✔
2676
            self.kmer = self.node_seq_slice.get_kmer::<K>(self.kmer_id);
274,600✔
2677
        }
274,600✔
2678

2679
        self.next()
2,741,452✔
2680
    }
2,741,452✔
2681
}
2682

2683
/// Marker signifying that NodeKmerIter has a known size.
2684
impl<'a, K: Kmer + 'a, D: Debug + 'a> ExactSizeIterator for NodeKmerIter<'a, K, D> {}
2685

2686
/// Unbranched sequence in the DeBruijn graph
2687
pub struct Node<'a, K: Kmer + 'a, D: 'a> {
2688
    pub node_id: usize,
2689
    pub graph: &'a DebruijnGraph<K, D>,
2690
}
2691

2692
impl<'a, K: Kmer, D: Debug> Node<'a, K, D> {
2693
    /// Length of the sequence of this node
2694
    pub fn len(&self) -> usize {
1,563,318✔
2695
        self.graph.base.sequences.get(self.node_id).len()
1,563,318✔
2696
    }
1,563,318✔
2697

2698
    pub fn is_empty(&self) -> bool {
×
2699
        self.graph.base.sequences.get(self.node_id).is_empty()
×
2700
    }
×
2701

2702
    /// Sequence of the node
2703
    pub fn sequence(&self) -> DnaStringSlice<'a> {
5,768,080✔
2704
        self.graph.base.sequences.get(self.node_id)
5,768,080✔
2705
    }
5,768,080✔
2706

2707
    /// Reference to auxiliarly data associated with the node
2708
    pub fn data(&self) -> &'a D {
3,877,406✔
2709
        &self.graph.base.data[self.node_id]
3,877,406✔
2710
    }
3,877,406✔
2711

2712
    /// Extension bases from this node
2713
    pub fn exts(&self) -> Exts {
3,901,438✔
2714
        self.graph.base.exts[self.node_id]
3,901,438✔
2715
    }
3,901,438✔
2716

2717
    /// Edges leaving the left side of the node in the format
2718
    /// (base, target_node id, incoming side of target node, whether target node is flipped)
2719
    pub fn l_edges(&self) -> SmallVec4<(u8, usize, Dir, bool)> {
3,410✔
2720
        self.graph.find_edges(self.node_id, Dir::Left)
3,410✔
2721
    }
3,410✔
2722

2723
    /// Edges leaving the right side of the node in the format
2724
    /// (base, target_node id, incoming side of target node, whether target node is flipped)
2725
    pub fn r_edges(&self) -> SmallVec4<(u8, usize, Dir, bool)> {
3,411✔
2726
        self.graph.find_edges(self.node_id, Dir::Right)
3,411✔
2727
    }
3,411✔
2728

2729
    /// Edges leaving the 'dir' side of the node in the format
2730
    /// (base, target_node id, incoming side of target node, whether target node is flipped)
2731
    pub fn edges(&self, dir: Dir) -> SmallVec4<(u8, usize, Dir, bool)> {
514,928✔
2732
        self.graph.find_edges(self.node_id, dir)
514,928✔
2733
    }
514,928✔
2734

2735
    fn to_json<F: Fn(&D) -> Value>(&self, func: &F, f: &mut dyn Write) {
×
2736
        write!(
×
2737
            f,
×
2738
            "{{\"id\":\"{}\",\"L\":{},\"D\":{},\"Se\":\"{:?}\"}}",
2739
            self.node_id,
2740
            self.sequence().len(),
×
2741
            (func)(self.data()),
×
2742
            self.sequence(),
×
2743
        )
2744
        .unwrap();
×
2745
    }
×
2746

2747
    fn edges_to_json(&self, f: &mut dyn Write) -> bool {
×
2748
        let mut wrote = false;
×
2749
        let edges = self.r_edges();
×
2750
        for (idx, &(_, id, incoming_dir, _)) in edges.iter().enumerate() {
×
2751
            write!(
×
2752
                f,
×
2753
                "{{\"source\":\"{}\",\"target\":\"{}\",\"D\":\"{}\"}}",
2754
                self.node_id,
2755
                id,
2756
                match incoming_dir {
×
2757
                    Dir::Left => "L",
×
2758
                    Dir::Right => "R",
×
2759
                }
2760
            )
2761
            .unwrap();
×
2762

2763
            if idx < edges.len() - 1 {
×
2764
                write!(f, ",").unwrap();
×
2765
            }
×
2766

2767
            wrote = true;
×
2768
        }
2769
        wrote
×
2770
    }
×
2771
}
2772

2773
// TODO make generic instead of u8 (u8 is sufficient for dbg)
2774
impl<K: Kmer, SD: Debug> Node<'_, K, SD>  {
2775
    /// get default format for dot edges based on node data
2776
    pub fn edge_dot_default<DI>(&self, colors: &Colors<SD, DI>, base: u8, incoming_dir: Dir, flipped: bool) -> String 
616✔
2777
    where SD: SummaryData<DI>
616✔
2778
    {
2779
        // set color based on dir
2780
        let color = match incoming_dir {
616✔
2781
            Dir::Left => "blue",
308✔
2782
            Dir::Right => "red"
308✔
2783
        };
2784
        
2785
        if let Some(em) = self.data().edge_mults() {
616✔
2786
            
2787
            let dir = if flipped { 
616✔
2788
                incoming_dir 
2✔
2789
            } else {
2790
                incoming_dir.flip()
614✔
2791
            };
2792

2793
            // set penwidth based on count
2794
            let count = em.edge_mult(base, dir);
616✔
2795
            let penwidth = colors.edge_width(count);
616✔
2796

2797
            format!("[color={color}, penwidth={penwidth}, label=\"{}: {count}\", weight={count}]", bits_to_base(base))
616✔
2798
        } else {
2799
            format!("[color={color}, penwidth={}]", colors.edge_width(1)) // since there should be no edge mults, this will return default value
×
2800
        }
2801
    }
616✔
2802

2803
    /// get default format for dot nodes, based on node data
2804
    pub fn node_dot_default<DI>(&self, colors: &Colors<SD, DI>, config: &SummaryConfig, translator: &Translator, outline: bool, translate_id_groups: bool) -> String
331✔
2805
    where SD: SummaryData<DI>
331✔
2806
    {
2807
        // set color based on labels/fold change/p-value
2808
        let color = colors.node_color_dot(self.data(), config, outline);
331✔
2809
        let translate_id_groups = if translate_id_groups { colors.id_group_ids() } else { None };
331✔
2810

2811
        let data_info = self.data().print(translator, config, translate_id_groups);
331✔
2812
        const MIN_TEXT_WIDTH: usize = 40;
2813
        let wrap = if self.len() > MIN_TEXT_WIDTH { self.len() } else { MIN_TEXT_WIDTH };
331✔
2814

2815
        let label = textwrap::fill(&format!("id: {}, len: {}, exts: {:?}, seq: {}\n{}", 
331✔
2816
            self.node_id,
331✔
2817
            self.len(),
331✔
2818
            self.exts(),
331✔
2819
            self.sequence(),
331✔
2820
            data_info
331✔
2821
        ), wrap);
331✔
2822

2823
        format!("[{color}, label=\"{label}\"]")
331✔
2824
    }
331✔
2825

2826
    // get the default properties for json edges
2827
    pub fn edge_json_default<DI>(&self, target_node_id: usize, base: u8, incoming_dir: Dir, flipped: bool) -> String 
106✔
2828
    where SD: SummaryData<DI>
106✔
2829
    {
2830
        // set color based on dir
2831
        let dir = match incoming_dir {
106✔
2832
            Dir::Right => 0,
1✔
2833
            Dir::Left => 1
105✔
2834
        };
2835

2836
        // get base in other dir
2837
        let target_node = self.graph.get_node(target_node_id);
106✔
2838
        let nb_base = if self.graph.base.stranded {
106✔
2839
            // only look at left edges of target node
2840
            let Some(nb_base) = target_node.l_edges().iter().filter_map(|(b, id, _in_dir, _flip)|
106✔
2841
                if *id == self.node_id {
110✔
2842
                    Some(*b)
106✔
2843
                } else { None }
110✔
2844
            ).next() else { panic!("missing neighbor") };
106✔
2845
            nb_base
106✔
2846
        } else {
2847
            // look at edges in either direction
NEW
2848
            let Some(nb_base) = target_node.r_edges().iter().chain(target_node.l_edges().iter()).filter_map(|(b, id, _in_dir, _flip)|
×
NEW
2849
                if *id == self.node_id {
×
NEW
2850
                    Some(*b)
×
NEW
2851
                } else { None }
×
NEW
2852
            ).next() else { panic!("missing neighbor") };
×
NEW
2853
            nb_base
×
2854
        };
2855
        
2856
        let value = if let Some(em) = self.data().edge_mults() {
106✔
2857
            
2858
            let dir = if flipped { 
106✔
2859
                incoming_dir 
1✔
2860
            } else {
2861
                incoming_dir.flip()
105✔
2862
            };
2863

2864
            let count = em.edge_mult(base, dir);
106✔
2865

2866
            format!(", \"strength\": {count}")
106✔
2867
        } else {
NEW
2868
            String::from("")
×
2869
        };
2870

2871
        format!("\"source\": {}, \"target\": {target_node_id}, \"source_b\": \"{}\", \"target_b\": \"{}\", \"dir\": {dir}{value}",
106✔
2872
            self.node_id,
2873
            bits_to_base(base),
106✔
2874
            bits_to_base(nb_base),
106✔
2875
        )
2876
    }
106✔
2877

2878
    /// get default properties for json nodes, based on node data
2879
    pub fn node_json_default<DI>(&self, colors: &Colors<SD, DI>, config: &SummaryConfig, translator: &Translator, translate_id_groups: bool) -> String
111✔
2880
    where SD: SummaryData<DI>
111✔
2881
    {
2882
        // set hue based on node data
2883
        let hue = colors.hue_json(self.data(), config);
111✔
2884
        let translate_id_groups = if translate_id_groups { colors.id_group_ids() } else { None };
111✔
2885

2886
        let data_info = self.data().print_json(translator, config, translate_id_groups);
111✔
2887

2888
        format!("\"id\": {}, \"len\": {}, \"seq\": \"{}\", \"hue\": {hue}, {data_info}",
111✔
2889
            self.node_id,
2890
            self.len(),
111✔
2891
            self.sequence(),
111✔
2892
        )
2893
    }
111✔
2894
}
2895

2896
impl<K: Kmer, D> fmt::Debug for Node<'_, K, D>
2897
where
2898
    D: Debug,
2899
{
2900
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
146✔
2901
        write!(
146✔
2902
            f,
146✔
2903
            "Node {{ id:{}, Exts: {:?}, L:{:?} R:{:?}, Seq: {:?}, Data: {:?} }}",
2904
            self.node_id,
2905
            self.exts(),
146✔
2906
            self.l_edges(),
146✔
2907
            self.r_edges(),
146✔
2908
            self.sequence().len(),
146✔
2909
            self.data()
146✔
2910
        )
2911
    }
146✔
2912
}
2913

2914
pub struct IterComponents<'a, K: Kmer, D> {
2915
    graph: &'a DebruijnGraph<K, D>,
2916
    visited: Vec<bool>,
2917
    pos: usize,
2918
}
2919

2920
impl<K: Kmer, D: Debug> Iterator for IterComponents<'_, K, D> {
2921
    type Item = Vec<usize>;
2922
    fn next(&mut self) -> Option<Self::Item> {
24✔
2923
        while self.pos < self.graph.len() {
656✔
2924
            if !self.visited[self.pos] {
649✔
2925
                let comp = self.graph.component_i(&mut self.visited, self.pos);
17✔
2926
                self.pos += 1;
17✔
2927
                return Some(comp)
17✔
2928
            } else {
632✔
2929
                self.pos += 1;
632✔
2930
            }
632✔
2931
        }
2932
        assert!(self.visited.iter().map(|x| *x as usize).sum::<usize>() == self.graph.len());
649✔
2933
        None
7✔
2934
    }
24✔
2935
    
2936
}
2937

2938
pub struct PathCompIter<'a, K: Kmer, D: Debug, F, F2> 
2939
where 
2940
F: Fn(&D) -> f32,
2941
F2: Fn(&D) -> bool
2942
{
2943
    graph: &'a DebruijnGraph<K, D>,
2944
    component_iterator: IterComponents<'a, K, D>,
2945
    graph_pos: usize,
2946
    score: F,
2947
    solid_path: F2,
2948
}
2949

2950
/// returns the component and the "best" path in the component
2951
impl<K: Kmer, D: Debug, F, F2> Iterator for PathCompIter<'_, K, D, F, F2> 
2952
where 
2953
F: Fn(&D) -> f32,
2954
F2: Fn(&D) -> bool
2955
{
2956
    type Item = (Vec<usize>, VecDeque<(usize, Dir)>,);
2957
    fn next(&mut self) -> Option<Self::Item> {
10✔
2958
        match self.component_iterator.next() {
10✔
2959
            Some(component) => {
7✔
2960
                let current_comp = component;
7✔
2961
                
2962
    
2963
                let mut best_node = current_comp[0];
7✔
2964
                let mut best_score = f32::MIN;
7✔
2965
                for c in current_comp.iter() {
373✔
2966
                    let node = self.graph.get_node(*c);
373✔
2967
                    let node_score = (self.score)(node.data());
373✔
2968
    
2969
                    if node_score > best_score {
373✔
2970
                        best_node = *c;
13✔
2971
                        best_score = node_score;
13✔
2972
                    }
360✔
2973
                }
2974
    
2975
                let oscore = |state| match state {
670✔
2976
                    None => 0.0,
331✔
2977
                    Some((id, _)) => (self.score)(self.graph.get_node(id).data()),
339✔
2978
                };
670✔
2979
    
2980
                let osolid_path = |state| match state {
335✔
2981
                    None => false,
×
2982
                    Some((id, _)) => (self.solid_path)(self.graph.get_node(id).data()),
335✔
2983
                };
335✔
2984
    
2985
                // Now expand in each direction, greedily taking the best path. Stop if we hit a node we've
2986
                // already put into the path
2987
                let mut used_nodes = HashSet::new();
7✔
2988
                let mut path = VecDeque::new();
7✔
2989
    
2990
                // Start w/ initial state
2991
                used_nodes.insert(best_node);
7✔
2992
                path.push_front((best_node, Dir::Left));
7✔
2993
    
2994
                for init in [(best_node, Dir::Left, false), (best_node, Dir::Right, true)].iter() {
14✔
2995
                    let &(start_node, dir, do_flip) = init;
14✔
2996
                    let mut current = (start_node, dir);
14✔
2997
    
2998
                    loop {
2999
                        let mut next = None;
345✔
3000
                        let (cur_id, incoming_dir) = current;
345✔
3001
                        let node = self.graph.get_node(cur_id);
345✔
3002
                        let edges = node.edges(incoming_dir.flip());
345✔
3003
    
3004
                        let mut solid_paths = 0;
345✔
3005
                        for (_, id, dir, _) in edges {
345✔
3006
                            let cand = Some((id, dir));
335✔
3007
                            if osolid_path(cand) {
335✔
3008
                                solid_paths += 1;
335✔
3009

3010
                                // second if clause is outside of first in original code (see max_path) 
3011
                                // but would basically ignore path validity.
3012
                                if oscore(cand) > oscore(next) {
335✔
3013
                                    next = cand;
332✔
3014
                                }
332✔
3015
                            }
×
3016
                        }
3017
                        
3018
                        // break if multiple solid paths are available
3019
                        /* if solid_paths > 1 {
3020
                            break;
3021
                        } */
3022
    
3023
                        match next {
331✔
3024
                            Some((next_id, next_incoming)) if !used_nodes.contains(&next_id) => {
331✔
3025
                                if do_flip {
331✔
3026
                                    path.push_front((next_id, next_incoming.flip()));
180✔
3027
                                } else {
180✔
3028
                                    path.push_back((next_id, next_incoming));
151✔
3029
                                }
151✔
3030
    
3031
                                used_nodes.insert(next_id);
331✔
3032
                                current = (next_id, next_incoming);
331✔
3033
                            }
3034
                            _ => break,
14✔
3035
                        }
3036
                    }
3037
                }
3038
                
3039
                
3040
                Some((current_comp, path))
7✔
3041
            }, 
3042
            None => {
3043
                // should technically not need graph_pos after this 
3044
                self.graph_pos += 1;
3✔
3045
                None
3✔
3046
            }
3047
        }
3048
    }
10✔
3049
}
3050

3051

3052
/// iterator over the edges of the de bruijn graph
3053
pub struct EdgeIter<'a, K: Kmer, D: Debug> {
3054
    graph: &'a DebruijnGraph<K, D>,
3055
    visited_edges: HashSet<(usize, usize)>,
3056
    current_node: usize,
3057
    current_dir: Dir,
3058
    node_edge_iter: smallvec::IntoIter<[(u8, usize, Dir, bool); 4]>
3059
}
3060

3061
impl<K: Kmer, D: Debug> EdgeIter<'_, K, D> {
3062
    pub fn new(graph: &DebruijnGraph<K, D>) -> EdgeIter<'_, K, D>{
17✔
3063
        let node_edge_iter = graph.get_node(0).l_edges().into_iter();
17✔
3064

3065
        EdgeIter { 
17✔
3066
            graph, 
17✔
3067
            visited_edges: HashSet::new(), 
17✔
3068
            current_node: 0, 
17✔
3069
            current_dir: Dir::Left, 
17✔
3070
            node_edge_iter
17✔
3071
        }
17✔
3072
    }
17✔
3073
}
3074

3075
impl<K: Kmer, D: Debug> Iterator for EdgeIter<'_, K, D> {
3076
    type Item = (usize, Dir, u8, usize); // node, direction leaving node, base, target node
3077

3078
    fn next(&mut self) -> Option<Self::Item> {
1,499✔
3079
        loop {
3080
            if let Some((base, nb_node_id, _, _)) = self.node_edge_iter.next() {
6,180✔
3081
                let edge = if self.current_node > nb_node_id { (nb_node_id, self.current_node) } else { (self.current_node, nb_node_id) };
2,964✔
3082

3083
                if self.visited_edges.insert(edge) { return Some((self.current_node, self.current_dir, base, nb_node_id)); } // else simply skip and move on
2,964✔
3084

3085
            } else {
3086
                match self.current_dir {
3,216✔
3087
                    Dir::Left => {
1,608✔
3088
                        // no left edges, switch to right edges
1,608✔
3089
                        self.current_dir = Dir::Right;
1,608✔
3090
                        self.node_edge_iter = self.graph.get_node(self.current_node).r_edges().into_iter();
1,608✔
3091
                        
1,608✔
3092
                    }
1,608✔
3093
                    Dir::Right => {
3094
                        // no right edges, switch to next node left edges
3095
                        self.current_node += 1;
1,608✔
3096

3097
                        // quit if end of graph is reached
3098
                        if self.current_node == self.graph.len() { return None }
1,608✔
3099

3100
                        self.current_dir = Dir::Left;
1,591✔
3101
                        self.node_edge_iter = self.graph.get_node(self.current_node).l_edges().into_iter();
1,591✔
3102
                    }
3103
                }
3104
            }
3105
            
3106
        }
3107
    }
1,499✔
3108
}
3109

3110
#[cfg(test)]
3111
mod test {
3112
    use std::fs::remove_file;
3113

3114
    use crate::{BaseQuality, Exts, build_test_graph, colors::Colors, compression::{CheckCompress, ScmapCompress, compress_kmers_with_hash, uncompressed_graph}, dna_string::DnaString, filter::filter_kmers, kmer::{Kmer6, Kmer16, Kmer22}, reads::{Reads, ReadsPaired}, serde::SerKmers, summarizer::{IDMapEMData, IDMapEMQualityData, IDTag, SampleInfo, SummaryConfig, TagsCountsData, TagsCountsSumData, Translator}, test::random_dna};
3115

3116
    use crate::{summarizer::SummaryData, Dir};
3117

3118

3119
    #[test]
3120
    #[cfg(not(feature = "sample128"))]
3121
    fn test_components() {
3122
        use crate::{kmer::Kmer16, test::build_test_graph};
3123

3124
        let (_, _, ser_graph) = build_test_graph::<Kmer16, TagsCountsSumData, _>();
3125
        let graph = ser_graph.graph();
3126

3127
        let components = graph.iter_components();
3128

3129
        let check_components = [
3130
            vec![0, 7, 43, 24, 47, 22, 37, 89, 25, 79, 63, 95, 64, 9, 96, 13, 11, 86, 74, 71, 92, 51, 94, 45, 12, 76, 21],
3131
            vec![1, 54, 44, 5, 57, 65, 84, 10, 58, 35, 42, 73, 30, 83, 77, 15, 80, 72, 81, 78, 67, 49, 69, 91, 2, 90, 33, 87, 55, 8, 17, 88, 31, 56, 52, 27, 4, 6, 99, 40, 93, 28, 26, 62, 59, 97, 82, 46],
3132
            vec![3, 41, 36, 34, 38, 85, 75, 19, 48, 16, 61, 66, 23, 20, 14, 18, 39, 29, 70, 32, 50, 53, 68, 60, 98],
3133
        ];
3134

3135
        let mut counter = 0;
3136

3137
        for component in components {
3138
            if component.len() > 1 { 
3139
                println!("component: {:?}", component);
3140
                assert_eq!(component, check_components[counter]);
3141
                counter += 1;
3142
            }
3143
        }
3144

3145
        assert_eq!(
3146
            vec![(88, Dir::Left), (17, Dir::Left), (8, Dir::Left), (55, Dir::Left), (87, Dir::Left), (33, Dir::Left), (90, Dir::Left), (2, Dir::Left), (91, Dir::Left), (69, Dir::Left), (49, Dir::Left), (78, Dir::Left), (81, Dir::Left), (72, Dir::Left), (1, Dir::Left), (54, Dir::Left), (44, Dir::Left), (5, Dir::Left), (57, Dir::Left)], 
3147
            graph.max_path(|data| data.sum().unwrap_or(1) as f32, |_| true)
3148
        );
3149
    }
3150

3151
    #[test]
3152
    fn test_iter_edges() {
1✔
3153
        use crate::{compression::uncompressed_graph, filter::filter_kmers, reads::{Reads, ReadsPaired}, summarizer::{SampleInfo, SummaryConfig, TagsData}};
3154

3155
        let read1 = "CAGCATCGATGCGACGAGCGCTCGCATCGA".as_bytes();
1✔
3156
        let read2 = "ACGATCGTACGTAGCTAGCTGACTGAGC".as_bytes();
1✔
3157

3158
        let mut reads = Reads::new(crate::reads::Strandedness::Forward);
1✔
3159
        reads.add_from_bytes(read1, None, 0u8);
1✔
3160
        reads.add_from_bytes(read2, None, 1);
1✔
3161

3162
        let reads_paired = ReadsPaired::Unpaired { reads };
1✔
3163

3164
        let sample_info = SampleInfo::new(0b1, 0b10, vec![12, 12]);
1✔
3165
        let summary_config = SummaryConfig::new(sample_info);
1✔
3166
        let (kmers, _) = filter_kmers::<TagsData, Kmer16, _>(&reads_paired, &summary_config, false, 1., false);
1✔
3167

3168
        let graph = uncompressed_graph(&kmers, true).finish();
1✔
3169

3170
        let check_edges: Vec<(usize, Dir, u8, usize)> = vec![(0, Dir::Left, 2, 16), (0, Dir::Right, 1, 2), (1, Dir::Left, 0, 11), 
1✔
3171
        (1, Dir::Right, 0, 16), (3, Dir::Left, 0, 13), (3, Dir::Right, 3, 10), (4, Dir::Left, 2, 21), (4, Dir::Right, 1, 17), (5, Dir::Left, 1, 27), 
1✔
3172
        (5, Dir::Right, 2, 19), (6, Dir::Left, 1, 24), (6, Dir::Right, 2, 12), (7, Dir::Left, 2, 23), (7, Dir::Right, 3, 11), (8, Dir::Left, 0, 12), 
1✔
3173
        (9, Dir::Left, 3, 17), (9, Dir::Right, 3, 24), (10, Dir::Right, 1, 21), (13, Dir::Left, 1, 18), (14, Dir::Right, 0, 26), 
1✔
3174
        (15, Dir::Left, 2, 20), (15, Dir::Right, 3, 22), (18, Dir::Left, 2, 19), (20, Dir::Left, 1, 26), (22, Dir::Right, 2, 25), (23, Dir::Left, 1, 25)];
1✔
3175

3176
        let edges = graph.iter_edges().collect::<Vec<_>>();
1✔
3177

3178
        assert_eq!(check_edges, edges);
1✔
3179
    }
1✔
3180

3181
    #[test]
3182
    fn test_map_transcripts() {
1✔
3183
        let t_ref_path = "test_data/test_transcriptome_reference.fasta";
1✔
3184
        let (_, ser_kmers, _) = build_test_graph::<Kmer22, u32, _>();
1✔
3185
        let (kmers, mut translator, _) = ser_kmers.dissolve();
1✔
3186

3187
        let unc_graph = uncompressed_graph(&kmers, true).finish();
1✔
3188

3189
        let mut id_strings = translator.id_translator().clone().unwrap().into_iter().map(|(name, _id)| name).collect::<Vec<_>>();
1✔
3190
        id_strings.sort();
1✔
3191

3192
        let t_map = unc_graph.map_transcripts(t_ref_path, &mut translator).unwrap();
1✔
3193
        assert_eq!(t_map.len(), unc_graph.len());
1✔
3194
        assert_eq!(t_map.iter().filter(|&ids| !ids.is_empty()).collect::<Vec<_>>().len(), 439);
487✔
3195

3196
        // repeat the same without a previous existing translator
3197
        let mut new_translator = Translator::empty();
1✔
3198
        let t_map = unc_graph.map_transcripts(t_ref_path, &mut new_translator).unwrap();
1✔
3199
        assert_eq!(t_map.len(), unc_graph.len());
1✔
3200
        assert_eq!(t_map.iter().filter(|&ids| !ids.is_empty()).collect::<Vec<_>>().len(), 439);
487✔
3201
        let mut new_id_strings = new_translator.id_translator().clone().unwrap().into_iter().map(|(name, _id)| name).collect::<Vec<_>>();
1✔
3202
        new_id_strings.sort();
1✔
3203

3204
        assert_eq!(id_strings, new_id_strings);
1✔
3205
    }
1✔
3206

3207
    fn build_reads_quality_test() -> ReadsPaired<IDTag> {
2✔
3208
        let correct1 = "CGATGCTGCTGATGCTGAGTCTGACGTATGCGATCGATCGACGATCGTACTAGCTGACTGTGCAGCTAGCTGACTGATCGTAGCTAGCTACGTGCTAGCTACTAGCACTGATGC";
2✔
3209
        let qu_corr1 = "CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC";
2✔
3210
        let incorrect1 =                                     "CGACGATCGTACTAGCTGACTGTGCAGCTAGCTGACTGATCGTGGCTAGCTACGTGCTAGCTA";
2✔
3211
        let qu_incorr1 =                                     "CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC-CCCCCCCCCCCCCCCCCCC";
2✔
3212
        let correct2 =         "GCATCGATCGACGACGTTACGTACGATCTACGTAGCTAGCTAGCTGACATGCTAGCTAGCTCTGACTGATCGTGGCTAGCTGACTGACTGTAGCT";
2✔
3213
        let qu_corr2 =         "CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC";
2✔
3214
        let incorrect2 = "ATGCTGCTGATGCTGAGTCTGACGTAGGCGATCGATCGACGATCGTACTAGCTGACT";
2✔
3215
        let qu_incorr2 = "CCCCCCCCCCCCCCCCCCCCCCCCCC-CCCCCCCCCCCCCCCCCCCCCCCCCCCCCC";
2✔
3216
        let incorrect3 = "ATGCTGCTGATGCTGACTCTGACGTAGGCGATCGATCGATGATCGTACTAGCTGACT";
2✔
3217
        let qu_incorr3 = "CCCCCCCCCCCCCCCC-CCCCCCCCC-CCCCCCCCCCCC-CCCCCCCCCCCCCCCCC";
2✔
3218

3219
        let incorrect4_ins =                 "GACGTATGCGATCGATCGACGATCGTACTAGCTGACTTGTGCAGCTAGCTGACTGAT";
2✔
3220
        let qu_incorr4_ins =                 "CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC-CCCCCCCCCCCCCCCCCCC";
2✔
3221

3222
        let incorrect5_tip =                                                             "AGCTGACTGATCGTAGCTAGCTACGTGCTAGCTACTATCACTGATGC";
2✔
3223
        let qu_incorr5_tip =                                                             "CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC-CCCCCCCCC";
2✔
3224

3225

3226
        let mut reads = Reads::new_with_quality(crate::reads::Strandedness::Forward);
2✔
3227

3228
        for _i in 0..20 {
40✔
3229
            reads.add_read(DnaString::from_acgt_bytes(correct1.as_bytes()), None, IDTag::new(1, 0), Some(qu_corr1.as_bytes()));
40✔
3230
        }
40✔
3231

3232
        for _i in 0..40 {
80✔
3233
            reads.add_read(DnaString::from_acgt_bytes(correct2.as_bytes()), None, IDTag::new(2, 0), Some(qu_corr2.as_bytes()));
80✔
3234
        }
80✔
3235

3236
        reads.add_read(DnaString::from_acgt_bytes(incorrect1.as_bytes()), None, IDTag::new(3, 1), Some(qu_incorr1.as_bytes()));
2✔
3237
        reads.add_read(DnaString::from_acgt_bytes(incorrect2.as_bytes()), None, IDTag::new(4, 1), Some(qu_incorr2.as_bytes()));
2✔
3238
        reads.add_read(DnaString::from_acgt_bytes(incorrect3.as_bytes()), None, IDTag::new(5, 1), Some(qu_incorr3.as_bytes()));
2✔
3239
        reads.add_read(DnaString::from_acgt_bytes(incorrect4_ins.as_bytes()), None, IDTag::new(6, 1), Some(qu_incorr4_ins.as_bytes()));
2✔
3240
        reads.add_read(DnaString::from_acgt_bytes(incorrect5_tip.as_bytes()), None, IDTag::new(7, 1), Some(qu_incorr5_tip.as_bytes()));
2✔
3241

3242

3243
        ReadsPaired::Unpaired { reads }
2✔
3244
    }
2✔
3245

3246
    #[test]
3247
    fn test_remove_lq_splits() {
1✔
3248
        let print = false;
1✔
3249

3250
        type K = Kmer16;
3251

3252
        let seqs = build_reads_quality_test();
1✔
3253
        let sample_info = SampleInfo::new(1, 0b111110, vec![1000, 10, 20, 20, 20, 20]);
1✔
3254
        let summary_config = SummaryConfig::new(sample_info);
1✔
3255
        let (kmers, _) = filter_kmers::<IDMapEMQualityData, K, IDTag>(&seqs, &summary_config, false, 1., false);
1✔
3256

3257

3258
        // make uncompressed graph
3259
        let mut unc_graph = uncompressed_graph(&kmers, true).finish();
1✔
3260

3261
        // add "mapped" ids to graph
3262
        for i in 0..unc_graph.len() {
263✔
3263
            let data = unc_graph.mut_data(i);
263✔
3264
            if let Some(ids) = data.ids() {
263✔
3265
                if ids.contains(&1) {
263✔
3266
                    data.set_mapped_ids(vec![1].into());
99✔
3267
                }
99✔
3268
                else if ids.contains(&2) {
164✔
3269
                    data.set_mapped_ids(vec![2].into());
80✔
3270
                }
84✔
NEW
3271
            }
×
3272
        }
3273

3274
        let colors = Colors::new(&unc_graph, &summary_config, crate::colors::ColorMode::IDS { n_ids: 7 });
1✔
3275

3276
        if print { unc_graph.to_dot("uncompressed_bf.dot", &|node| node.node_dot_default(&colors, &summary_config, &Translator::empty(), false, false), &|node, base, dir, flip| node.edge_dot_default(&colors, base, dir, flip)); }
1✔
3277
        let n_edges = unc_graph.iter_edges().count();
1✔
3278
        unc_graph.remove_lq_splits(BaseQuality::Medium).unwrap();
1✔
3279
        if print { unc_graph.to_dot("uncompressed_af.dot", &|node| node.node_dot_default(&colors, &summary_config, &Translator::empty(), false, false), &|node, base, dir, flip| node.edge_dot_default(&colors, base, dir, flip)); }
1✔
3280
    
3281
        let n_edges_af = unc_graph.iter_edges().count();
1✔
3282
        assert_eq!(n_edges, n_edges_af + 11);
1✔
3283

3284
        // make compressed graph
3285
        let spec = CheckCompress::new(|d: IDMapEMQualityData, _| d, |d, d1| d.join_test(d1));
260✔
3286
        let mut c_graph = compress_kmers_with_hash(true, &spec, &kmers, false, false).finish();
1✔
3287
    
3288
         // add "mapped" ids to graph
3289
        for i in 0..c_graph.len() {
28✔
3290
            let data = c_graph.mut_data(i);
28✔
3291
            if let Some(ids) = data.ids() {
28✔
3292
                if ids.contains(&1) {
28✔
3293
                    data.set_mapped_ids(vec![1].into());
16✔
3294
                }
16✔
3295
                else if ids.contains(&2) {
12✔
3296
                    data.set_mapped_ids(vec![2].into());
3✔
3297
                }
9✔
NEW
3298
            }
×
3299
        }
3300

3301
        let colors = Colors::new(&c_graph, &summary_config, crate::colors::ColorMode::IDS { n_ids: 7 });
1✔
3302

3303
        if print { c_graph.to_dot("compressed_bf.dot", &|node| node.node_dot_default(&colors, &summary_config, &Translator::empty(), false, false), &|node, base, dir, flip| node.edge_dot_default(&colors, base, dir, flip)); }
1✔
3304
        let n_edges = c_graph.iter_edges().count();
1✔
3305
        c_graph.remove_lq_splits(BaseQuality::Medium).unwrap();
1✔
3306
        if print { c_graph.to_dot("compressed_af.dot", &|node| node.node_dot_default(&colors, &summary_config, &Translator::empty(), false, false), &|node, base, dir, flip| node.edge_dot_default(&colors, base, dir, flip)); }
1✔
3307
    
3308
        let n_edges_af = c_graph.iter_edges().count();
1✔
3309
        assert_eq!(n_edges, n_edges_af + 11);
1✔
3310
    }
1✔
3311

3312
    #[test]
3313
    fn test_remove_lq_ladders_tips() {
1✔
3314
        let print = false;
1✔
3315

3316
        type K = Kmer16;
3317

3318
        let seqs = build_reads_quality_test();
1✔
3319
        let sample_info = SampleInfo::new(1, 0b111110, vec![1000, 10, 20, 20, 20, 20]);
1✔
3320
        let summary_config = SummaryConfig::new(sample_info);
1✔
3321
        let (kmers, _) = filter_kmers::<IDMapEMQualityData, K, IDTag>(&seqs, &summary_config, false, 1., false);
1✔
3322

3323

3324
        // make uncompressed graph
3325
        let mut unc_graph = uncompressed_graph(&kmers, true).finish();
1✔
3326

3327
        // add "mapped" ids to graph
3328
        for i in 0..unc_graph.len() {
263✔
3329
            let data = unc_graph.mut_data(i);
263✔
3330
            if let Some(ids) = data.ids() {
263✔
3331
                if ids.contains(&1) {
263✔
3332
                    data.set_mapped_ids(vec![1].into());
99✔
3333
                }
99✔
3334
                else if ids.contains(&2) {
164✔
3335
                    data.set_mapped_ids(vec![2].into());
80✔
3336
                }
84✔
NEW
3337
            }
×
3338
        }
3339

3340
        let colors = Colors::new(&unc_graph, &summary_config, crate::colors::ColorMode::IDS { n_ids: 7 });
1✔
3341

3342
        if print { unc_graph.to_dot("uncompressed_bf.dot", &|node| node.node_dot_default(&colors, &summary_config, &Translator::empty(), false, false), &|node, base, dir, flip| node.edge_dot_default(&colors, base, dir, flip)); }
1✔
3343
        let n_edges = unc_graph.iter_edges().count();
1✔
3344
        unc_graph.remove_lq_ladders_tips(BaseQuality::Medium, 4).unwrap();
1✔
3345
        if print { unc_graph.to_dot("uncompressed_af.dot", &|node| node.node_dot_default(&colors, &summary_config, &Translator::empty(), false, false), &|node, base, dir, flip| node.edge_dot_default(&colors, base, dir, flip)); }
1✔
3346
    
3347
        let n_edges_af = unc_graph.iter_edges().count();
1✔
3348
        assert_eq!(n_edges, n_edges_af + 90);
1✔
3349

3350
        // make compressed graph
3351
        let spec = CheckCompress::new(|d: IDMapEMQualityData, _| d, |d, d1| d.join_test(d1));
260✔
3352
        let mut c_graph = compress_kmers_with_hash(true, &spec, &kmers, false, false).finish();
1✔
3353
    
3354
         // add "mapped" ids to graph
3355
        for i in 0..c_graph.len() {
28✔
3356
            let data = c_graph.mut_data(i);
28✔
3357
            if let Some(ids) = data.ids() {
28✔
3358
                if ids.contains(&1) {
28✔
3359
                    data.set_mapped_ids(vec![1].into());
16✔
3360
                }
16✔
3361
                else if ids.contains(&2) {
12✔
3362
                    data.set_mapped_ids(vec![2].into());
3✔
3363
                }
9✔
NEW
3364
            }
×
3365
        }
3366

3367
        let colors = Colors::new(&c_graph, &summary_config, crate::colors::ColorMode::IDS { n_ids: 7 });
1✔
3368

3369
        if print { c_graph.to_dot("compressed_bf.dot", &|node| node.node_dot_default(&colors, &summary_config, &Translator::empty(), false, false), &|node, base, dir, flip| node.edge_dot_default(&colors, base, dir, flip)); }
1✔
3370
        let n_edges = c_graph.iter_edges().count();
1✔
3371
        c_graph.remove_lq_ladders_tips(BaseQuality::Medium, 4).unwrap();
1✔
3372
        if print { c_graph.to_dot("compressed_af.dot", &|node| node.node_dot_default(&colors, &summary_config, &Translator::empty(), false, false), &|node, base, dir, flip| node.edge_dot_default(&colors, base, dir, flip)); }
1✔
3373
    
3374
        let n_edges_af = c_graph.iter_edges().count();
1✔
3375
        assert_eq!(n_edges, n_edges_af + 15);
1✔
3376
    }
1✔
3377

3378
    #[test]
3379
    fn test_remove_ladders() {
1✔
3380
        let print = false; 
1✔
3381
        let c_csv = if print { Some("c_ladders.csv") } else { None };
1✔
3382
        let uc_csv = if print { Some("uc_ladders.csv") } else { None };
1✔
3383

3384
        let   correct = "ACGATCGATCGCGATCGTAGCTGACTGCTGACGTCTGACTACTGACTGATGCTAGCTATCGTGAC".as_bytes();
1✔
3385
        let incorrect = "ACGATCGATCGCGATCGTAGCTGACTGCTGACGGCTGACTACTGACTGATGCTAGCTATCGTGAC".as_bytes();
1✔
3386
        let incorrec2 = "TGACAGCTGACGGCTGACTACTACGTCACTGACGATGCTGACAC".as_bytes();
1✔
3387
        let incorrec3 = "AAAAAAAAAGCTGACTGCTGACGGCTG".as_bytes();
1✔
3388
        let incorrec4 = "ACGGCTGACTACTGACTGAAAAAAAAAAA".as_bytes();
1✔
3389

3390
        let insertion = "ACGATCGATCGCGATCGATAGCTGACTGCTGACGTCTGACTACTGACTGATGCTAGCTATCGTGAC".as_bytes();
1✔
3391

3392
        let mut reads = Reads::new(crate::reads::Strandedness::Forward);
1✔
3393
        for _i in 0..1000 {
1,000✔
3394
            reads.add_from_bytes(correct, None, IDTag::new(0, 0));
1,000✔
3395
        }
1,000✔
3396

3397
        for _i in 0..2 {
2✔
3398
            reads.add_from_bytes(incorrect, None, IDTag::new(1, 1)); // should be removed
2✔
3399
        }
2✔
3400

3401
        for _i in 0..15 {
15✔
3402
            reads.add_from_bytes(incorrec2, None, IDTag::new(2, 2)); // should be removed
15✔
3403
        }
15✔
3404

3405
        for _i in 0..25 {
25✔
3406
            reads.add_from_bytes(incorrec3, None, IDTag::new(3, 3)); // should be removed
25✔
3407
        }
25✔
3408

3409
        for _i in 0..30 {
30✔
3410
            reads.add_from_bytes(incorrec4, None, IDTag::new(4, 4)); // should be removed
30✔
3411
        }
30✔
3412

3413

3414
        for _i in 0..1 {
1✔
3415
            reads.add_from_bytes(insertion, None, IDTag::new(1, 3)); // should not be removed
1✔
3416
        }
1✔
3417

3418
        let seqs = ReadsPaired::Unpaired { reads };
1✔
3419
        let sample_info = SampleInfo::new(1, 0b111110, vec![1000, 10, 20, 20, 20, 20]);
1✔
3420
        let summary_config = SummaryConfig::new(sample_info);
1✔
3421
        let (kmers, _) = filter_kmers::<IDMapEMData, Kmer16, IDTag>(&seqs, &summary_config, false, 1., false);
1✔
3422

3423

3424
        // test with uncompressed graph
3425
        let mut unc_graph = uncompressed_graph(&kmers, true).finish();
1✔
3426
        // add ids to graph
3427
        for i in 0..unc_graph.len() {
127✔
3428
            let data = unc_graph.mut_data(i);
127✔
3429
            if let Some(ids) = data.ids() {
127✔
3430
                if ids.contains(&0) {
127✔
3431
                    data.set_mapped_ids(vec![0].into());
50✔
3432
                }
77✔
3433
            }
×
3434
        }
3435
        let colors = Colors::new(&unc_graph, &summary_config, crate::colors::ColorMode::IDS { n_ids: 5 });
1✔
3436
        if print { unc_graph.to_dot("uncompressed_bf.dot", &|node| node.node_dot_default(&colors, &summary_config, &Translator::empty(), false, false), &|node, base, dir, flip| node.edge_dot_default(&colors, base, dir, flip)); }
1✔
3437
        let n_edges = unc_graph.iter_edges().count();
1✔
3438
        unc_graph.remove_ladders(10, 10., uc_csv).unwrap();
1✔
3439
        if print { unc_graph.to_dot("uncompressed_af.dot", &|node| node.node_dot_default(&colors, &summary_config, &Translator::empty(), false, false), &|node, base, dir, flip| node.edge_dot_default(&colors, base, dir, flip)); }
1✔
3440
        assert_eq!(n_edges - 10, unc_graph.iter_edges().count());
1✔
3441

3442
        // test with compressed graph
3443
        let spec = CheckCompress::new(|d: IDMapEMData, _| d, |d, d1| d.join_test(d1));
124✔
3444
        let mut c_graph = compress_kmers_with_hash(true, &spec, &kmers, false, false).finish();
1✔
3445
        // add ids to graph
3446
        for i in 0..c_graph.len() {
15✔
3447
            let data = c_graph.mut_data(i);
15✔
3448
            if let Some(ids) = data.ids() {
15✔
3449
                if ids.contains(&0) {
15✔
3450
                    data.set_mapped_ids(vec![0].into());
5✔
3451
                }
10✔
3452
            }
×
3453
        }
3454
        let colors = Colors::new(&c_graph, &summary_config, crate::colors::ColorMode::IDS { n_ids: 5 });
1✔
3455
        if print { c_graph.to_dot("compressed_bf.dot", &|node| node.node_dot_default(&colors, &summary_config, &Translator::empty(), false, false), &|node, base, dir, flip| node.edge_dot_default(&colors, base, dir, flip)); }
1✔
3456
        let n_edges = c_graph.iter_edges().count();
1✔
3457
        c_graph.remove_ladders(10, 10., c_csv).unwrap();
1✔
3458
        if print { c_graph.to_dot("compressed_af.dot", &|node| node.node_dot_default(&colors, &summary_config, &Translator::empty(), false, false), &|node, base, dir, flip| node.edge_dot_default(&colors, base, dir, flip)); }
1✔
3459
        assert_eq!(n_edges - 6, c_graph.iter_edges().count());
1✔
3460
    }
1✔
3461

3462
    #[test]
3463
    fn test_remove_tips() {
1✔
3464

3465
        let print = false;
1✔
3466
        let c_csv = if print { Some("c_tips.csv") } else { None };
1✔
3467
        let uc_csv = if print { Some("uc_tips.csv") } else { None };
1✔
3468

3469
        let     correct = "ACGATCGATCGCGATCGTAGCTGACTGCTGACGTCTGACTACTGACTGATGCTAGCTATCGTGAC".as_bytes();
1✔
3470
        let incorrect_r = "ACGATCGATCGCGATCGTAGCTGACTGCTGACGTCTGACTACTGACTGATGCTAGCTAACGTGAC".as_bytes();
1✔
3471
        let incorrect_l = "ACGTTCGATCGCGATCGTAGCTGACTGCTGACGTCTGACTACTGACTGATGCTAGCTATCGTGAC".as_bytes();
1✔
3472

3473

3474

3475
        let mut reads = Reads::new(crate::reads::Strandedness::Forward);
1✔
3476
        for _i in 0..1000 {
1,000✔
3477
            reads.add_from_bytes(correct, None, IDTag::new(0, 0));
1,000✔
3478
        }
1,000✔
3479

3480
        for _i in 0..10 {
10✔
3481
            reads.add_from_bytes(incorrect_r, None, IDTag::new(1, 1)); // should be removed
10✔
3482
        }
10✔
3483

3484
        for _i in 0..10 {
10✔
3485
            reads.add_from_bytes(incorrect_l, None, IDTag::new(2, 2)); // should be removed
10✔
3486
        }
10✔
3487

3488
        let seqs = ReadsPaired::Unpaired { reads };
1✔
3489
        let sample_info = SampleInfo::new(1, 6, vec![1000, 10, 20]);
1✔
3490
        let summary_config = SummaryConfig::new(sample_info);
1✔
3491
        let (kmers, _) = filter_kmers::<IDMapEMData, Kmer16, IDTag>(&seqs, &summary_config, false, 1., false);
1✔
3492

3493

3494
        // test with uncompressed graph
3495
        let mut unc_graph = uncompressed_graph(&kmers, true).finish();
1✔
3496
        // add ids to graph
3497
        for i in 0..unc_graph.len() {
61✔
3498
            let data = unc_graph.mut_data(i);
61✔
3499
            if let Some(ids) = data.ids() {
61✔
3500
                if ids.contains(&0) {
61✔
3501
                    data.set_mapped_ids(vec![0].into());
50✔
3502
                }
50✔
3503
            }
×
3504
        }
3505

3506
        let colors = Colors::new(&unc_graph, &summary_config, crate::colors::ColorMode::IDS { n_ids: 3 });
1✔
3507
        if print { unc_graph.to_dot("uncompressed_bf.dot", &|node| node.node_dot_default(&colors, &summary_config, &Translator::empty(), false, false), &|node, base, dir, flip| node.edge_dot_default(&colors, base, dir, flip)); }
1✔
3508
        let n_edges = unc_graph.iter_edges().count();
1✔
3509
        
3510
        unc_graph.remove_tips(10, 10., uc_csv).unwrap();
1✔
3511
        if print { unc_graph.to_dot("uncompressed_af.dot", &|node| node.node_dot_default(&colors, &summary_config, &Translator::empty(), false, false), &|node, base, dir, flip| node.edge_dot_default(&colors, base, dir, flip)); }
1✔
3512
        assert_eq!(n_edges - 11, unc_graph.iter_edges().count());
1✔
3513

3514
        // test with compressed graph
3515
        let spec = CheckCompress::new(|d: IDMapEMData, _| d, |d, d1| d.join_test(d1));
56✔
3516
        let mut c_graph = compress_kmers_with_hash(true, &spec, &kmers, false, false).finish();
1✔
3517
        // add ids to graph
3518
        for i in 0..c_graph.len() {
5✔
3519
            let data = c_graph.mut_data(i);
5✔
3520
            if let Some(ids) = data.ids() {
5✔
3521
                if ids.contains(&0) {
5✔
3522
                    data.set_mapped_ids(vec![0].into());
3✔
3523
                }
3✔
3524
            }
×
3525
        }
3526

3527
        let colors = Colors::new(&c_graph, &summary_config, crate::colors::ColorMode::SampleGroups);
1✔
3528
        if print { c_graph.to_dot("compressed_bf.dot", &|node| node.node_dot_default(&colors, &summary_config, &Translator::empty(), false, false), &|node, base, dir, flip| node.edge_dot_default(&colors, base, dir, flip)); }
1✔
3529
        let n_edges = c_graph.iter_edges().count();
1✔
3530
        
3531
        c_graph.remove_tips(10, 10., c_csv).unwrap();
1✔
3532
        if print { c_graph.to_dot("compressed_af.dot", &|node| node.node_dot_default(&colors, &summary_config, &Translator::empty(), false, false), &|node, base, dir, flip| node.edge_dot_default(&colors, base, dir, flip)); }
1✔
3533
        assert_eq!(n_edges - 2, c_graph.iter_edges().count());
1✔
3534

3535
    }
1✔
3536

3537
    #[test]
3538
    fn test_to_tsv() {
1✔
3539
        let reads_us = Reads::from_vmer_vec(
1✔
3540
            (0..10).map(|i| (DnaString::from_bytes(&random_dna(100)), Exts::empty(), i as u8)).collect::<Vec<_>>(), 
10✔
3541
            crate::reads::Strandedness::Unstranded
1✔
3542
        );
3543

3544
        let reads_paired = ReadsPaired::Unpaired { reads: reads_us };
1✔
3545

3546
        let sample_info = SampleInfo::new(0b1111100000, 0b0000011111, vec![1, 2, 3, 4, 5, 6, 7, 8, 9, 10]);
1✔
3547
        let summary_config = SummaryConfig::new(sample_info);
1✔
3548
        let (kmers, _) = filter_kmers::<TagsCountsData, Kmer6, _>(&reads_paired, &summary_config, false, 1., false);
1✔
3549

3550
        let graph = compress_kmers_with_hash(false, &ScmapCompress::new(), &kmers, false, false).finish();
1✔
3551

3552
        graph.to_tsv("test_graph_unstranded.tsv", |node| node.data().print_ol(&Translator::empty(), &summary_config, None)).unwrap();
327✔
3553

3554

3555
        let reads_us = Reads::from_vmer_vec(
1✔
3556
            (0..10).map(|i| (DnaString::from_bytes(&random_dna(100)), Exts::empty(), i as u8)).collect::<Vec<_>>(), 
10✔
3557
            crate::reads::Strandedness::Forward
1✔
3558
        );
3559

3560
        let reads_paired = ReadsPaired::Unpaired { reads: reads_us };
1✔
3561

3562
        let sample_info = SampleInfo::new(0b1111100000, 0b0000011111, vec![1, 2, 3, 4, 5, 6, 7, 8, 9, 10]);
1✔
3563
        let summary_config = SummaryConfig::new(sample_info);
1✔
3564
        let (kmers, _) = filter_kmers::<TagsCountsData, Kmer6, _>(&reads_paired, &summary_config, false, 1., false);
1✔
3565

3566
        let graph = compress_kmers_with_hash(true, &ScmapCompress::new(), &kmers, false, false).finish();
1✔
3567

3568
        graph.to_tsv("test_graph_stranded.tsv", |node| node.data().print_ol(&Translator::empty(), &summary_config, None)).unwrap();
230✔
3569
    
3570
        remove_file("test_graph_unstranded.tsv").unwrap();
1✔
3571
        remove_file("test_graph_stranded.tsv").unwrap();
1✔
3572
    }
1✔
3573
}
3574

STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2026 Coveralls, Inc