• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

rust-bio / rust-htslib / 19161687300

07 Nov 2025 07:43AM UTC coverage: 81.912% (-0.02%) from 81.935%
19161687300

Pull #488

github

web-flow
Merge 907dabbe9 into 8f1cdd75c
Pull Request #488: fix: Reason about Send/Sync-ness of types and change Rcs to Arcs

34 of 36 new or added lines in 7 files covered. (94.44%)

20 existing lines in 6 files now uncovered.

2785 of 3400 relevant lines covered (81.91%)

27209.26 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

82.41
/src/tbx/mod.rs
1
// Copyright 2018 Manuel Holtgrewe, Berlin Institute of Health.
2
// Licensed under the MIT license (http://opensource.org/licenses/MIT)
3
// This file may not be copied, modified, or distributed
4
// except according to those terms.
5

6
//! Module for working with tabix-indexed text files.
7
//!
8
//! This module allows reading tabix-indexed text files (such as BED) in a convenient but
9
//! line-based (and thus format-agnostic) way. For accessing tabix-indexed VCF files, using the
10
//! `bcf` module is probably a better choice as this module gives you lines from the text files
11
//! which you then have to take care of parsing.
12
//!
13
//! In general, to read a tabix-indexed file, first open it by creating a `tbx::Reader`
14
//! object, then (if needed) translate the chromosome name to its numeric ID in the file, fetch the region
15
//! of interest using `fetch()`, and finally iterate over the records using `records()`.
16
//!
17
//! # Examples
18
//!
19
//! ```rust,no_run
20
//! use rust_htslib::tbx::{self, Read};
21
//!
22
//! // Create a tabix reader for reading a tabix-indexed BED file.
23
//! let path_bed = "file.bed.gz";
24
//! let mut tbx_reader = tbx::Reader::from_path(&path_bed)
25
//!     .expect(&format!("Could not open {}", path_bed));
26
//!
27
//! // Resolve chromosome name to numeric ID.
28
//! let tid = match tbx_reader.tid("chr1") {
29
//!     Ok(tid) => tid,
30
//!     Err(_) => panic!("Could not resolve 'chr1' to contig ID"),
31
//! };
32
//!
33
//! // Set region to fetch.
34
//! tbx_reader
35
//!     .fetch(tid, 0, 100_000)
36
//!     .expect("Could not seek to chr1:1-100,000");
37
//!
38
//! // Read through all records in region.
39
//! for record in tbx_reader.records() {
40
//!     // ... actually do some work
41
//! }
42
//! ```
43

44
use std::ffi;
45
use std::path::Path;
46
use std::ptr;
47
use url::Url;
48

49
use crate::errors::{Error, Result};
50
use crate::htslib;
51
use crate::utils::path_as_bytes;
52

53
/// A trait for a Tabix reader with a read method.
54
pub trait Read: Sized {
55
    /// Read next line into the given `Vec<u8>` (i.e., ASCII string).
56
    ///
57
    /// Use this method in combination with a single allocated record to avoid the reallocations
58
    /// occurring with the iterator.
59
    ///
60
    /// # Arguments
61
    ///
62
    /// * `record` - the `Vec<u8>` to be filled
63
    ///
64
    /// # Returns
65
    /// Ok(true) if record was read, Ok(false) if no more record in file
66
    fn read(&mut self, record: &mut Vec<u8>) -> Result<bool>;
67

68
    /// Iterator over the lines/records of the seeked region.
69
    ///
70
    /// Note that, while being convenient, this is less efficient than pre-allocating a
71
    /// `Vec<u8>` and reading into it with the `read()` method, since every iteration involves
72
    /// the allocation of a new `Vec<u8>`.
73
    fn records(&mut self) -> Records<'_, Self>;
74

75
    /// Return the text headers, split by line.
76
    fn header(&self) -> &Vec<String>;
77
}
78

79
/// A Tabix file reader.
80
///
81
/// This struct and its associated functions are meant for reading plain-text tabix indexed
82
/// by `tabix`.
83
///
84
/// Note that the `tabix` command from `htslib` can actually several more things, including
85
/// building indices and converting BCF to VCF text output.  Both is out of scope here.
86
#[derive(Debug)]
87
pub struct Reader {
88
    /// The header lines (if any).
89
    header: Vec<String>,
90

91
    /// The file to read from.
92
    hts_file: *mut htslib::htsFile,
93
    /// The file format information.
94
    hts_format: htslib::htsExactFormat,
95
    /// The tbx_t structure to read from.
96
    tbx: *mut htslib::tbx_t,
97
    /// The current buffer.
98
    buf: htslib::kstring_t,
99
    /// Iterator over the buffer.
100
    itr: Option<*mut htslib::hts_itr_t>,
101

102
    /// The currently fetch region's tid.
103
    tid: i64,
104
    /// The currently fetch region's 0-based begin pos.
105
    start: i64,
106
    /// The currently fetch region's 0-based end pos.
107
    end: i64,
108
}
109

110
// NOTE(review): this asserts that a `Reader`, which stores raw htslib pointers, may be moved
// to another thread. The justification is not visible in this file; confirm it against
// htslib's threading guarantees before relying on it.
unsafe impl Send for Reader {}
111

112
/// Redefinition of `KS_SEP_LINE` from `htslib/kseq.h`.
113
// Passed as the delimiter argument of `hts_getline` when the header lines are read.
const KS_SEP_LINE: i32 = 2;
114

115
impl Reader {
116
    /// Create a new Reader from path.
117
    ///
118
    /// # Arguments
119
    ///
120
    /// * `path` - the path to open.
121
    pub fn from_path<P: AsRef<Path>>(path: P) -> Result<Self> {
10✔
122
        Self::new(&path_as_bytes(path, true)?)
29✔
123
    }
124

125
    pub fn from_url(url: &Url) -> Result<Self> {
×
126
        Self::new(url.as_str().as_bytes())
×
127
    }
128

129
    /// Create a new Reader.
130
    ///
131
    /// # Arguments
132
    ///
133
    /// * `path` - the path.
134
    fn new(path: &[u8]) -> Result<Self> {
9✔
135
        let path = ffi::CString::new(path).unwrap();
33✔
136
        let c_str = ffi::CString::new("r").unwrap();
26✔
137
        let hts_file = unsafe { htslib::hts_open(path.as_ptr(), c_str.as_ptr()) };
34✔
138
        let hts_format: u32 = unsafe {
139
            let file_format: *const hts_sys::htsFormat = htslib::hts_get_format(hts_file);
33✔
140
            (*file_format).format
9✔
141
        };
142

143
        let tbx = unsafe { htslib::tbx_index_load(path.as_ptr()) };
26✔
144
        if tbx.is_null() {
17✔
145
            return Err(Error::TabixInvalidIndex);
4✔
146
        }
147
        let mut header = Vec::new();
11✔
148
        let mut buf = htslib::kstring_t {
149
            l: 0,
150
            m: 0,
151
            s: ptr::null_mut(),
6✔
152
        };
153
        unsafe {
154
            while htslib::hts_getline(hts_file, KS_SEP_LINE, &mut buf) >= 0 {
15✔
155
                if buf.l > 0 && i32::from(*buf.s) == (*tbx).conf.meta_char {
24✔
156
                    header.push(String::from(ffi::CStr::from_ptr(buf.s).to_str().unwrap()));
12✔
157
                } else {
158
                    break;
5✔
159
                }
160
            }
161
        }
162

163
        Ok(Reader {
6✔
164
            header,
11✔
165
            hts_file,
10✔
166
            hts_format,
10✔
167
            tbx,
10✔
168
            buf,
6✔
169
            itr: None,
5✔
170
            tid: -1,
5✔
171
            start: -1,
5✔
172
            end: -1,
5✔
173
        })
174
    }
175

176
    /// Get sequence/target ID from sequence name.
177
    pub fn tid(&self, name: &str) -> Result<u64> {
6✔
178
        let name_cstr = ffi::CString::new(name.as_bytes()).unwrap();
26✔
179
        let res = unsafe { htslib::tbx_name2id(self.tbx, name_cstr.as_ptr()) };
22✔
180
        if res < 0 {
8✔
181
            Err(Error::UnknownSequence {
2✔
182
                sequence: name.to_owned(),
2✔
183
            })
184
        } else {
185
            Ok(res as u64)
5✔
186
        }
187
    }
188

189
    /// Fetch region given by numeric sequence number and 0-based begin and end position.
190
    pub fn fetch(&mut self, tid: u64, start: u64, end: u64) -> Result<()> {
3✔
191
        self.tid = tid as i64;
3✔
192
        self.start = start as i64;
3✔
193
        self.end = end as i64;
3✔
194

195
        if let Some(itr) = self.itr {
3✔
196
            unsafe {
UNCOV
197
                htslib::hts_itr_destroy(itr);
×
198
            }
199
        }
200
        let itr = unsafe {
201
            htslib::hts_itr_query(
202
                (*self.tbx).idx,
3✔
203
                tid as i32,
3✔
204
                start as i64,
2✔
205
                end as i64,
2✔
206
                Some(htslib::tbx_readrec),
2✔
207
            )
208
        };
209
        if itr.is_null() {
6✔
210
            self.itr = None;
×
211
            Err(Error::Fetch)
×
212
        } else {
213
            self.itr = Some(itr);
3✔
214
            Ok(())
3✔
215
        }
216
    }
217

218
    /// Return the sequence contig names.
219
    pub fn seqnames(&self) -> Vec<String> {
2✔
220
        let mut result = Vec::new();
3✔
221

222
        let mut nseq: i32 = 0;
4✔
223
        let seqs = unsafe { htslib::tbx_seqnames(self.tbx, &mut nseq) };
5✔
224
        for i in 0..nseq {
5✔
225
            unsafe {
226
                result.push(String::from(
9✔
227
                    ffi::CStr::from_ptr(*seqs.offset(i as isize))
10✔
228
                        .to_str()
3✔
229
                        .unwrap(),
3✔
230
                ));
231
            }
232
        }
233
        unsafe {
234
            libc::free(seqs as *mut libc::c_void);
2✔
235
        };
236

237
        result
2✔
238
    }
239

240
    /// Activate multi-threaded BGZF read support in htslib. This should permit faster
241
    /// reading of large BGZF files.
242
    ///
243
    /// # Arguments
244
    ///
245
    /// * `n_threads` - number of extra background reader threads to use
246
    pub fn set_threads(&mut self, n_threads: usize) -> Result<()> {
×
247
        assert!(n_threads > 0, "n_threads must be > 0");
×
248

249
        let r = unsafe { htslib::hts_set_threads(self.hts_file, n_threads as i32) };
×
250
        if r != 0 {
×
251
            Err(Error::SetThreads)
×
252
        } else {
253
            Ok(())
×
254
        }
255
    }
256

257
    pub fn hts_format(&self) -> htslib::htsExactFormat {
×
258
        self.hts_format
×
259
    }
260
}
261

262
/// Tell whether two genomic intervals, each given as `(tid, begin, end)`, overlap.
///
/// Intervals on different `tid`s never overlap. On the same `tid`, they overlap when each
/// one begins strictly before the other one ends.
fn overlap(tid1: i64, begin1: i64, end1: i64, tid2: i64, begin2: i64, end2: i64) -> bool {
    if tid1 != tid2 {
        return false;
    }
    begin1 < end2 && begin2 < end1
}
266

267
impl Read for Reader {
268
    fn read(&mut self, record: &mut Vec<u8>) -> Result<bool> {
5✔
269
        match self.itr {
5✔
270
            Some(itr) => {
5✔
271
                loop {
1✔
272
                    // Try to read next line.
273
                    let ret = unsafe {
274
                        htslib::hts_itr_next(
275
                            htslib::hts_get_bgzfp(self.hts_file),
9✔
276
                            itr,
4✔
277
                            //mem::transmute(&mut self.buf),
278
                            &mut self.buf as *mut htslib::kstring_t as *mut libc::c_void,
5✔
279
                            //mem::transmute(self.tbx),
280
                            self.tbx as *mut libc::c_void,
5✔
281
                        )
282
                    };
283
                    // Handle errors first.
284
                    if ret == -1 {
5✔
285
                        return Ok(false);
3✔
286
                    } else if ret == -2 {
3✔
287
                        return Err(Error::TabixTruncatedRecord);
×
288
                    } else if ret < 0 {
3✔
289
                        panic!("Return value should not be <0 but was: {}", ret);
×
290
                    }
291
                    // Return first overlapping record (loop will stop when `hts_itr_next(...)`
292
                    // returns `< 0`).
293
                    let (tid, start, end) =
8✔
294
                        unsafe { ((*itr).curr_tid, (*itr).curr_beg, (*itr).curr_end) };
4✔
295
                    // XXX: Careful with this tid conversion!!!
296
                    if overlap(self.tid, self.start, self.end, tid as i64, start, end) {
15✔
297
                        *record =
4✔
298
                            unsafe { Vec::from(ffi::CStr::from_ptr(self.buf.s).to_str().unwrap()) };
9✔
299
                        return Ok(true);
3✔
300
                    }
301
                }
302
            }
303
            _ => Err(Error::TabixNoIter),
×
304
        }
305
    }
306

307
    fn records(&mut self) -> Records<'_, Self> {
2✔
308
        Records { reader: self }
309
    }
310

311
    fn header(&self) -> &Vec<String> {
×
312
        &self.header
×
313
    }
314
}
315

316
impl Drop for Reader {
317
    fn drop(&mut self) {
6✔
318
        unsafe {
319
            if self.itr.is_some() {
13✔
320
                htslib::hts_itr_destroy(self.itr.unwrap());
5✔
321
            }
322
            htslib::tbx_destroy(self.tbx);
11✔
323
            htslib::hts_close(self.hts_file);
11✔
324
        }
325
    }
326
}
327

328
/// Iterator over the lines of a tabix file.
329
#[derive(Debug)]
330
pub struct Records<'a, R: Read> {
331
    reader: &'a mut R,
332
}
333

334
impl<R: Read> Iterator for Records<'_, R> {
    type Item = Result<Vec<u8>>;

    /// Reads the next line into a fresh `Vec<u8>`.
    ///
    /// Yields `Some(Ok(line))` for each record, `Some(Err(_))` when the reader fails, and
    /// `None` once the reader reports that no record is left.
    #[allow(clippy::read_zero_byte_vec)]
    fn next(&mut self) -> Option<Result<Vec<u8>>> {
        let mut line = Vec::new();
        let outcome = self.reader.read(&mut line);
        match outcome {
            Err(err) => Some(Err(err)),
            Ok(has_record) => {
                if has_record {
                    Some(Ok(line))
                } else {
                    None
                }
            }
        }
    }
}
347

348
#[cfg(test)]
mod tests {
    use super::*;

    /// Tabix-indexed BED3 fixture shared by the BED tests.
    const BED3_PATH: &str = "test/tabix_reader/test_bed3.bed.gz";

    /// Opens the shared BED3 fixture, panicking if that fails.
    fn open_bed3() -> Reader {
        Reader::from_path(BED3_PATH).expect("Error opening file.")
    }

    #[test]
    fn bed_basic() {
        let reader = open_bed3();

        // The sequence name vector lists both contigs in order.
        let expected_names = vec![String::from("chr1"), String::from("chr2")];
        assert_eq!(reader.seqnames(), expected_names);

        // Names map to their index; unknown names are an error.
        assert_eq!(reader.tid("chr1").unwrap(), 0);
        assert_eq!(reader.tid("chr2").unwrap(), 1);
        assert!(reader.tid("chr3").is_err());
    }

    #[test]
    fn bed_fetch_from_chr1_read_api() {
        let mut reader = open_bed3();

        let chr1_id = reader.tid("chr1").unwrap();
        assert!(reader.fetch(chr1_id, 1000, 1003).is_ok());

        let mut line = Vec::new();
        assert!(reader.read(&mut line).is_ok());
        assert_eq!(line, b"chr1\t1001\t1002".to_vec());
        assert_eq!(reader.read(&mut line), Ok(false)); // EOF
    }

    #[test]
    fn bed_fetch_from_chr1_iterator_api() {
        let mut reader = open_bed3();

        let chr1_id = reader.tid("chr1").unwrap();
        assert!(reader.fetch(chr1_id, 1000, 1003).is_ok());

        let mut lines: Vec<Vec<u8>> = Vec::new();
        for line in reader.records() {
            lines.push(line.unwrap());
        }
        assert_eq!(lines, vec![b"chr1\t1001\t1002".to_vec()]);
    }

    #[test]
    fn test_fails_on_bam() {
        assert!(Reader::from_path("test/test.bam").is_err());
    }

    #[test]
    fn test_fails_on_non_existiant() {
        assert!(Reader::from_path("test/no_such_file").is_err());
    }

    #[test]
    fn test_fails_on_vcf() {
        assert!(Reader::from_path("test/test_left.vcf").is_err());
    }

    #[test]
    fn test_text_header_regions() {
        // This file has chromosome, start, and end positions with a header line.
        let opened = Reader::from_path("test/tabix_reader/genomic_regions_header.txt.gz");
        opened.expect("Error opening file.");
    }

    #[test]
    fn test_text_header_positions() {
        // This file has chromosome and position with a header line, indexed with
        // `tabix -b2 -e2 <file>`.
        let opened = Reader::from_path("test/tabix_reader/genomic_positions_header.txt.gz");
        opened.expect("Error opening file.");
    }

    #[test]
    fn test_text_bad_header() {
        // This is a duplicate of the above file but the index file is nonsense text.
        let opened = Reader::from_path("test/tabix_reader/bad_header.txt.gz");
        opened.expect_err("Invalid index file should fail.");
    }
}
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2025 Coveralls, Inc