• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

griffithlab / regtools / 3689436247

pending completion
3689436247

push

github

GitHub
Merge pull request #169 from griffithlab/doc_update

2407 of 2774 relevant lines covered (86.77%)

3662.66 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

85.89
/src/junctions/junctions_annotator.cc
1
/*  junctions_annotator.cc -- class methods for `junctions annotate`
2

3
    Copyright (c) 2015, The Griffith Lab
4

5
    Author: Avinash Ramu <aramu@genome.wustl.edu>
6

7
Permission is hereby granted, free of charge, to any person obtaining a copy
8
of this software and associated documentation files (the "Software"), to deal
9
in the Software without restriction, including without limitation the rights
10
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
11
copies of the Software, and to permit persons to whom the Software is
12
furnished to do so, subject to the following conditions:
13

14
The above copyright notice and this permission notice shall be included in
15
all copies or substantial portions of the Software.
16

17
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
20
THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
22
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
23
DEALINGS IN THE SOFTWARE.  */
24

25
#include <getopt.h>
26
#include <stdexcept>
27
#include <string>
28
#include "common.h"
29
#include "junctions_annotator.h"
30
#include "htslib/faidx.h"
31

32
using namespace std;
33

34
//Return stream to write output to
35
void JunctionsAnnotator::close_ofstream() {
1✔
36
    if(ofs_.is_open())
1✔
37
        ofs_.close();
1✔
38
}
1✔
39

40
//Return stream to write output to
41
//If output file is not empty, attempt to open
42
//If output file is empty, set to cout
43
void JunctionsAnnotator::set_ofstream_object(ofstream &out) {
1✔
44
    if(output_file_ == "NA") {
1✔
45
        common::copy_stream(cout, out);
×
46
        return;
×
47
    }
48
    ofs_.open(output_file_.c_str());
1✔
49
    if(!ofs_.is_open())
1✔
50
        throw runtime_error("Unable to open " +
×
51
                            output_file_);
×
52
    common::copy_stream(ofs_, out);
1✔
53
}
54

55
//Open junctions file
56
void JunctionsAnnotator::open_junctions() {
7✔
57
    junctions_.Open();
7✔
58
}
7✔
59

60
//Open junctions file
61
void JunctionsAnnotator::close_junctions() {
7✔
62
    junctions_.Close();
7✔
63
}
7✔
64

65
//Adjust the start and end of the junction
66
void JunctionsAnnotator::adjust_junction_ends(BED & line) {
47✔
67
    //Adjust the start and end with block sizes
68
    //The junction start is thick_start + block_size1
69
    //The junction end is thick_end - block_size2 + 1
70
    if(line.fields.size() != 12  || line.fields[10].empty()) {
47✔
71
        stringstream position;
×
72
        position << line.chrom << ":" << line.start;
×
73
        throw runtime_error("BED line not in BED12 format. start: " +
×
74
                            position.str());
×
75
    }
×
76
    string blocksize_field = line.fields[10];
47✔
77
    vector<int> block_sizes;
47✔
78
    Tokenize(blocksize_field, block_sizes, ',');
47✔
79
    line.start += block_sizes[0];
47✔
80
    line.end -= block_sizes[1] - 1;
47✔
81
}
47✔
82

83
//Get a single line from the junctions file
84
bool JunctionsAnnotator::get_single_junction(BED & line) {
54✔
85
    junctions_._status = BED_INVALID;
54✔
86
    if(junctions_.GetNextBed(line) && junctions_._status == BED_VALID) {
54✔
87
        return true;
47✔
88
    } else {
89
        return false;
7✔
90
    }
91
}
92

93
//Get the splice_site bases
94
void JunctionsAnnotator::get_splice_site(AnnotatedJunction & line) {
59✔
95
    string position1 = line.chrom + ":" +
118✔
96
                      common::num_to_str(line.start + 1) + "-" + common::num_to_str(line.start + 2);
236✔
97
    string position2 = line.chrom + ":" +
118✔
98
                      common::num_to_str(line.end - 2) + "-" + common::num_to_str(line.end - 1);
236✔
99
    string seq1, seq2;
59✔
100
    try {
101
        seq1 = get_reference_sequence(position1);
59✔
102
        seq2 = get_reference_sequence(position2);
59✔
103
    } catch (const runtime_error& e) {
×
104
        throw e;
×
105
    }
×
106
    if(line.strand == "-") {
59✔
107
        seq1 = common::rev_comp(seq1);
4✔
108
        seq2 = common::rev_comp(seq2);
4✔
109
        line.splice_site = seq2 + "-" + seq1;
4✔
110
    } else {
111
        line.splice_site = seq1 + "-" + seq2;
55✔
112
    }
113
    return;
118✔
114
}
59✔
115

116
//Extract gtf info
117
bool JunctionsAnnotator::load_gtf() {
1✔
118
    try {
119
        gtf_.load();
1✔
120
    } catch (runtime_error e) {
×
121
        throw e;
×
122
    }
×
123
    return true;
1✔
124
}
125

126
//Find overlap between transcript and junction on the negative strand,
127
//function returns true if either the acceptor or the donor is known
128
bool JunctionsAnnotator::overlap_ps(const vector<BED>& exons,
103✔
129
                                          AnnotatedJunction & junction) {
130
    //skip single exon genes
131
    if(skip_single_exon_genes_ && exons.size() == 1) return false;
103✔
132
    bool junction_start = false;
55✔
133
    bool known_junction = false;
55✔
134
    //check if transcript overlaps with junction
135
    if(exons[0].start > junction.end ||
110✔
136
            exons[exons.size() - 1].end < junction.start)
55✔
137
        return known_junction;
×
138
    for(std::size_t i = 0; i < exons.size(); i++) {
1,156✔
139
        if(exons[i].start > junction.end) {
1,155✔
140
            //No need to look any further
141
            //the rest of the exons are outside the junction
142
            break;
54✔
143
        }
144
        //known junction
145
        if(exons[i].end == junction.start &&
1,154✔
146
                exons[i + 1].start == junction.end) {
53✔
147
            junction.known_acceptor = true;
30✔
148
            junction.known_donor = true;
30✔
149
            junction.known_junction = true;
30✔
150
            known_junction = true;
30✔
151
        }
152
        else {
153
            // YY NOTE: if we have not yet reached the junction on this transcript,
154
            if(!junction_start) {
1,071✔
155
                // then check if we have with this exon (i.e. the 
156
                // exon end is past or at the junction start)
157
                if(exons[i].end >= junction.start) {
1,028✔
158
                    junction_start = true;
55✔
159
                }
160
            }
161
            if(junction_start) {
1,071✔
162
                // YY NOTE: if the exon lies completely within the junction,
163
                //              count it as skipped
164
                if(exons[i].start > junction.start &&
98✔
165
                        exons[i].end < junction.end &&
74✔
166
                        i > 0 && i < (exons.size()-1)) {
172✔
167
                    string exon_coords = common::num_to_str(exons[i].start) + "-" + common::num_to_str(exons[i].end);
42✔
168
                    junction.exons_skipped.insert(exon_coords);
21✔
169
                }
21✔
170
                // YY NOTE: if the exon ends after the junction starts
171
                //              (and junction_start == true), then 
172
                //              count the donor as skipped
173
                        // YY NOTE: maybe it should be junction.end-1? since
174
                        //              if the "donor" and "acceptor" are already
175
                        //              contiguous in the DNA it would get counted
176
                        //              as "skipped" 
177
                if(exons[i].end > junction.start &&
98✔
178
                        exons[i].end < junction.end && 
119✔
179
                        i < (exons.size()-1)) {
21✔
180
                    junction.donors_skipped.insert(exons[i].end);
21✔
181
                }
182
                // YY NOTE: if the exon starts before the junction ends
183
                //              (and junction_start == true), then
184
                //              count the acceptor as skipped
185
                if(exons[i].start < junction.end &&
98✔
186
                        exons[i].start > junction.start &&
98✔
187
                        i > 0) {
188
                    junction.acceptors_skipped.insert(exons[i].start);
22✔
189
                }
190
                if(exons[i].end == junction.start) {
98✔
191
                    junction.known_donor = true;
23✔
192
                }
193
                if(exons[i].start == junction.end) {
98✔
194
                    junction.known_acceptor = true;
52✔
195
                }
196
            }
197
        }
198
    }
199
    annotate_anchor(junction);
55✔
200
    return (junction.anchor != "N");
55✔
201
}
202

203
//YY NOTES
204
/*
205

206
So based on my understanding of the bin search, we should be looking at all the 
207
possible transcripts that might overlap with a junction. That is to say, it 
208
would seem that any discrepancy in acceptors/exons/donors skipped would come 
209
from a problem with the actual counting.
210

211
The overlap function will be run on each transcript, and you can see from the
212
if block how it judges an acceptor/exon/donor as skipped. That logic appears
213
correct to me (see notes above). Doesn't immediately seem like a problem
214
with the actual recording of acceptors/exons/donors skipped (but probably
215
warrants a closer look).
216

217
Then the final place the count could get messed up is in the actual
218
printing to the tsv. The way this works is the skipped elements are 
219
held in sets. Acceptors/donors skipped are held in sets based on their 
220
coordinates while exons skipped are held in sets based on their names.
221
The number of elements skipped is simply the size of the set, so we only 
222
count unique elements. Also don't see a problem immediately with how this works.
223

224
*/
225

226
//Find overlap between transcript and junction on the negative strand,
227
//function returns true if either the acceptor or the donor is known
228
bool JunctionsAnnotator::overlap_ns(const vector<BED> & exons,
8✔
229
                                          AnnotatedJunction & junction) {
230
    //skip single exon genes
231
    if(skip_single_exon_genes_ && exons.size() == 1) return false;
8✔
232
    bool junction_start = false;
4✔
233
    bool known_junction = false;
4✔
234
    //check if transcript overlaps with junction
235
    if(exons[0].end < junction.start ||
8✔
236
            exons[exons.size() - 1].start > junction.end) {
4✔
237
        return known_junction;
×
238
    }
239
    for(std::size_t i = 0; i < exons.size(); i++) {
11✔
240
        if(exons[i].end < junction.start) {
11✔
241
            //No need to look any further
242
            //the rest of the exons are outside the junction
243
            break;
4✔
244
        }
245
        //Check if this is a known junction
246
        if(exons[i].start == junction.end &&
9✔
247
                exons[i + 1].end == junction.start) {
2✔
248
            junction.known_acceptor = true;
1✔
249
            junction.known_donor = true;
1✔
250
            junction.known_junction = true;
1✔
251
            known_junction = true;
1✔
252
        }
253
        else {
254
            if(!junction_start) {
6✔
255
                if(exons[i].start <= junction.end) {
4✔
256
                    junction_start = true;
3✔
257
                }
258
            }
259
            if(junction_start) {
6✔
260
                if(exons[i].start > junction.start &&
5✔
261
                        exons[i].end < junction.end &&
2✔
262
                        i > 0 && i < (exons.size()-1)) {
7✔
263
                    string exon_coords = common::num_to_str(exons[i].start) + "-" + common::num_to_str(exons[i].end);
×
264
                    junction.exons_skipped.insert(exon_coords);
×
265
                }
×
266
                // YY NOTE: if the exon ends after the junction starts
267
                //              (and junction_start == true), then 
268
                //              count the donor as skipped
269
                if(exons[i].end > junction.start &&
5✔
270
                        exons[i].end < junction.end && 
6✔
271
                        i < (exons.size()-1)) {
1✔
272
                    junction.acceptors_skipped.insert(exons[i].end);
1✔
273
                }
274
                // YY NOTE: if the exon starts before the junction ends
275
                //              (and junction_start == true), then
276
                //              count the acceptor as skipped
277
                if(exons[i].start < junction.end &&
9✔
278
                        exons[i].start > junction.start) {
4✔
279
                    junction.donors_skipped.insert(exons[i].start);
1✔
280
                }
281
                if(exons[i].end == junction.start) {
5✔
282
                    junction.known_acceptor = true;
2✔
283
                }
284
                if(exons[i].start == junction.end) {
5✔
285
                    junction.known_donor = true;
1✔
286
                }
287
            }
288
        }
289
    }
290
    annotate_anchor(junction);
4✔
291
    return (junction.anchor != "N");
4✔
292
}
293

294
//Annotate the anchor i.e is this a known/novel donor-acceptor pair
295
void JunctionsAnnotator::annotate_anchor(AnnotatedJunction & junction) {
59✔
296
    junction.anchor = string("N");
59✔
297
    if(junction.known_junction) {
59✔
298
        junction.anchor = "DA";
31✔
299
    } else {
300
        if(junction.known_donor)
28✔
301
            if(junction.known_acceptor)
24✔
302
                junction.anchor = string("NDA");
21✔
303
            else
304
                junction.anchor = string("D");
3✔
305
        else if(junction.known_acceptor)
4✔
306
            junction.anchor = string("A");
2✔
307
    }
308
}
59✔
309

310
//Check for overlap between a transcript and junctions
311
//Check if the junction we saw is a known junction
312
//Calculate exons_skipped, donors_skipped, acceptors_skipped
313
void JunctionsAnnotator::check_for_overlap(string transcript_id, AnnotatedJunction & junction) {
205✔
314
    const vector<BED> & exons =
315
        gtf_.get_exons_from_transcript(transcript_id);
205✔
316
    if(!exons.size()) {
205✔
317
        throw runtime_error("Unexpected error. No exons for transcript "
×
318
                            + transcript_id + "\n\n");
×
319
    }
320
    string transcript_strand = exons[0].strand;
205✔
321
    //Make sure the strands of the junction and transcript match
322
    if(junction.strand != transcript_strand)
205✔
323
        return;
94✔
324
    //Remember exons are sorted from exon1 to last exon
325
    if(junction.strand == "+") {
111✔
326
        if(overlap_ps(exons, junction)) {
103✔
327
            junction.transcripts_overlap.insert(transcript_id);
54✔
328
            junction.genes_overlap.insert(
54✔
329
                    gtf_.get_gene_from_transcript(transcript_id));
108✔
330
        }
331
    } else if(junction.strand == "-") {
8✔
332
        if(overlap_ns(exons, junction)) {
8✔
333
            junction.transcripts_overlap.insert(transcript_id);
3✔
334
            junction.genes_overlap.insert(
3✔
335
                    gtf_.get_gene_from_transcript(transcript_id));
6✔
336
        }
337
    } else {
338
        throw runtime_error("Unknown strand " + junction.strand + "\n\n");
×
339
    }
340
}
205✔
341

342
//Annotate with gtf
343
//Takes a single junction BED and annotates with GTF
344
void JunctionsAnnotator::annotate_junction_with_gtf(AnnotatedJunction & j1) {
59✔
345
    //From BedTools
346
    BIN start_bin, end_bin;
347
    start_bin = (j1.start >> _binFirstShift);
59✔
348
    end_bin = ((j1.end-1) >> _binFirstShift);
59✔
349
    //We loop through each UCSC BIN level for feature A's chrom.
350
    //For each BIN, we loop through each B feature and add it to
351
    // hits if it meets all of the user's requests.
352
    for (BINLEVEL i = 0; i < _binLevels; ++i) {
472✔
353
        BIN offset = _binOffsetsExtended[i];
413✔
354
        for (BIN b = (start_bin + offset); b <= (end_bin + offset); ++b) {
837✔
355
            vector<string> transcripts = gtf_.transcripts_from_bin(j1.chrom, b);
424✔
356
            if(transcripts.size())
424✔
357
                for(std::size_t i = 0; i < transcripts.size(); i++)
299✔
358
                    check_for_overlap(transcripts[i], j1);
205✔
359
        }
424✔
360
        start_bin >>= _binNextShift;
413✔
361
        end_bin >>= _binNextShift;
413✔
362
    }
363
}
59✔
364

365
//Get the reference sequence at a particular coordinate
366
string JunctionsAnnotator::get_reference_sequence(string position) {
118✔
367
    int len;
368
    faidx_t *fai = fai_load(ref_.c_str());
118✔
369
    char *s = fai_fetch(fai, position.c_str(), &len);
118✔
370
    cerr << "position = " << position << endl;
118✔
371
    if(s == NULL)
118✔
372
        throw runtime_error("Unable to extract FASTA sequence "
×
373
                             "for position " + position + "\n\n");
×
374
    std::string seq(s);
118✔
375
    free(s);
118✔
376
    fai_destroy(fai);
118✔
377
    return seq;
236✔
378
}
×
379

380
//Get the name of the GTF file
381
string JunctionsAnnotator::gtf_file() {
1✔
382
    return gtf_.gtffile();
1✔
383
}
384

385
//Parse the options passed to this tool
386
int JunctionsAnnotator::parse_options(int argc, char *argv[]) {
3✔
387
    optind = 1; //Reset before parsing again.
3✔
388
    int c;
389
    stringstream help_ss;
3✔
390
    while((c = getopt(argc, argv, "So:h")) != -1) {
4✔
391
        switch(c) {
2✔
392
            case 'S':
×
393
                skip_single_exon_genes_ = false;
×
394
                break;
×
395
            case 'o':
1✔
396
                output_file_ = string(optarg);
1✔
397
                break;
1✔
398
            case 'h':
1✔
399
                usage(help_ss);
1✔
400
                throw common::cmdline_help_exception(help_ss.str());
1✔
401
            default:
×
402
                usage();
×
403
                throw runtime_error("Error parsing inputs!(1)\n\n");
×
404
        }
405
    }
406
    if(argc - optind >= 3) {
2✔
407
        junctions_.bedFile = string(argv[optind++]);
2✔
408
        ref_ = string(argv[optind++]);
2✔
409
        gtf_.set_gtffile(string(argv[optind++]));
2✔
410
    }
411
    if(optind < argc ||
2✔
412
       ref_ == "NA" ||
4✔
413
       junctions_.bedFile.empty() ||
6✔
414
       gtf_.gtffile().empty()) {
4✔
415
        usage();
×
416
        throw runtime_error("Error parsing inputs!(2)\n\n");
×
417
    }
418
    cerr << "Reference: " << ref_ << endl;
2✔
419
    cerr << "GTF: " << gtf_.gtffile() << endl;
2✔
420
    cerr << "Junctions: " << junctions_.bedFile << endl;
2✔
421
    if(skip_single_exon_genes_)
2✔
422
        cerr << "Skipping single exon genes." << endl;
2✔
423
    if(output_file_ != "NA")
2✔
424
        cerr << "Output file: " << output_file_ << endl;
1✔
425
    cerr << endl;
2✔
426
    return 0;
2✔
427
}
3✔
428

429
//Usage statement for this tool
430
int JunctionsAnnotator::usage(ostream& out) {
1✔
431
    out << "Usage:\t\t" << "regtools junctions annotate [options] junctions.bed ref.fa annotations.gtf" << endl;
1✔
432
    out << "Options:\t" << "-S include single exon genes" << endl;
1✔
433
    out << "\t\t" << "-o FILE\tThe file to write output to. [STDOUT]" << endl;
1✔
434
    out << endl;
1✔
435
    return 0;
1✔
436
}
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2025 Coveralls, Inc