• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

mooreryan / pasv / 124

pending completion
124

push

github

mooreryan
Bump version

1 of 1 new or added line in 1 file covered. (100.0%)

662 of 685 relevant lines covered (96.64%)

5015.1 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

99.16
/lib/check_alignment.ml
1
open! Core
276✔
2
open Mod
3

4
exception Bad_aln_length of string [@@deriving sexp]
5

6
let gap_char = Re2.create_exn "[^a-zA-Z]"
276✔
7
let is_gap_char c = Re2.matches gap_char (String.of_char c)
709,292✔
8
let is_non_gap_char c = not (is_gap_char c)
357,564✔
9

10
let make_zero_raw_aln_pos_map (aln_rec : Position.aln Record.t) =
11
  let _, map =
340✔
12
    aln_rec |> Record.to_fasta_record |> Bio_io.Fasta_record.seq
340✔
13
    |> String.to_array
340✔
14
    |> Array.foldi
15
         ~init:
16
           (Position.zero_raw_of_int 0, Position.Map_wrt.empty_zero_raw_aln ())
340✔
17
         ~f:(fun aln_column (seq_position, map) char ->
18
           let aln_column = Position.zero_aln_of_int aln_column in
351,728✔
19
           if is_gap_char char then (seq_position, map)
77,296✔
20
           else
21
             ( Position.(seq_position + zero_raw_of_int 1),
274,432✔
22
               (* _exn is okay here as new_seq_position is always increasing. *)
23
               Position.Map_wrt.add_exn map ~from:seq_position ~to_:aln_column
274,432✔
24
               (* Map.add_exn map ~key:seq_position ~data:aln_column *) ))
25
  in
26
  map
340✔
27

28
let bad_aln_length_exn seq_i ~expected_len ~actual_len =
29
  let i = seq_i + 1 in
8✔
30
  Bad_aln_length
31
    [%string
32
      "Seq num: %{i#Int}, Expected length: %{expected_len#Int}, Actual length: \
8✔
33
       %{actual_len#Int}"]
8✔
34

35
type alignment_file_data = {
8✔
36
  zero_raw_aln_pos_map :
37
    (Position.zero_indexed, Position.raw, Position.aln) Position.Map_wrt.t;
38
  records : Position.aln Record.t list;
39
  alignment_length : int;
40
  num_records : int;
41
}
42

43
let empty_alignment_data () =
44
  {
348✔
45
    zero_raw_aln_pos_map = Position.Map_wrt.empty_zero_raw_aln ();
348✔
46
    records = [];
47
    alignment_length = 0;
48
    num_records = 0;
49
  }
50

51
let assert_alignment_length_good ~expected_len ~actual_len i =
52
  if actual_len <> expected_len then
4,084✔
53
    raise @@ bad_aln_length_exn i ~expected_len ~actual_len
8✔
54

55
(* Assert some invariants and return it if good. *)
56
let finish_alignment_file_data data infile =
57
  (* If we read the whole file, but didn't hit the main reference, then the map
58
     will be empty. And that is an error. *)
59
  if Position.Map_wrt.length data.zero_raw_aln_pos_map = 0 then
338✔
60
    Or_error.errorf "We didn't find the key ref sequence in '%s'" infile
6✔
61
  else if data.num_records < 1 then
332✔
62
    Or_error.errorf "Should have at least one non-key record in '%s'" infile
6✔
63
  else if data.alignment_length = 0 then
326✔
64
    (* This shouldn't ever happen as the bad alignment length should catch
65
       it. *)
66
    Or_error.errorf "Something bad happened.  Found zero-length seqs in '%s'"
×
67
      infile
68
  else Or_error.return { data with records = List.rev data.records }
326✔
69

70
type alignment_infile = Basic of string | With_pasv_refs of string
71

72
(* Reads everything into memory. With hmmalign, these could be big files.
73
   Because some alignment softwares rearrange sequences, read it all in before
74
   analyzing as you may not get the position map until the end. *)
75
let parse_alignment_file_basic infile =
76
  let open Bio_io in
18✔
77
  let f () =
78
    Fasta_in_channel.with_file_foldi_records_exn infile
18✔
79
      ~init:(empty_alignment_data ()) ~f:(fun i aln record ->
18✔
80
        let this_aln_len = String.length (Fasta_record.seq record) in
404✔
81
        (* Regardless of whether it is a key ref, pasv ref, or query, all aln
82
           lengths should match the first record's length. *)
83
        if i = 0 then
404✔
84
          let zero_raw_aln_pos_map =
18✔
85
            make_zero_raw_aln_pos_map @@ Record.aln_of_fasta_record record
18✔
86
          in
87
          { aln with alignment_length = this_aln_len; zero_raw_aln_pos_map }
18✔
88
        else (
386✔
89
          assert_alignment_length_good i ~expected_len:aln.alignment_length
90
            ~actual_len:this_aln_len;
91
          {
384✔
92
            aln with
93
            num_records = aln.num_records + 1;
94
            records = Record.aln_of_fasta_record record :: aln.records;
384✔
95
          }))
96
  in
97
  match f () with
98
  (* Catches every exception. *)
99
  | exception exn ->
2✔
100
      Or_error.of_exn exn |> Or_error.tag ~tag:"Error parsing alignment"
2✔
101
  | aln -> finish_alignment_file_data aln infile
16✔
102

103
let parse_alignment_file_with_pasv_refs infile =
104
  let is_key_reference = Utils.is_key_reference in
330✔
105
  let is_reference = Utils.is_reference in
106
  let open Bio_io in
107
  let f () =
108
    Fasta_in_channel.with_file_foldi_records_exn infile
330✔
109
      ~init:(empty_alignment_data ()) ~f:(fun i aln record ->
330✔
110
        let this_aln_len = String.length (Fasta_record.seq record) in
3,698✔
111
        (* Regardless of whether it is a key ref, pasv ref, or query, all aln
112
           lengths should match the first record's length. *)
113
        let aln =
3,698✔
114
          if i = 0 then { aln with alignment_length = this_aln_len } else aln
328✔
115
        in
116
        assert_alignment_length_good i ~expected_len:aln.alignment_length
117
          ~actual_len:this_aln_len;
118
        (* Match against the expected ID rather than the first index as some
119
           aligners will rearrange the order of the sequences in the alignment
120
           file. *)
121
        match (is_key_reference record, is_reference record) with
3,692✔
122
        | true, true -> assert false
123
        | true, false ->
322✔
124
            let zero_raw_aln_pos_map =
125
              make_zero_raw_aln_pos_map @@ Record.aln_of_fasta_record record
322✔
126
            in
127
            { aln with zero_raw_aln_pos_map }
322✔
128
        (* In the pasv1 style msa, you will have refs in here to ignore. *)
129
        | false, true -> aln
2,634✔
130
        | false, false ->
736✔
131
            {
132
              aln with
133
              num_records = aln.num_records + 1;
134
              records = Record.aln_of_fasta_record record :: aln.records;
736✔
135
            })
136
  in
137
  match f () with
138
  (* Catches every exception. *)
139
  | exception exn ->
8✔
140
      Or_error.of_exn exn |> Or_error.tag ~tag:"Error parsing alignment"
8✔
141
  | aln -> finish_alignment_file_data aln infile
322✔
142

143
let parse_alignment_file = function
144
  | Basic infile -> parse_alignment_file_basic infile
18✔
145
  | With_pasv_refs infile -> parse_alignment_file_with_pasv_refs infile
330✔
146

147
let get_aln_positions zero_raw_aln_pos_map zero_indexed_positions =
148
  Position.List.zero_raw_to_zero_aln zero_raw_aln_pos_map zero_indexed_positions
326✔
149
  |> Or_error.tag
150
       ~tag:
151
         "The zero-indexed raw position does not have an in zero-indexed \
152
          alignment position map.  Check the residues...are they out of \
153
          bounds?"
154

155
let make_signature_file_header
156
    (positions : (Position.one_indexed, Position.raw) Position.List.t) : string
157
    =
158
  let posns = Position.List.to_list positions in
108✔
159
  let posns = List.map posns ~f:(fun i -> "pos_" ^ Int.to_string i) in
108✔
160
  let posns = String.concat posns ~sep:"\t" in
108✔
161
  [%string "name\t%{posns}\tsignature\tspans_start\tspans_end\tspans"]
108✔
162

163
type spans_boundary = Yes | No | Na
164
let spans_boundary_to_string = function Yes -> "Yes" | No -> "No" | Na -> "NA"
296✔
165

166
type spannning_info = Both | Start | End | Neither | Na
167

168
let spanning_info_to_string = function
169
  | Both -> "Both"
142✔
170
  | Start -> "Start"
52✔
171
  | End -> "End"
52✔
172
  | Neither -> "Neither"
76✔
173
  | Na -> "NA"
738✔
174

175
let get_spanning_info ~spans_start ~spans_end =
176
  match (spans_start, spans_end) with
1,060✔
177
  | Yes, Yes -> Both
142✔
178
  | No, Yes -> End
52✔
179
  | Yes, No -> Start
52✔
180
  | No, No -> Neither
76✔
181
  | Na, Yes | Na, No | Yes, Na | No, Na | Na, Na -> Na
20✔
182

183
let spans s =
184
  let non_gap_char_count =
740✔
185
    s |> String.to_list |> List.count ~f:is_non_gap_char
740✔
186
  in
187
  if non_gap_char_count > 0 then Yes else No
296✔
188

189
let spans_roi_start (record : Position.aln Record.t) = function
190
  | Some (roi_start : Position.zero_indexed_aln) ->
370✔
191
      let record = Record.to_fasta_record record in
192
      let prefix_len = Position.to_int roi_start + 1 in
370✔
193
      let prefix = String.prefix (Bio_io.Fasta_record.seq record) prefix_len in
370✔
194
      spans prefix
370✔
195
  | None -> Na
690✔
196

197
let spans_roi_end (record : Position.aln Record.t) = function
198
  | Some (roi_end : Position.zero_indexed_aln) ->
370✔
199
      let record = Record.to_fasta_record record in
200
      let seq = Bio_io.Fasta_record.seq record in
370✔
201
      let suffix =
370✔
202
        let pos = Position.to_int roi_end in
203
        let len = String.length seq - pos in
370✔
204
        String.sub seq ~pos ~len
370✔
205
      in
206
      spans suffix
207
  | None -> Na
690✔
208

209
let map_roi_boundary map = function
210
  | Some boundary -> (
374✔
211
      match Position.Map_wrt.find_or_error map boundary with
212
      | Ok roi -> Or_error.return @@ Some roi
372✔
213
      | Error err -> Error err)
2✔
214
  | None -> Or_error.return None
276✔
215

216
let check_alignment ~infile ~roi_start ~roi_end ~positions =
217
  let f () =
348✔
218
    let open Or_error.Let_syntax in
348✔
219
    let zi_positions = Position.List.one_to_zero positions in
220
    let%bind aln_file = parse_alignment_file infile in
348✔
221
    let aln_records = Array.of_list aln_file.records in
326✔
222
    let%bind aln_positions =
223
      get_aln_positions aln_file.zero_raw_aln_pos_map zi_positions
326✔
224
    in
225
    let%bind roi_start =
226
      map_roi_boundary aln_file.zero_raw_aln_pos_map roi_start
324✔
227
    in
228
    let%map roi_end = map_roi_boundary aln_file.zero_raw_aln_pos_map roi_end in
324✔
229
    let make_aln_record_info record =
324✔
230
      let id = Bio_io.Fasta_record.id @@ Record.to_fasta_record record in
1,060✔
231
      let spans_start = spans_roi_start record roi_start in
1,060✔
232
      let spans_end = spans_roi_end record roi_end in
1,060✔
233
      let spans_region = get_spanning_info ~spans_start ~spans_end in
1,060✔
234
      let key_residues = Record.get_aln_residues aln_positions record in
235
      let signature = String.concat key_residues ~sep:"" in
1,060✔
236
      let key_residues = String.concat key_residues ~sep:"\t" in
1,060✔
237
      let line =
1,060✔
238
        let ss = spans_boundary_to_string spans_start in
239
        let se = spans_boundary_to_string spans_end in
1,060✔
240
        let sr = spanning_info_to_string spans_region in
1,060✔
241
        String.concat [ id; key_residues; signature; ss; se; sr ] ~sep:"\t"
1,060✔
242
      in
243
      line
244
    in
245
    Array.map aln_records ~f:make_aln_record_info
246
  in
247
  match f () with
248
  | Ok signatures -> Or_error.return signatures
324✔
249
  | Error err -> Error err |> Or_error.tag ~tag:"error in check_alignment"
24✔
250

251
let write_signatures ~filename ~header signatures =
252
  Out_channel.with_file filename ~f:(fun out_chan ->
32✔
253
      Out_channel.output_string out_chan (header ^ "\n");
32✔
254
      Array.iter signatures ~f:(fun signature ->
32✔
255
          Out_channel.output_string out_chan (signature ^ "\n")))
552✔
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2026 Coveralls, Inc