• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

mooreryan / InteinFinder / 29

pending completion
29

push

github

Ryan M. Moore
Add yaml to tiny_config deps

1309 of 1389 relevant lines covered (94.24%)

4869.03 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

91.53
/lib/alignment.ml
1
(** Dealing with fasta records in the alignment in/out files. *)
2

3
open! Core
4
module Record = Bio_io_ext.Fasta.Record
5
module C = Coord
6

7
let gap_char = '-'
8

9
[@@@coverage off]
10

11
type aln_out =
12
  {query: Record.query_aln; intein: Record.intein_aln; file_name: string}
13
[@@deriving sexp_of]
14

15
[@@@coverage on]
16

17
let is_gap = function '-' -> true | _ -> false
109,003✔
18

19
let is_non_gap = Fn.non is_gap
23✔
20

21
module El = struct
22
  [@@@coverage off]
23

24
  type t = Residue of char | Gap [@@deriving sexp_of]
25

26
  [@@@coverage on]
27

28
  (* Note that we treat El.t option None case as a gap. *)
29

30
  let to_string = function Residue c -> Char.to_string c | Gap -> "-"
130✔
31

32
  (** Explicitly tell the user about None. Otherwise like [to_string]. *)
33
  let to_string' = function None -> "None" | Some el -> to_string el
85✔
34

35
  (** Treat None as a gap. Useful in the internal checks functions. *)
36
  let to_string_none_is_gap = function None -> "-" | Some el -> to_string el
8✔
37

38
  (** Concat, but treat none as a gap. Used in the internal checks functions. *)
39
  let concat_none_is_gap el1 el2 =
40
    to_string_none_is_gap el1 ^ to_string_none_is_gap el2
75✔
41
end
42

43
module Intein_query_info = struct
44
  [@@@coverage off]
45

46
  (** Stuff on the query sequence that corresponds to where the intein aligned. *)
47
  type t =
48
    { intein_start_minus_one: El.t option
49
    ; intein_start: El.t option
50
    ; intein_penultimate: El.t option
51
    ; intein_end: El.t option
52
    ; intein_end_plus_one: El.t option
53
    ; intein_start_index: C.zero_aln option
54
    ; intein_end_index: C.zero_aln option
55
    ; intein_start_to_the_right_of_hit_region_start: bool
56
    ; intein_end_to_the_left_of_hit_region_end: bool }
57
  [@@deriving sexp_of, fields]
58

59
  [@@@coverage on]
60

61
  let to_string t =
62
    let conv to_s acc _ _ v = to_s v :: acc in
64✔
63
    let conv_el_opt = conv El.to_string' in
64
    (* let conv_zero_aln_opt = let f x = match x with None -> "None" | Some x ->
65
       C.to_one_indexed_string x in conv f in *)
66
    let ignore acc _ _ _ = acc in
64✔
67
    let l =
68
      Fields.Direct.fold
69
        t
70
        ~init:[]
71
        ~intein_start_minus_one:conv_el_opt
72
        ~intein_start:conv_el_opt
73
        ~intein_penultimate:conv_el_opt
74
        ~intein_end:conv_el_opt
75
        ~intein_end_plus_one:conv_el_opt
76
        ~intein_start_index:ignore
77
        ~intein_end_index:ignore
78
        ~intein_start_to_the_right_of_hit_region_start:ignore
79
        ~intein_end_to_the_left_of_hit_region_end:ignore
80
    in
81
    l |> List.rev |> String.concat ~sep:"\t"
64✔
82

83
  let to_string_with_info t ~query_name ~intein_name
84
      ~(region_index : Zero_indexed_int.t) =
85
    String.concat
64✔
86
      ~sep:"\t"
87
      [ query_name
88
      ; Zero_indexed_int.to_one_indexed_string region_index
64✔
89
      ; intein_name
90
      ; to_string t ]
64✔
91

92
  let to_string_with_info_header () =
93
    [ "query"
7✔
94
    ; "region"
95
    ; "intein_target"
96
    ; "intein_start_minus_one"
97
    ; "intein_start"
98
    ; "intein_penultimate"
99
    ; "intein_end"
100
    ; "intein_end_plus_one" ]
101
end
102

103
type left_to_right_checks =
104
  { intein_start_to_the_right_of_hit_region_start: bool
105
  ; intein_start: El.t option
106
  ; intein_start_index: C.zero_aln option
107
  ; intein_start_minus_one: El.t option }
108

109
type right_to_left_checks =
110
  { intein_end_to_the_left_of_hit_region_end: bool
111
  ; intein_end: El.t option
112
  ; intein_end_index: C.zero_aln option
113
  ; intein_end_plus_one: El.t option
114
  ; intein_penultimate: El.t option }
115

116
(** You can't have a first without a last, that's why the both come in the
117
    option. If this is None, then you have an aligned sequence with all gaps,
118
    which is very bad an (probably) should take down the program. *)
119
let first_and_last_non_gap_positions :
120
    Record.query_aln -> (C.zero_aln * C.zero_aln) option =
121
 fun (Record.Query_aln record) ->
122
  let first_non_gap, last_non_gap =
80✔
123
    String.foldi
124
      (Record.seq record)
80✔
125
      ~init:(None, 0)
126
      ~f:(fun aln_i (first_non_gap, last_non_gap) c ->
127
        if is_non_gap c then
42,544✔
128
          match first_non_gap with
35,240✔
129
          | None ->
74✔
130
              (Some aln_i, aln_i)
131
          | Some _ ->
35,166✔
132
              (first_non_gap, aln_i)
133
        else (first_non_gap, last_non_gap) )
7,304✔
134
  in
135
  match first_non_gap with
80✔
136
  | None ->
6✔
137
      (* This happens if the sequence was all gaps. *)
138
      None
139
  | Some first_non_gap ->
74✔
140
      Some (C.zero_aln_exn first_non_gap, C.zero_aln_exn last_non_gap)
74✔
141

142
let find_next_non_gap_index :
143
    string -> current_index:C.zero_aln -> C.zero_aln option =
144
 fun s ~current_index ->
145
  (* Fail if index is too big. *)
146
  if C.to_int current_index >= String.length s then
1,823✔
147
    raise (Invalid_argument "current index >= string length") ;
2✔
148
  (* We want to start the search AFTER the current index. *)
149
  let pos = C.(incr current_index |> to_int) in
1,821✔
150
  if pos >= String.length s then None
7✔
151
  else
152
    let%map.Option i = String.lfindi ~pos s ~f:(fun _ c -> is_non_gap c) in
1,814✔
153
    (* lfindi will only ever return a valid index, so exn is safe here. *)
154
    C.zero_aln_exn i
1,810✔
155

156
let find_previous_non_gap_index :
157
    string -> current_index:C.zero_aln -> C.zero_aln option =
158
 fun s ~current_index ->
159
  (* Fail if index is too big. *)
160
  if C.to_int current_index >= String.length s then
1,823✔
161
    raise (Invalid_argument "current index >= string length") ;
2✔
162
  let open Option.Let_syntax in
1,821✔
163
  (* We want to start the search BEFORE the current index. *)
164
  let%bind pos = C.decr current_index in
1,821✔
165
  let%map i =
166
    String.rfindi ~pos:(C.to_int pos) s ~f:(fun _ c -> is_non_gap c)
1,814✔
167
  in
168
  (* lfindi will only ever return a valid index, so exn is safe here. *)
169
  C.zero_aln_exn i
1,810✔
170

171
module Leftover = struct
172
  type t =
173
    { aln_i: C.zero_aln
174
    ; prev_non_gap_index: C.zero_aln option
175
    ; next_non_gap_index: C.zero_aln option }
176

177
  let v ~aln_i ~prev_non_gap_index ~next_non_gap_index =
178
    {aln_i; next_non_gap_index; prev_non_gap_index}
1,804✔
179

180
  (** Makes a new [t] with the data from the aligned query sequence. (Call when
181
      generating the map from aln to raw.) *)
182
  let create ~query ~aln_i =
183
    let next_non_gap_index =
1,804✔
184
      find_next_non_gap_index query ~current_index:aln_i
185
    in
186
    let prev_non_gap_index =
1,804✔
187
      find_previous_non_gap_index query ~current_index:aln_i
188
    in
189
    v ~aln_i ~prev_non_gap_index ~next_non_gap_index
1,804✔
190
end
191

192
(* Note: This uses add exn in places where it is a bug if the key is already
193
   present. *)
194
let aln_to_raw : Record.query_aln -> C.Query_aln_to_raw.t =
195
 fun (Record.Query_aln query_record as query_aln) ->
196
  let module Map = C.Query_aln_to_raw in
73✔
197
  let open C.Query_aln_to_raw.Value in
198
  let first_non_gap_pos, last_non_gap_pos =
199
    Option.value_exn
200
      ~error:
201
        (Error.createf
73✔
202
           "first_and_last_non_gap_positions returned None.  This can only \
203
            happen if the aligned query sequence was all gaps.  If you see \
204
            this message, there is definitely something wrong with your input \
205
            data.  The bad query was '%s'."
206
           (Record.id query_record) )
73✔
207
    @@ first_and_last_non_gap_positions query_aln
73✔
208
  in
209
  let query = Record.seq query_record in
70✔
210
  (* Tracks the befores, afters, ats and the leftovers. *)
211
  let first_pass_reducer aln_i (raw_i, map, leftovers) query_char =
70✔
212
    let aln_i = C.zero_aln_exn aln_i in
42,516✔
213
    if C.(aln_i < first_non_gap_pos) then (
884✔
214
      (* If this assertion fails, there is either a bug in the
215
         first_and_last_non_gap_positions function, or you passed the wrong
216
         sequence to that function and to this one. Same with the assertion
217
         below this one. *)
218
      assert (is_gap query_char) ;
884✔
219
      let next_map = Map.add_exn map ~key:aln_i ~data:Before in
220
      (raw_i, next_map, leftovers) )
884✔
221
    else if C.(aln_i > last_non_gap_pos) then (
4,596✔
222
      assert (is_gap query_char) ;
4,596✔
223
      let next_map = Map.add_exn map ~key:aln_i ~data:After in
224
      (raw_i, next_map, leftovers) )
4,596✔
225
    else if is_gap query_char then
37,036✔
226
      (* let next_non_gap_index = find_next_non_gap_index query
227
         ~current_index:aln_i in let prev_non_gap_index =
228
         find_previous_non_gap_index query ~current_index:aln_i in let leftover
229
         = Leftover.v ~aln_i ~raw_i ~prev_non_gap_index ~next_non_gap_index
230
         in *)
231
      let leftover = Leftover.create ~query ~aln_i in
1,804✔
232
      let next_leftovers = leftover :: leftovers in
233
      (raw_i, map, next_leftovers)
234
    else
235
      (* Non-gap *)
236
      let next_raw_i = raw_i + 1 in
35,232✔
237
      let data = At (C.zero_raw_exn raw_i) in
35,232✔
238
      let next_map = Map.add_exn map ~key:aln_i ~data in
239
      (next_raw_i, next_map, leftovers)
35,232✔
240
  in
241
  (* Second pass we fill in the other betweens (gap columns). *)
242
  (* Take the map and a leftover gap column and add it to the map. *)
243
  let add_leftover_gap_column map
244
      {Leftover.aln_i; prev_non_gap_index; next_non_gap_index} =
245
    (* Note: We don't use the opt method here because, if this key is not found,
246
       it is a bug. *)
247
    let find i =
1,804✔
248
      match Map.find_exn map i with
3,608✔
249
      | At x ->
3,608✔
250
          x
251
      | _ ->
252
          Utils.impossible
253
            "non-gap column index should always be 'At' some raw index in the \
254
             query" [@coverage off]
255
    in
256
    let next_non_gap_index_raw = Option.map next_non_gap_index ~f:find in
257
    let prev_non_gap_index_raw = Option.map prev_non_gap_index ~f:find in
1,804✔
258
    let data = Between (prev_non_gap_index_raw, next_non_gap_index_raw) in
1,804✔
259
    Map.add_exn map ~key:aln_i ~data
260
  in
261
  let first_pass query =
262
    let _raw_i, map, leftovers =
70✔
263
      let raw_i = 0 in
264
      let map = Map.empty in
265
      let leftovers = [] in
266
      String.foldi query ~init:(raw_i, map, leftovers) ~f:first_pass_reducer
70✔
267
    in
268
    (map, leftovers)
269
  in
270
  let second_pass (map, leftovers) =
271
    List.fold leftovers ~init:map ~f:add_leftover_gap_column
70✔
272
  in
273
  query |> first_pass |> second_pass
70✔
274

275
let add_prefix : string -> Record.t -> Record.t =
276
 fun prefix record ->
277
  let open Record in
192✔
278
  let id = id record in
279
  let new_id = prefix ^ id in
192✔
280
  with_id new_id record
281

282
let clip_sequence (Record.Query_raw record) clipping_region =
283
  (* Convert 1-indexed to 0-indexed. *)
284
  let start = C.to_zero_indexed_int @@ Region.start clipping_region in
64✔
285
  let stop = C.to_zero_indexed_int @@ Region.end_ clipping_region in
64✔
286
  (* Basic assumptions with the region and seq length.. *)
287
  assert (start >= 0 && start < Record.seq_length record) ;
64✔
288
  assert (stop >= 0 && stop < Record.seq_length record) ;
64✔
289
  let seq = Record.seq record in
290
  let clipped_seq =
64✔
291
    String.sub seq ~pos:start ~len:(Region.length clipping_region)
64✔
292
  in
293
  let clipped_seq_id = "clipped___" ^ Record.id record in
64✔
294
  Record.create ~id:clipped_seq_id ~desc:None ~seq:clipped_seq
295
  |> Record.clipped_query_raw
296

297
(* Write the 3 sequences that will be aligned. (query, intein target, and
298
   clipped query). *)
299
let write_aln_in_file ~query:(Record.Query_raw query)
300
    ~clipped_query:(Record.Clipped_query_raw clipped_query)
301
    ~intein:(Record.Intein_raw intein) ~file_name =
302
  let query_seq = add_prefix Record.query_prefix query in
64✔
303
  let clipped_query_seq =
64✔
304
    add_prefix Record.clipped_query_prefix clipped_query
305
  in
306
  let intein_seq = add_prefix Record.intein_prefix intein in
64✔
307
  Out_channel.write_lines
64✔
308
    file_name
309
    [ Record.to_string intein_seq
64✔
310
    ; Record.to_string clipped_query_seq
64✔
311
    ; Record.to_string query_seq ]
64✔
312

313
(* Note: you can't trust that the order of the aligned sequences is correct.
314

315
   Note: this asserts the lengths of the aligment as well. *)
316
let read_aln_out_file file_name =
317
  let assert_aln_lengths (Record.Query_aln query)
72✔
318
      (Record.Clipped_query_aln clipped_query) (Record.Intein_aln intein) =
319
    let query_len = Record.seq_length query in
70✔
320
    let clipped_query_len = Record.seq_length clipped_query in
70✔
321
    let intein_len = Record.seq_length intein in
70✔
322
    assert (query_len = clipped_query_len && query_len = intein_len)
70✔
323
  in
324
  let parse_seqs r1 r2 r3 =
325
    (* Note: this is generated code. *)
326
    match
71✔
327
      ( Record.query_clipped_or_intein_of_record r1
71✔
328
      , Record.query_clipped_or_intein_of_record r2
71✔
329
      , Record.query_clipped_or_intein_of_record r3 )
71✔
330
    with
331
    | Ok (Query query), Ok (Clipped_query clipped_query), Ok (Intein intein)
1✔
332
    | Ok (Query query), Ok (Intein intein), Ok (Clipped_query clipped_query)
1✔
333
    | Ok (Clipped_query clipped_query), Ok (Query query), Ok (Intein intein)
1✔
334
    | Ok (Clipped_query clipped_query), Ok (Intein intein), Ok (Query query)
1✔
335
    | Ok (Intein intein), Ok (Query query), Ok (Clipped_query clipped_query)
1✔
336
    | Ok (Intein intein), Ok (Clipped_query clipped_query), Ok (Query query) ->
65✔
337
        assert_aln_lengths query clipped_query intein ;
338
        {query; intein; file_name}
70✔
339
    | _ ->
1✔
340
        (* Should be okay as there SHOULD never be a case in practice where this
341
           happens. If it does it's a pretty programmer error. *)
342
        let file_contents = In_channel.read_all file_name in
343
        failwithf
1✔
344
          "Expected one query, one clipped query, and one intein, but got \
345
           something else in alignment out file:\n\
346
           %s"
347
          file_contents
348
          ()
349
  in
350
  match Bio_io.Fasta.In_channel.with_file_records file_name with
351
  | [r1; r2; r3] ->
71✔
352
      parse_seqs r1 r2 r3
353
  | l ->
1✔
354
      let num_seqs = List.length l in
355
      failwithf
1✔
356
        "expected three sequences in the alignment output file but got %d"
357
        num_seqs
358
        ()
359

360
let left_to_right_range aln_len =
361
  List.range 0 aln_len ~start:`inclusive ~stop:`exclusive
99✔
362
  |> List.map ~f:C.zero_aln_exn
363

364
let right_to_left_range aln_len =
365
  List.range aln_len 0 ~start:`exclusive ~stop:`inclusive ~stride:(-1)
99✔
366
  |> List.map ~f:C.zero_aln_exn
367

368
let string_get_aln : string -> C.zero_aln -> char =
369
 fun s i -> String.get s (C.to_int i)
35,042✔
370

371
let update_raw_query_idx ~query_char ~raw_query_idx ~start_idx ~index_updater =
372
  if is_non_gap query_char then
17,521✔
373
    match !raw_query_idx with
17,219✔
374
    | Some raw_query_idx' ->
17,070✔
375
        raw_query_idx := Some (index_updater raw_query_idx')
17,070✔
376
    | None ->
149✔
377
        raw_query_idx := Some (C.zero_raw_exn start_idx)
149✔
378

379
(* TODO: intein_start_to_the_right_of_hit_region_start name is misleading as it
380
   actually does check that it is on the region boundary ...and counts that as
381
   good. (Same with the before the hit region one. *)
382
let check_if_intein_start_on_the_query_is_after_hit_region_start
383
    ~hit_region_start ~raw_query_idx
384
    ~intein_start_to_the_right_of_hit_region_start =
385
  match !raw_query_idx with
99✔
386
  | Some raw_query_idx ->
86✔
387
      if C.(hit_region_start <= raw_query_idx) then
388
        intein_start_to_the_right_of_hit_region_start := true
53✔
389
  | None ->
13✔
390
      ()
391

392
let check_if_intein_end_on_the_query_is_before_hit_region_end ~hit_region_end
393
    ~raw_query_idx ~intein_end_to_the_left_of_hit_region_end =
394
  match !raw_query_idx with
99✔
395
  | Some raw_query_idx ->
63✔
396
      if C.(raw_query_idx <= hit_region_end) then
397
        intein_end_to_the_left_of_hit_region_end := true
44✔
398
  | None ->
36✔
399
      ()
400

401
(** Set intein start index to the current alignment index. *)
402
let set_intein_start_index ~intein_start_index ~aln_idx =
403
  intein_start_index := Some aln_idx
99✔
404

405
let set_intein_end_index ~intein_end_index ~aln_idx =
406
  intein_end_index := Some aln_idx
99✔
407

408
let set_intein_start_char ~intein_start ~query_char =
409
  if is_gap query_char then intein_start := Some El.Gap
25✔
410
  else intein_start := Some (El.Residue query_char)
74✔
411

412
let set_intein_end_char ~intein_end ~query_char =
413
  if is_gap query_char then intein_end := Some El.Gap
48✔
414
  else intein_end := Some (El.Residue query_char)
51✔
415

416
let set_intein_start_minus_one ~query_char ~raw_query_chars
417
    ~intein_start_minus_one =
418
  if is_gap query_char then
99✔
419
    (* If there is a gap in the query char then the intein start minus one
420
       cannot be determined. *)
421
    intein_start_minus_one := None
25✔
422
  else
423
    match !raw_query_chars with
74✔
424
    | minus_one :: _ ->
34✔
425
        (* If there exists a residue on the query before the current residue,
426
           that is the minus_one. *)
427
        intein_start_minus_one := Some (El.Residue minus_one)
428
    | [] ->
40✔
429
        (* If there are no previous residues on the query, then you know that
430
           the intein is at the very start of the query. So there is no
431
           minus-one. In full-length proteins, this cannot happen. But in
432
           metagenomic derived sequenecs you could get a fragment that happens
433
           to start at an intein. It should be a rare event though. *)
434
        (* Should already be none, but be explicit about it. *)
435
        intein_start_minus_one := None
436

437
let set_intein_end_plus_one ~query_char ~raw_query_chars ~intein_end_plus_one =
438
  if is_gap query_char then
99✔
439
    (* If there is a gap in the query char then the intein end plus one cannot
440
       be determined. *)
441
    intein_end_plus_one := None
48✔
442
  else
443
    match !raw_query_chars with
51✔
444
    | plus_one :: _ ->
35✔
445
        intein_end_plus_one := Some (El.Residue plus_one)
446
    | [] ->
16✔
447
        intein_end_plus_one := None
448

449
let track_non_gap_query_chars ~query_char ~raw_query_chars =
450
  if is_non_gap query_char then
17,323✔
451
    raw_query_chars := query_char :: !raw_query_chars
17,094✔
452

453
let left_to_right_checks_v ~intein_start ~intein_start_index
454
    ~intein_start_minus_one ~intein_start_to_the_right_of_hit_region_start =
455
  { intein_start_to_the_right_of_hit_region_start=
99✔
456
      !intein_start_to_the_right_of_hit_region_start
457
  ; intein_start= !intein_start
458
  ; intein_start_index= !intein_start_index
459
  ; intein_start_minus_one= !intein_start_minus_one }
460

461
let right_to_left_checks_v ~intein_end_to_the_left_of_hit_region_end
462
    ~intein_end_index ~intein_end ~intein_end_plus_one ~intein_penultimate =
463
  { intein_end_to_the_left_of_hit_region_end=
99✔
464
      !intein_end_to_the_left_of_hit_region_end
465
  ; intein_end_index= !intein_end_index
466
  ; intein_end= !intein_end
467
  ; intein_end_plus_one= !intein_end_plus_one
468
  ; intein_penultimate= !intein_penultimate }
469

470
let scan_left_to_right :
471
       aln_len:int
472
    -> query_seq:string
473
    -> intein_seq:string
474
    -> hit_region_start:C.one_raw
475
    -> left_to_right_checks =
476
 fun ~aln_len ~query_seq ~intein_seq ~hit_region_start ->
477
  (* This is what we are tracking in this function. *)
478
  let intein_start_to_the_right_of_hit_region_start = ref false in
99✔
479
  let intein_start = ref None in
480
  let intein_start_index = ref None in
481
  let intein_start_minus_one = ref None in
482
  let raw_query_idx = ref None in
483
  (* The hd of this list will be the previous raw residue. Use it to get the
484
     penultimate and the minus one. *)
485
  let raw_query_chars = ref [] in
486
  let iter_fun aln_idx =
487
    let query_char = string_get_aln query_seq aln_idx in
7,402✔
488
    let intein_char = string_get_aln intein_seq aln_idx in
7,402✔
489
    update_raw_query_idx
7,402✔
490
      ~query_char
491
      ~raw_query_idx
492
      ~start_idx:0
493
      ~index_updater:C.incr ;
494
    (* Note: we don't have to check if we're in the intein, because the first
495
       non_gap_char we see will cause a raise Exit at the end of this [if]. *)
496
    if is_non_gap intein_char then (
99✔
497
      check_if_intein_start_on_the_query_is_after_hit_region_start
498
        ~hit_region_start
499
        ~raw_query_idx
500
        ~intein_start_to_the_right_of_hit_region_start ;
501
      set_intein_start_index ~intein_start_index ~aln_idx ;
502
      set_intein_start_char ~intein_start ~query_char ;
503
      set_intein_start_minus_one
504
        ~query_char
505
        ~raw_query_chars
506
        ~intein_start_minus_one ;
507
      raise Exit ) ;
508
    track_non_gap_query_chars ~query_char ~raw_query_chars
7,303✔
509
  in
510
  let range = left_to_right_range aln_len in
511
  let () = try List.iter range ~f:iter_fun with Exit -> () in
×
512
  left_to_right_checks_v
513
    ~intein_start
514
    ~intein_start_index
515
    ~intein_start_minus_one
516
    ~intein_start_to_the_right_of_hit_region_start
517

518
let scan_right_to_left :
519
       aln_len:int
520
    -> query_seq:string
521
    -> intein_seq:string
522
    -> raw_query_length:int
523
    -> hit_region_end:C.one_raw
524
    -> right_to_left_checks =
525
 fun ~aln_len ~query_seq ~intein_seq ~raw_query_length ~hit_region_end ->
526
  let intein_end_to_the_left_of_hit_region_end = ref false in
99✔
527
  let intein_end = ref None in
528
  let intein_end_index = ref None in
529
  let intein_end_plus_one = ref None in
530
  let raw_query_idx = ref None in
531
  let intein_penultimate = ref None in
532
  (* The hd of this list will be the previous raw residue. Use it to get the
533
     penultimate and the minus one. *)
534
  let raw_query_chars = ref [] in
535
  (* Set this to true when we are read to look for penultimate residue. *)
536
  let find_the_penultimate = ref false in
537
  let f aln_idx =
538
    let query_char = string_get_aln query_seq aln_idx in
10,119✔
539
    let intein_char = string_get_aln intein_seq aln_idx in
10,119✔
540
    update_raw_query_idx
10,119✔
541
      ~query_char
542
      ~raw_query_idx
543
      ~start_idx:(raw_query_length - 1)
544
      ~index_updater:C.decr_exn ;
545
    if !find_the_penultimate && Option.is_none !intein_penultimate then
226✔
546
      if is_non_gap intein_char then (
99✔
547
        if is_gap query_char then intein_penultimate := Some El.Gap
48✔
548
        else intein_penultimate := Some (El.Residue query_char) ;
51✔
549
        raise Exit ) ;
550
    if is_non_gap intein_char then (
99✔
551
      check_if_intein_end_on_the_query_is_before_hit_region_end
552
        ~hit_region_end
553
        ~raw_query_idx
554
        ~intein_end_to_the_left_of_hit_region_end ;
555
      set_intein_end_index ~intein_end_index ~aln_idx ;
556
      find_the_penultimate := true ;
557
      set_intein_end_char ~intein_end ~query_char ;
558
      set_intein_end_plus_one ~query_char ~raw_query_chars ~intein_end_plus_one
559
      ) ;
560
    track_non_gap_query_chars ~query_char ~raw_query_chars
10,020✔
561
  in
562
  let range = right_to_left_range aln_len in
563
  let () = try List.iter range ~f with Exit -> () in
×
564
  right_to_left_checks_v
565
    ~intein_end_to_the_left_of_hit_region_end
566
    ~intein_end_index
567
    ~intein_end
568
    ~intein_end_plus_one
569
    ~intein_penultimate
570

571
(* You need to look at the alignment, and figure out all the interesting
572
   residues on the query seq as defined by the intein alignment. *)
573
let parse_aln_out :
574
       hit_region_start:C.one_raw
575
    -> hit_region_end:C.one_raw
576
    -> raw_query_length:int
577
    -> aln_out
578
    -> Intein_query_info.t =
579
 fun ~hit_region_start
580
     ~hit_region_end
581
     ~raw_query_length
582
     {query= Record.Query_aln query; intein= Record.Intein_aln intein; _} ->
583
  (* NOTE: for paper/docs...if the intein start/end is in a gap on the query,
584
     then then minus/plus one will have to be None, as we can't even place the
585
     start and end on the query. There are some tests in there about that. *)
586

587
  (* You know aln lengths are good because of assertions elsewhere. *)
588
  let aln_len = Record.seq_length query in
99✔
589
  let query_seq = Record.seq query in
99✔
590
  let intein_seq = Record.seq intein in
99✔
591
  (* Scan left to right. *)
592
  let { intein_start_to_the_right_of_hit_region_start
99✔
593
      ; intein_start
594
      ; intein_start_index
595
      ; intein_start_minus_one } =
596
    scan_left_to_right ~aln_len ~query_seq ~intein_seq ~hit_region_start
597
  in
598
  (* Scan right to left. *)
599
  let { intein_end_to_the_left_of_hit_region_end
600
      ; intein_end
601
      ; intein_end_index
602
      ; intein_end_plus_one
603
      ; intein_penultimate } =
604
    scan_right_to_left
605
      ~aln_len
606
      ~query_seq
607
      ~intein_seq
608
      ~raw_query_length
609
      ~hit_region_end
610
  in
611
  { intein_start_minus_one
612
  ; intein_start
613
  ; intein_penultimate
614
  ; intein_end
615
  ; intein_end_plus_one
616
  ; intein_start_index
617
  ; intein_end_index
618
  ; intein_start_to_the_right_of_hit_region_start
619
  ; intein_end_to_the_left_of_hit_region_end }
620

621
module Checks = struct
622
  (* Note: for docs/paper, we lose some info here as we treat None as a "gap".
623
     Gaps and none will both appear as -. *)
624

625
  module Position_check = struct
626
    [@@@coverage off]
627

628
    (* Positions in the region are None if the intein lined up with a gap in the
629
       query. *)
630
    type t =
631
      | Pass of C.Query_aln_to_raw.Value.t
632
      | Fail of C.Query_aln_to_raw.Value.t
633
      | Fail_none
634
    [@@deriving sexp_of, variants]
635

636
    [@@@coverage on]
637

638
    let pass t = is_pass t
85✔
639

640
    let to_tier_or_fail : t -> Tier.Tier_or_fail.t = function
641
      | Pass _ ->
57✔
642
          Tier Tier.t1
643
      | Fail _ | Fail_none ->
×
644
          Fail
645

646
    let to_string = function
647
      | Pass v ->
57✔
648
          let x = C.Query_aln_to_raw.Value.to_string v in
649
          [%string "Pass (%{x})"]
57✔
650
      | Fail v ->
71✔
651
          let x = C.Query_aln_to_raw.Value.to_string v in
652
          [%string "Fail (%{x})"]
71✔
653
      | Fail_none ->
×
654
          "Fail (None)"
655

656
    let check_intein_start_index :
657
           intein_start_index:C.zero_aln option
658
        -> intein_start_to_the_right_of_hit_region_start:bool
659
        -> aln_to_raw:C.Query_aln_to_raw.t
660
        -> t =
661
     fun ~intein_start_index
662
         ~intein_start_to_the_right_of_hit_region_start
663
         ~aln_to_raw ->
664
      match
64✔
665
        (intein_start_index, intein_start_to_the_right_of_hit_region_start)
666
      with
667
      | None, false | None, true ->
×
668
          Fail_none
669
      | Some aln_idx, false ->
31✔
670
          let raw_idx = C.Query_aln_to_raw.find_exn aln_to_raw aln_idx in
671
          Fail raw_idx
31✔
672
      | Some aln_idx, true ->
33✔
673
          (* NOTE: you can pass the region test but not be trimmable if the
674
             Value is between and not At. *)
675
          let raw_idx = C.Query_aln_to_raw.find_exn aln_to_raw aln_idx in
676
          Pass raw_idx
33✔
677

678
    let check_intein_end_index :
679
           intein_end_index:C.zero_aln option
680
        -> intein_end_to_the_left_of_hit_region_end:bool
681
        -> aln_to_raw:C.Query_aln_to_raw.t
682
        -> t =
683
     fun ~intein_end_index ~intein_end_to_the_left_of_hit_region_end ~aln_to_raw ->
684
      match (intein_end_index, intein_end_to_the_left_of_hit_region_end) with
64✔
685
      | None, false | None, true ->
×
686
          Fail_none
687
      | Some aln_idx, false ->
40✔
688
          let raw_idx = C.Query_aln_to_raw.find_exn aln_to_raw aln_idx in
689
          Fail raw_idx
40✔
690
      | Some aln_idx, true ->
24✔
691
          let raw_idx = C.Query_aln_to_raw.find_exn aln_to_raw aln_idx in
692
          Pass raw_idx
24✔
693
  end
694

695
  module Full_region_check = struct
696
    [@@@coverage off]
697

698
    type t = Pass | Start_pass | End_pass | Fail
699
    [@@deriving sexp_of, variants]
700

701
    [@@@coverage on]
702

703
    let pass t = is_pass t
42✔
704

705
    let to_string = Variants.to_name
706

707
    let check :
708
        start_position:Position_check.t -> end_position:Position_check.t -> t =
709
     fun ~start_position ~end_position ->
710
      match (start_position, end_position) with
64✔
711
      | Pass _, Pass _ ->
21✔
712
          Pass
713
      | Pass _, Fail _ | Pass _, Fail_none ->
×
714
          Start_pass
715
      | Fail _, Pass _ | Fail_none, Pass _ ->
×
716
          End_pass
717
      | Fail _, Fail _
28✔
718
      | Fail _, Fail_none
×
719
      | Fail_none, Fail _
×
720
      | Fail_none, Fail_none ->
×
721
          Fail
722

723
    let to_tier_or_fail : t -> Tier.Tier_or_fail.t = function
724
      | Pass ->
21✔
725
          Tier Tier.t1
726
      | Start_pass | End_pass | Fail ->
3✔
727
          Fail
728
  end
729

730
  (* TODO: add to hacking...should have t, should have check, should have pass,
731
     to_string, etc etc. *)
732
  module Start_residue_check = struct
733
    [@@@coverage off]
734

735
    type t = Pass of (Tier.t * char) | Fail of char
736
    [@@deriving sexp_of, variants]
737

738
    [@@@coverage on]
739

740
    let pass t = is_pass t
85✔
741

742
    let of_tier_or_fail : char -> Tier.Tier_or_fail.t -> t =
743
     fun c tof -> match tof with Tier t -> Pass (t, c) | Fail -> Fail c
12✔
744

745
    let to_tier_or_fail : t -> Tier.Tier_or_fail.t = function
746
      | Pass (tier, _) ->
46✔
747
          Tier tier
748
      | Fail _ ->
18✔
749
          Fail
750

751
    let check' c tier_map =
752
      of_tier_or_fail c @@ Tier.Map.find tier_map @@ String.of_char c
58✔
753

754
    let check : El.t option -> tier_map:Tier.Map.t -> t =
755
     fun el ~tier_map ->
756
      match el with
64✔
757
      | None | Some Gap ->
×
758
          Fail '-'
759
      | Some (Residue c) ->
58✔
760
          check' c tier_map
761

762
    let to_string = function
763
      | Pass (tier, residue) ->
46✔
764
          [%string "Pass (%{tier#Tier} %{residue#Char})"]
765
      | Fail v ->
18✔
766
          [%string "Fail (%{v#Char})"]
767
  end
768

769
  module End_residues_check = struct
770
    [@@@coverage off]
771

772
    type t = Pass of (Tier.t * string) | Fail of string
773
    [@@deriving sexp_of, variants]
774

775
    [@@@coverage on]
776

777
    let pass t = is_pass t
67✔
778

779
    let of_tier_or_fail : string -> Tier.Tier_or_fail.t -> t =
780
     fun s tof -> match tof with Tier t -> Pass (t, s) | Fail -> Fail s
25✔
781

782
    let to_tier_or_fail : t -> Tier.Tier_or_fail.t = function
783
      | Pass (tier, _) ->
25✔
784
          Tier tier
785
      | Fail _ ->
39✔
786
          Fail
787

788
    let check' s tier_map = of_tier_or_fail s @@ Tier.Map.find tier_map s
64✔
789

790
    let check :
791
        penultimate:El.t option -> end_:El.t option -> tier_map:Tier.Map.t -> t
792
        =
793
     fun ~penultimate ~end_ ~tier_map ->
794
      let el = El.concat_none_is_gap penultimate end_ in
64✔
795
      check' el tier_map
64✔
796

797
    let to_string = function
798
      | Pass (tier, residues) ->
25✔
799
          [%string "Pass (%{tier#Tier} %{residues})"]
800
      | Fail v ->
39✔
801
          [%string "Fail (%{v})"]
802
  end
803

804
  module End_plus_one_residue_check = struct
805
    [@@@coverage off]
806

807
    (** [Na] will happen if the end of the intein is at the end of the query
808
        alignment (or beyond the end). It will be NA in these cases because
809
        there is no C-terminal extein sequence that we can find, so there is no
810
        way to tell its first residue. *)
811
    type t = Pass of (Tier.t * char) | Fail of char | Na
812
    [@@deriving sexp_of, variants]
813

814
    [@@@coverage on]
815

816
    let pass = function Pass _ | Na -> true | Fail _ -> false
×
817

818
    let of_tier_or_fail : char -> Tier.Tier_or_fail.t -> t =
819
     fun c tof -> match tof with Tier t -> Pass (t, c) | Fail -> Fail c
8✔
820

821
    (* WARNING: a little counterintuitive, but the [Na] case goes to Tier 1
822
       pass. Generally this should only be used along with the lowest tier...so
823
       treating no data as the highest tier will not affect the calculation of
824
       the lowest tier. *)
825
    let to_tier_or_fail : t -> Tier.Tier_or_fail.t = function
826
      | Pass (tier, _) ->
14✔
827
          Tier tier
828
      | Fail _ ->
8✔
829
          Fail
830
      | Na ->
42✔
831
          Tier Tier.t1
832

833
    (* TODO: there is logic below to handle the NA condition...should it be
834
       moved here? *)
835

836
    let check' c tier_map =
837
      of_tier_or_fail c @@ Tier.Map.find tier_map @@ String.of_char c
22✔
838

839
    let check : El.t option -> tier_map:Tier.Map.t -> t =
840
     fun el ~tier_map ->
841
      match el with
22✔
842
      | None | Some Gap ->
×
843
          Fail '-'
844
      | Some (Residue c) ->
22✔
845
          check' c tier_map
846

847
    let to_string = function
848
      | Pass (tier, residue) ->
14✔
849
          [%string "Pass (%{tier#Tier} %{residue#Char})"]
850
      | Fail v ->
8✔
851
          [%string "Fail (%{v#Char})"]
852
      | Na ->
42✔
853
          "NA"
854
  end
855

856
  [@@@coverage off]
857

858
  (** These all refer to the intein on the query. *)
859
  type t =
860
    { intein_length: int option
861
    ; start_residue: Start_residue_check.t
862
    ; end_residues: End_residues_check.t
863
    ; end_plus_one_residue: End_plus_one_residue_check.t
864
    ; start_position: Position_check.t
865
    ; end_position: Position_check.t
866
    ; region: Full_region_check.t }
867
  [@@deriving sexp_of, fields]
868

869
  [@@@coverage on]
870

871
  (* Note that order of these fields defines the order of the fields fold
872
     to_string function below.*)
873

874
  let of_intein_query_info :
875
      Intein_query_info.t -> C.Query_aln_to_raw.t -> int -> config:Config.t -> t
876
      =
877
   fun { intein_start_minus_one= _
878
       ; intein_start
879
       ; intein_penultimate
880
       ; intein_end
881
       ; intein_end_plus_one
882
       ; intein_start_index
883
       ; intein_end_index
884
       ; intein_start_to_the_right_of_hit_region_start
885
       ; intein_end_to_the_left_of_hit_region_end }
886
       aln_to_raw
887
       raw_query_len
888
       ~config ->
889
    let intein_start_index_raw =
64✔
890
      let%map.Option i = intein_start_index in
891
      C.Query_aln_to_raw.find_exn aln_to_raw i
64✔
892
    in
893
    let intein_end_index_raw =
64✔
894
      let%map.Option i = intein_end_index in
895
      C.Query_aln_to_raw.find_exn aln_to_raw i
64✔
896
    in
897
    let intein_length =
64✔
898
      let%bind.Option start, end_ =
899
        Option.both intein_start_index_raw intein_end_index_raw
64✔
900
      in
901
      C.Query_aln_to_raw.Value.length ~start ~end_
64✔
902
    in
903
    let start_residue =
64✔
904
      Start_residue_check.check
905
        intein_start
906
        ~tier_map:config.checks.start_residue
907
    in
908
    let end_residues =
64✔
909
      End_residues_check.check
910
        ~penultimate:intein_penultimate
911
        ~end_:intein_end
912
        ~tier_map:config.checks.end_residues
913
    in
914
    let end_plus_one_residue =
915
      let check intein_end_plus_one =
916
        End_plus_one_residue_check.check
22✔
917
          intein_end_plus_one
918
          ~tier_map:config.checks.end_plus_one_residue
919
      in
920
      match intein_end_index_raw with
921
      | Some C.Query_aln_to_raw.Value.After ->
29✔
922
          End_plus_one_residue_check.Na
923
      | Some (C.Query_aln_to_raw.Value.At i) ->
35✔
924
          if C.(incr i |> to_int) = raw_query_len then
35✔
925
            End_plus_one_residue_check.Na
13✔
926
          else check intein_end_plus_one
22✔
927
      | _ ->
×
928
          check intein_end_plus_one
×
929
    in
930
    let start_position =
931
      Position_check.check_intein_start_index
932
        ~intein_start_index
933
        ~intein_start_to_the_right_of_hit_region_start
934
        ~aln_to_raw
935
    in
936
    let end_position =
937
      Position_check.check_intein_end_index
938
        ~intein_end_index
939
        ~intein_end_to_the_left_of_hit_region_end
940
        ~aln_to_raw
941
    in
942
    let region = Full_region_check.check ~start_position ~end_position in
943
    { intein_length
944
    ; start_residue
945
    ; end_residues
946
    ; end_plus_one_residue
947
    ; start_position
948
    ; end_position
949
    ; region }
950

951
  let pass : t -> bool =
952
   fun t ->
953
    let use pass _ _ v = pass v in
85✔
954
    let ignore _ _ _ = true in
85✔
955
    Fields.Direct.for_all
956
      t
957
      ~start_residue:(use Start_residue_check.pass)
85✔
958
      ~end_residues:(use End_residues_check.pass)
85✔
959
      ~end_plus_one_residue:(use End_plus_one_residue_check.pass)
85✔
960
      ~start_position:(use Position_check.pass)
85✔
961
      ~end_position:(use Position_check.pass)
85✔
962
      ~region:(use Full_region_check.pass)
85✔
963
      ~intein_length:ignore
964

965
  (* Fold the record down to Pass with tier or fail. *)
966
  let to_tier_or_fail_list t : Tier.Tier_or_fail.t list =
967
    let conv to_tier_or_fail acc _ _ v = to_tier_or_fail v :: acc in
64✔
968
    let ignore acc _ _ _ = acc in
64✔
969
    Fields.Direct.fold
970
      t
971
      ~init:[]
972
      ~start_residue:(conv Start_residue_check.to_tier_or_fail)
64✔
973
      ~end_residues:(conv End_residues_check.to_tier_or_fail)
64✔
974
      ~end_plus_one_residue:(conv End_plus_one_residue_check.to_tier_or_fail)
64✔
975
      ~start_position:(conv Position_check.to_tier_or_fail)
64✔
976
      ~end_position:(conv Position_check.to_tier_or_fail)
64✔
977
      ~region:(conv Full_region_check.to_tier_or_fail)
64✔
978
      ~intein_length:ignore
979

980
  (* TODO: rename *)
981
  let overall_pass_string : t -> string =
982
   fun t ->
983
    t
64✔
984
    |> to_tier_or_fail_list
64✔
985
    |> Tier.Tier_or_fail.worst_tier
64✔
986
    |> Option.value_exn ~message:"tier or fail list should never be empty"
64✔
987
    |> Tier.Tier_or_fail.to_string
988

989
  let to_string t =
990
    let conv to_s acc _ _ v = to_s v :: acc in
64✔
991
    let l =
992
      Fields.Direct.fold
993
        t
994
        ~init:[]
995
        ~intein_length:(conv Utils.string_of_int_option)
64✔
996
        ~start_residue:(conv Start_residue_check.to_string)
64✔
997
        ~end_residues:(conv End_residues_check.to_string)
64✔
998
        ~end_plus_one_residue:(conv End_plus_one_residue_check.to_string)
64✔
999
        ~start_position:(conv Position_check.to_string)
64✔
1000
        ~end_position:(conv Position_check.to_string)
64✔
1001
        ~region:(conv Full_region_check.to_string)
64✔
1002
    in
1003
    let l = overall_pass_string t :: l in
64✔
1004
    l |> List.rev |> String.concat ~sep:"\t"
64✔
1005

1006
  let to_string_header () =
1007
    [ "intein_length"
7✔
1008
    ; "start_residue_check"
1009
    ; "end_residues_check"
1010
    ; "end_plus_one_residue_check"
1011
    ; "start_position_check"
1012
    ; "end_position_check"
1013
    ; "region_check"
1014
    ; "overall_check" ]
1015
end
1016

1017
let uber_header () =
1018
  String.concat ~sep:"\t"
7✔
1019
  @@ Intein_query_info.to_string_with_info_header ()
7✔
1020
  @ Checks.to_string_header ()
7✔
1021

1022
module Mafft = struct
1023
  module Sh = Shexp_process
1024

1025
  type opts = {exe: string; in_file: string; out_file: string}
1026

1027
  let opts ~exe ~in_file ~out_file = {exe; in_file; out_file}
64✔
1028

1029
  type proc = {prog: string; args: string list}
1030

1031
  let proc {exe; in_file; _} =
1032
    let args = ["--quiet"; "--auto"; "--thread"; "1"; in_file] in
64✔
1033
    {prog= exe; args}
1034

1035
  (* Printable representation of a command run by Process.run *)
1036
  let cmd_to_string {prog; args; _} =
1037
    let args = String.concat args ~sep:" " in
×
1038
    [%string "%{prog} %{args}"]
×
1039

1040
  (** Returns the output file name if successful. *)
1041
  let run : opts:opts -> log_base:string -> string Async.Deferred.Or_error.t =
1042
   fun ~opts ~log_base ->
1043
    let open Async in
65✔
1044
    let in_file_check =
1045
      if Sys_unix.file_exists_exn opts.in_file then Deferred.Or_error.return ()
64✔
1046
      else
1047
        Deferred.Or_error.errorf
1✔
1048
          "expected file '%s' to exist, but it did not"
1049
          opts.in_file
1050
    in
1051
    let out_file_check =
1052
      if Sys_unix.file_exists_exn opts.out_file then
1053
        Deferred.Or_error.errorf
1✔
1054
          "expected file '%s' not to exist, but it did"
1055
          opts.out_file
1056
      else Deferred.Or_error.return ()
64✔
1057
    in
1058
    let%bind.Deferred.Or_error () =
1059
      Deferred.Or_error.all_unit [in_file_check; out_file_check]
65✔
1060
    in
1061
    let log = Utils.log_name ~log_base ~desc:"mafft" in
64✔
1062
    let ({prog; args} as proc) = proc opts in
1063
    match%bind Process.run ~prog ~args () with
64✔
1064
    | Ok stdout ->
64✔
1065
        let%bind _ =
1066
          Writer.with_file opts.out_file ~f:(fun writer ->
64✔
1067
              Deferred.Or_error.return
64✔
1068
              @@ Writer.write_line writer
64✔
1069
              @@ String.strip stdout )
64✔
1070
        in
1071
        Deferred.Or_error.return opts.out_file
64✔
1072
    | Error e ->
×
1073
        Writer.with_file log ~f:(fun writer ->
1074
            let cmd = cmd_to_string proc in
×
1075
            let msg =
×
1076
              [%string "There was a problem running the following command"]
1077
            in
1078
            Writer.write_line writer msg ;
1079
            Writer.write_line writer cmd ;
×
1080
            Writer.write_line writer "ERROR:" ;
×
1081
            Writer.write_line writer (Error.to_string_hum e) ;
×
1082
            let%bind () = Writer.flushed writer in
×
1083
            Deferred.Or_error.error_string (Error.to_string_hum e) )
×
1084
end
1085

1086
let aln_io_file_names ~aln_dir ~query_name ~intein_name
1087
    ~(region_index : Zero_indexed_int.t) ~hit_index =
1088
  let basename in_or_out =
64✔
1089
    sprintf
128✔
1090
      "mafft_%s___%s___%s___%d___%d.fa"
1091
      in_or_out
1092
      query_name
1093
      intein_name
1094
      (Zero_indexed_int.to_one_indexed_int region_index)
128✔
1095
      (* TODO: indexing *)
1096
      (hit_index + 1)
1097
  in
1098
  let aln_in_file = aln_dir ^/ basename "in" in
64✔
1099
  let aln_out_file = aln_dir ^/ basename "out" in
64✔
1100
  (aln_in_file, aln_out_file)
64✔
1101

1102
module Trim_inteins = struct
1103
  let should_trim alignment_checks = Checks.pass alignment_checks
21✔
1104

1105
  let trim_intein :
1106
         alignment_checks:Checks.t
1107
      -> raw_query:Record.query_raw
1108
      -> (string * string) option =
1109
   fun ~alignment_checks ~raw_query:(Record.Query_raw raw_query) ->
1110
    if should_trim alignment_checks then
21✔
1111
      match
21✔
1112
        (alignment_checks.start_position, alignment_checks.end_position)
1113
      with
1114
      | Pass (At start_coord as start), Pass (At end_coord as end_) ->
21✔
1115
          let%map.Option len =
1116
            Coord.Query_aln_to_raw.Value.length ~start ~end_
1117
          in
1118
          let intein =
21✔
1119
            String.sub
1120
              (Bio_io.Fasta.Record.seq raw_query)
21✔
1121
              ~pos:(Coord.to_zero_indexed_int start_coord)
21✔
1122
              ~len
1123
          in
1124
          let region_string =
21✔
1125
            let start = Coord.to_one_indexed_string start_coord in
1126
            let end_ = Coord.to_one_indexed_string end_coord in
21✔
1127
            [%string "start_%{start}___end_%{end_}"]
21✔
1128
          in
1129
          (intein, region_string)
1130
      | _ ->
×
1131
          None
1132
    else None
×
1133

1134
  let trim_and_write_intein :
1135
         alignment_checks:Checks.t
1136
      -> raw_query:Record.query_raw
1137
      -> query_name:string
1138
      -> region_index:Zero_indexed_int.t
1139
      -> writer:Async.Writer.t
1140
      -> unit Async.Deferred.t =
1141
   fun ~alignment_checks
1142
       ~raw_query:(Record.Query_raw _ as raw_query)
1143
       ~query_name
1144
       ~region_index
1145
       ~writer ->
1146
    match trim_intein ~alignment_checks ~raw_query with
21✔
1147
    | None ->
×
1148
        Async.Deferred.return ()
1149
    | Some (intein, region) ->
21✔
1150
        let region_index =
1151
          Zero_indexed_int.to_one_indexed_string region_index
1152
        in
1153
        let region_index = [%string "region_%{region_index}"] in
21✔
1154
        let id = [%string ">%{query_name}___%{region_index}___%{region}\n"] in
21✔
1155
        let record = [%string "%{id}%{intein}"] in
21✔
1156
        Async.Writer.write_line writer record ;
21✔
1157
        Async.Writer.flushed writer
21✔
1158
end
1159

1160
(** Writes the alignment checks, and if necessary the trimmed intein. *)
1161
let write_aln_checks aln_out_file ~hit_region ~raw_query_length ~raw_query
1162
    ~query_new_name_to_old_name ~query_name ~intein_name
1163
    ~(region_index : Zero_indexed_int.t) ~intein_checks_writer
1164
    ~should_remove_aln_files ~trimmed_inteins_writer ~config =
1165
  let aln_out = read_aln_out_file aln_out_file in
64✔
1166
  let intein_query_info : Intein_query_info.t =
64✔
1167
    parse_aln_out
64✔
1168
      ~hit_region_start:hit_region.Region.start
1169
      ~hit_region_end:hit_region.end_
1170
      ~raw_query_length
1171
      aln_out
1172
  in
1173
  let aln_to_raw = aln_to_raw aln_out.query in
1174
  let checks =
64✔
1175
    Checks.of_intein_query_info
1176
      intein_query_info
1177
      aln_to_raw
1178
      raw_query_length
1179
      ~config
1180
  in
1181
  let query_name = Map.find_exn query_new_name_to_old_name query_name in
64✔
1182
  let first =
64✔
1183
    Intein_query_info.to_string_with_info
1184
      intein_query_info
1185
      ~query_name
1186
      ~intein_name
1187
      ~region_index
1188
  in
1189
  let second = Checks.to_string checks in
64✔
1190
  Async.Writer.write_line intein_checks_writer [%string "%{first}\t%{second}"] ;
64✔
1191
  let%bind.Async.Deferred () = Async.Writer.flushed intein_checks_writer in
64✔
1192
  let%bind.Async.Deferred () =
1193
    if should_remove_aln_files then Async_unix.Sys.remove aln_out_file
25✔
1194
    else Async.Deferred.return ()
39✔
1195
  in
1196
  if Checks.pass checks then
64✔
1197
    let%map.Async.Deferred () =
1198
      Trim_inteins.trim_and_write_intein
1199
        ~alignment_checks:checks
1200
        ~raw_query
1201
        ~query_name
1202
        ~region_index
1203
        ~writer:trimmed_inteins_writer
1204
    in
1205
    raise Exit
21✔
1206
  else Async.Deferred.return ()
43✔
1207

1208
(* Note: some of these values I can calulate from others, but they are also
1209
   known from the call site. Clean up at some point. *)
1210
let run_alignment_and_write_checks ~aln_dir ~(region_index : Zero_indexed_int.t)
1211
    ~hit_index ~query_name ~query_seq:(Record.Query_raw query_seq' as query_seq)
1212
    ~intein_seq:(Record.Intein_raw intein_record as intein_seq) ~log_base
1213
    ~clip_region_padding ~region ~query_new_name_to_old_name
1214
    ~intein_checks_writer ~trimmed_inteins_writer ~should_remove_aln_files
1215
    ~config =
1216
  let intein_name intein_seq' = Record.id intein_seq' in
64✔
1217
  let query_len = Record.seq_length query_seq' in
1218
  (* Function to actually run the alignment. *)
1219
  let run_alignment () =
64✔
1220
    let clipping_region =
64✔
1221
      Region.clip_region region ~padding:clip_region_padding ~query_len ()
1222
    in
1223
    let clipped_query_seq = clip_sequence query_seq clipping_region in
64✔
1224
    let aln_in_file, aln_out_file =
64✔
1225
      aln_io_file_names
1226
        ~aln_dir
1227
        ~query_name
1228
        ~intein_name:(intein_name intein_record)
64✔
1229
        ~region_index
1230
        ~hit_index
1231
    in
1232
    let () =
1233
      write_aln_in_file
1234
        ~query:query_seq
1235
        ~clipped_query:clipped_query_seq
1236
        ~intein:intein_seq
1237
        ~file_name:aln_in_file
1238
    in
1239
    let%bind.Async.Deferred mafft_out =
1240
      Mafft.run
1241
        ~opts:
1242
          (Mafft.opts ~exe:"mafft" ~in_file:aln_in_file ~out_file:aln_out_file)
1243
        ~log_base
1244
    in
1245
    (* We always remove the aln_in file, even if the user wants to keep the
1246
       intermediate aln files. *)
1247
    let%bind.Async.Deferred aln_file_exists =
1248
      Async_unix.Sys.file_exists_exn aln_in_file
64✔
1249
    in
1250
    let%map.Async.Deferred () =
1251
      if aln_file_exists then Async_unix.Sys.remove aln_in_file
64✔
1252
      else Async.Deferred.return ()
×
1253
    in
1254
    mafft_out
64✔
1255
  in
1256
  let write_aln_checks =
1257
    write_aln_checks
1258
      ~hit_region:region
1259
      ~raw_query_length:query_len
1260
      ~query_new_name_to_old_name
1261
      ~query_name
1262
      ~intein_name:(intein_name intein_record)
64✔
1263
      ~region_index
1264
      ~intein_checks_writer
1265
      ~should_remove_aln_files
1266
      ~raw_query:query_seq
1267
      ~trimmed_inteins_writer
1268
      ~config
1269
  in
1270
  Utils.iter_if_ok (run_alignment ()) ~f:write_aln_checks
64✔
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2025 Coveralls, Inc