• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

Ensembl / ensembl / 4008

pending completion
4008

Pull #655

travis-ci-com

Tamara El Naboulsi
Remove commented out code
Pull Request #655: Maintaining manually inserted xrefs

32794 of 39828 relevant lines covered (82.34%)

818.92 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

80.62
/modules/Bio/EnsEMBL/Utils/SeqDumper.pm
1
=head1 LICENSE
2

3
Copyright [1999-2015] Wellcome Trust Sanger Institute and the EMBL-European Bioinformatics Institute
4
Copyright [2016-2023] EMBL-European Bioinformatics Institute
5

6
Licensed under the Apache License, Version 2.0 (the "License");
7
you may not use this file except in compliance with the License.
8
You may obtain a copy of the License at
9

10
     http://www.apache.org/licenses/LICENSE-2.0
11

12
Unless required by applicable law or agreed to in writing, software
13
distributed under the License is distributed on an "AS IS" BASIS,
14
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15
See the License for the specific language governing permissions and
16
limitations under the License.
17

18
=cut
19

20

21
=head1 CONTACT
22

23
  Please email comments or questions to the public Ensembl
24
  developers list at <http://lists.ensembl.org/mailman/listinfo/dev>.
25

26
  Questions may also be sent to the Ensembl help desk at
27
  <http://www.ensembl.org/Help/Contact>.
28

29
=cut
30

31
=head1 NAME
32

33
Bio::EnsEMBL::Utils::SeqDumper
34

35
=head1 SYNOPSIS
36

37
  $seq_dumper = Bio::EnsEMBL::Utils::SeqDumper->new();
38

39
  # don't dump snps or repeats
40
  $seq_dumper->disable_feature_type('repeat');
41
  $seq_dumper->disable_feature_type('variation');
42

43
  # dump EMBL format to STDOUT
44
  $seq_dumper->dump( $slice, 'EMBL' );
45

46
  # dump GENBANK format to a file
47
  $seq_dumper->dump( $slice, 'GENBANK', 'out.genbank' );
48

49
  # dump FASTA format to a file
50
  $seq_dumper->dump( $slice, 'FASTA', 'out.fasta' );
51

52
=head1 DESCRIPTION
53

54
A relatively simple and lite-weight flat file dumper for Ensembl slices.
55
The memory efficiency could be improved and this is currently not very
56
good for dumping very large sequences such as whole chromosomes.
57

58
=head1 METHODS
59

60
=cut
61

62

63
package Bio::EnsEMBL::Utils::SeqDumper;
64

65
use strict;
3✔
66

67
use IO::File;
3✔
68
use Fcntl qw( SEEK_SET SEEK_END );
3✔
69
use vars qw(@ISA);
3✔
70

71
use Bio::EnsEMBL::Utils::Exception qw(throw warning);
3✔
72

73
#keys must be uppercase
74
my $DUMP_HANDLERS = 
75
  { 'FASTA'     => \&dump_fasta,
76
    'EMBL'      => \&dump_embl,
77
    'GENBANK'   => \&dump_genbank };
78

79
my @COMMENTS = 
80
  ('This sequence was annotated by ###SOURCE###. Please visit ' .
81
   'the Ensembl or EnsemblGenomes web site, http://www.ensembl.org/ or http://www.ensemblgenomes.org/ for more information.',
82

83
   'All feature locations are relative to the first (5\') base ' .
84
   'of the sequence in this file.  The sequence presented is '.
85
   'always the forward strand of the assembly. Features ' .
86
   'that lie outside of the sequence contained in this file ' .
87
   'have clonal location coordinates in the format: ' .
88
   '<clone accession>.<version>:<start>..<end>',
89

90
   'The /gene indicates a unique id for a gene, /note="transcript_id=..."' . 
91
   ' a unique id for a transcript, /protein_id a unique id for a peptide ' .
92
   'and note="exon_id=..." a unique id for an exon. These ids are ' .
93
   'maintained wherever possible between versions.',
94

95
   'All the exons and transcripts in Ensembl are confirmed by ' .
96
   'similarity to either protein or cDNA sequences.');
97

98
=head2 new
99

100
  Arg [1]    : none
101
  Example    : $seq_dumper = Bio::EnsEMBL::Utils::SeqDumper->new;
102
  Description: Creates a new SeqDumper 
103
  Returntype : Bio::EnsEMBL::Utils::SeqDumper
104
  Exceptions : none
105
  Caller     : general
106

107
=cut
108

109
sub new {
110
  my ($caller, $slice, $params) = @_;
×
111

112
  my $class = ref($caller) || $caller;
×
113

114
  my $feature_types = {'gene'        => 1,
×
115
                       'genscan'     => 1,
116
                       'repeat'      => 1,
117
                       'similarity'  => 1,
118
                       'variation'   => 1,
119
                       'contig'      => 1,
120
                       'marker'      => 1,
121
                       'estgene'     => 0,
122
                       'vegagene'    => 0};
123

124
  my $self = bless {'feature_types' => $feature_types}, $class;
×
125

126
  foreach my $p (sort keys %{$params || {}}) {
×
127
      $self->{$p} = $params->{$p};
×
128
  }
129

130
  # default 60kb buffer 
131
  exists $self->{'chunk_factor'} and defined $self->{'chunk_factor'}
132
    or $self->{'chunk_factor'} = 1000;
×
133

134
  return $self;
×
135
}
136

137

138

139
=head2 enable_feature_type
140

141
  Arg [1]    : string $type
142
  Example    : $seq_dumper->enable_feature_type('similarity');
143
  Description: Enables the dumping of a specific type of feature
144
  Returntype : none
145
  Exceptions : warn if invalid feature type is passed,
146
               thrown if no feature type is passed
147
  Caller     : general
148

149
=cut
150

151
sub enable_feature_type {
152
  my ($self, $type) = @_;
3✔
153

154
  $type || throw("type arg is required");
3✔
155

156
  if(exists($self->{'feature_types'}->{$type})) {
3✔
157
    $self->{'feature_types'}->{$type} = 1;
3✔
158
  } else {
159
    warning("unknown feature type '$type'\n" .
160
          "valid types are: " . join(',', keys %{$self->{'feature_types'}})); 
3✔
161
  }
162
}
163

164

165

166
=head2 attach_database
167

168
  Arg [1]    : string name
169
  Arg [2]    : Bio::EnsEMBL::DBSQL::DBAdaptor
170
  Example    : $seq_dumper->attach_database('estgene', $estgene_db);
171
  Description: Attaches a database to the seqdumper that can be used to 
172
               dump data which is external to the ensembl core database.
173
               Currently this is necessary to dump est genes and vega genes
174
  Returntype : none
175
  Exceptions : thrown if incorrect argument is supplied
176
  Caller     : general
177

178
=cut
179

180
sub attach_database {
181
  my ($self, $name, $db) = @_;
×
182

183
  $name || throw("name arg is required");
3✔
184
  unless($db && ref($db) && $db->isa('Bio::EnsEMBL::DBSQL::DBAdaptor')) {
3✔
185
    throw("db arg must be a Bio::EnsEMBL::DBSQL::DBAdaptor not a [$db]");
×
186
  }
187

188
  $self->{'attached_dbs'}->{$name} = $db;
×
189
}
190

191

192

193
=head2 get_database
194

195
  Arg [1]    : string $name
196
  Example    : $db = $seq_dumper->get_database('vega');
197
  Description: Retrieves a database that has been attached to the 
198
               seqdumper via the attach database call.
199
  Returntype : Bio::EnsEMBL::DBSQL::DBAdaptor
200
  Exceptions : thrown if incorrect argument is supplied
201
  Caller     : dump_feature_table
202

203
=cut
204

205
sub get_database {
206
  my ($self, $name) = @_;
×
207

208
  $name || throw("name arg is required");
×
209
  
210
  return $self->{'attached_dbs'}->{$name};
×
211
}
212

213

214

215
=head2 remove_database
216

217
  Arg [1]    : string $name 
218
  Example    : $db = $seq_dumper->remove_database('estgene');
219
  Description: Removes a database that has been attached to the seqdumper
220
               via the attach database call.  The database that is removed
221
               is returned (or undef if it did not exist).
222
  Returntype : Bio::EnsEMBL::DBSQL::DBAdaptor
223
  Exceptions : thrown if incorrect argument is supplied
224
  Caller     : general
225

226
=cut
227

228
sub remove_database {
229
  my ($self, $name) = @_;
×
230

231
  $name || throw("name arg is required");
×
232

233
  if(exists $self->{'attached_dbs'}->{$name}) {
×
234
    return delete $self->{'attached_dbs'}->{$name};
×
235
  }
236

237
  return undef;
×
238
}
239

240

241
=head2 disable_feature_type
242

243
  Arg [1]    : string $type
244
  Example    : $seq_dumper->disable_feature_type('genes');
245
  Description: Disables the dumping of a specific type of feature
246
  Returntype : none
247
  Exceptions : warn if an invalid feature type is passed,
248
               thrown if no feature type is passed
249
  Caller     : general
250

251
=cut
252

253
sub disable_feature_type {
254
  my ($self, $type) = @_;
×
255
  
256
  $type || throw("type arg is required");
×
257

258
  if(exists($self->{'feature_types'}->{$type})) {
×
259
    $self->{'feature_types'}->{$type} = 0;
×
260
  } else {
261
    warning("unknown feature type '$type'\n" .
262
            "valid types are: " . join(',', keys %{$self->{'feature_types'}}));
×
263
  }
264
}
265

266

267

268
=head2 is_enabled
269

270
  Arg [1]    : string $type 
271
  Example    : do_something() if($seq_dumper->is_enabled('gene'));
272
  Description: checks if a specific feature type is enabled
273
  Returntype : none
274
  Exceptions : warning if invalid type is passed, 
275
               thrown if no type is passed 
276
  Caller     : general
277

278
=cut
279

280
sub is_enabled {
281
  my ($self, $type) = @_;
73✔
282

283
  $type || throw("type arg is required");
73✔
284

285
  if(exists($self->{'feature_types'}->{$type})) {
73✔
286
    return $self->{'feature_types'}->{$type};
73✔
287
  } else {
288
    warning("unknown feature type '$type'\n" .
289
           "valid types are: " . join(',', keys %{$self->{'feature_types'}}));
×
290
  }
291
}
292

293

294
=head2 dump
295

296
  Arg [1]    : Bio::EnsEMBL::Slice slice
297
               The slice to dump
298
  Arg [2]    : string $format
299
               The name of the format to dump
300
  Arg [3]    : (optional) $outfile
301
               The name of the file to dump to. If no file is specified STDOUT
302
               is used
303
  Arg [4]    : (optional) $seq
304
               Sequence to dump
305
  Arg [4]    : (optional) $no_append
306
               Default action is to open the file in append mode. This will
307
               turn that mode off
308
  Example    : $seq_dumper->dump($slice, 'EMBL');
309
  Description: Dumps a region of a genome specified by the slice argument into
310
               an outfile of the format $format
311
  Returntype : none
312
  Exceptions : thrown if slice or format args are not supplied
313
  Caller     : general
314

315
=cut
316

317

318
sub dump {
319
  my ($self, $slice, $format, $outfile, $seq, $no_append) = @_;
15✔
320

321
  $format || throw("format arg is required");
15✔
322
  $slice  || throw("slice arg is required");
15✔
323

324
  my $dump_handler = $DUMP_HANDLERS->{uc($format)};
15✔
325

326
  unless($dump_handler) {
6✔
327
    throw("No dump handler is defined for format $format\n");
×
328
  }
329

330

331
  my $FH = IO::File->new;;
6✔
332
  if($outfile) {
6✔
333
    my $mode = ($no_append) ? '>' : '>>';
6✔
334
    $FH->open("${mode}${outfile}") or throw("Could not open file $outfile: $!");
6✔
335
  } else {
336
    $FH = \*STDOUT;
×
337
    #mod_perl did not like the following
338
    #$FH->fdopen(fileno(STDOUT), "w") 
339
    #  or throw("Could not open currently selected output filehandle " .
340
    #         "for writing");
341
  }
342
  
343
  &$dump_handler($self, $slice, $FH, $seq);
6✔
344

345
  $FH->close if ($outfile); #close if we were writing to a file
6✔
346
}
347

348
=head2 dump_embl
349

350
  Arg [1]    : Bio::EnsEMBL::Slice
351
  Arg [2]    : IO::File $FH
352
  Arg [3]    : optional sequence string
353
  Example    : $seq_dumper->dump_embl($slice, $FH);
354
  Description: Dumps an EMBL flat file to an open file handle
355
  Returntype : none
356
  Exceptions : if calls to get/set the filehandle position fail
357
  Caller     : dump
358

359
=cut
360

361
sub dump_embl {
362
  my $self = shift;
5✔
363
  my $slice = shift;
5✔
364
  my $FH   = shift;
5✔
365

366
  my $len = $slice->length;
5✔
367

368
  my $version;
5✔
369
  my $acc;
370

371
  my $cs = $slice->coord_system();
5✔
372
  my $name_str = $cs->name() . ' ' . $slice->seq_region_name();
5✔
373
  $name_str .= ' ' . $cs->version if($cs->version);
5✔
374

375
  my $start = $slice->start;
5✔
376
  my $end   = $slice->end;
5✔
377

378
  #determine if this slice is the entire seq region
379
  #if it is then we just use the name as the id
380
  my $slice_adaptor = $slice->adaptor();
5✔
381
  my $full_slice =
5✔
382
    $slice->adaptor->fetch_by_region($cs->name,
383
                                    $slice->seq_region_name,
384
                                    undef,undef,undef,
385
                                    $cs->version);
386

387
  my $entry_name = $slice->seq_region_name();
5✔
388

389
  if($full_slice->name eq $slice->name) {
5✔
390
    $name_str .= ' full sequence';
1✔
391
    $acc = $slice->seq_region_name();
1✔
392
    my @acc_ver = split(/\./, $acc);
1✔
393
    if(@acc_ver == 2) {
1✔
394
      $acc = $acc_ver[0];
×
395
      $version = $acc_ver[0] . '.'. $acc_ver[1];
×
396
    } elsif(@acc_ver == 1 && $cs->version()) {
397
      $version = $acc . '.'. $cs->version();
×
398
    } else {
399
      $version = $acc;
1✔
400
    }
401
  } else {
402
    $name_str .= ' partial sequence';
4✔
403
    $acc = $slice->name();
4✔
404
    $version = $acc;
4✔
405
  }
406

407
  $acc = $slice->name();
5✔
408

409
  #line breaks are allowed near the end of the line on ' ', "\t", "\n", ',' 
410
  $: = (" \t\n-,");
5✔
411

412
  #############
413
  # dump header
414
  #############
415

416
  # move at the end of file in case the file
417
  # is open more than once (i.e. human chromosome Y, 
418
  # two chromosome slices
419
  #
420
  # WARNING
421
  #
422
  # When the file is open not for the first time,
423
  # it must be in read/write mode
424
  #
425
  seek($FH, 0, SEEK_END);
5✔
426

427
  my $EMBL_HEADER = 
5✔
428
'@<   ^<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<~
429
';
430

431
  #ID and moltype
432
  # HTG = High Throughput Genome division, probably most suitable
433
  #       and it would be hard to come up with another appropriate division
434
  #       that worked for all organisms (e.g. plants are in PLN but human is
435
  #       in HUM).
436
  my $VALUE = "$entry_name    standard; DNA; HTG; $len BP.";
5✔
437
  $self->write($FH, $EMBL_HEADER, 'ID', $VALUE);  
5✔
438
  $self->print( $FH, "XX\n" );
5✔
439

440
  #Accession
441
  $self->write($FH, $EMBL_HEADER, 'AC', $acc);
5✔
442
  $self->print( $FH, "XX\n" );
5✔
443

444
  #Version
445
  $self->write($FH, $EMBL_HEADER, 'SV', $version);
5✔
446
  $self->print( $FH, "XX\n" );
5✔
447

448
  #Date
449
  $self->write($FH, $EMBL_HEADER, 'DT', $self->_date_string);
5✔
450
  $self->print( $FH, "XX\n" );
5✔
451

452
  my $meta_container = $slice->adaptor()->db()->get_MetaContainer();
5✔
453
  
454
  #Description
455
  $self->write($FH, $EMBL_HEADER, 'DE', $meta_container->get_scientific_name() .
5✔
456
               " $name_str $start..$end annotated by Ensembl");
457
  $self->print( $FH, "XX\n" );
5✔
458

459
  #key words
460
  $self->write($FH, $EMBL_HEADER, 'KW', '.');
5✔
461
  $self->print( $FH, "XX\n" );
5✔
462

463
  #Species
464
  my $species_name = $meta_container->get_scientific_name();
5✔
465
  if(my $cn = $meta_container->get_common_name()) {
5✔
466
    $species_name .= " ($cn)";
5✔
467
  }
468

469
  $self->write($FH, $EMBL_HEADER, 'OS', $species_name);
5✔
470

471
  #Classification
472
  my @cls = @{$meta_container->get_classification()};
5✔
473
  $self->write($FH, $EMBL_HEADER, 'OC', join('; ', reverse(@cls)) . '.');
5✔
474
  $self->print( $FH, "XX\n" );
5✔
475
  
476
  #References (we are not dumping refereneces)
477
  #Database References (we are not dumping these)
478

479
  #comments
480
  my $annotation_source = $self->annotation_source($meta_container);
5✔
481
  foreach my $comment (@COMMENTS) {
5✔
482
    $comment =~ s/\#\#\#SOURCE\#\#\#/$annotation_source/;
20✔
483
    $self->write($FH, $EMBL_HEADER, 'CC', $comment);
20✔
484
    $self->print( $FH, "XX\n" );
20✔
485
  }
486

487
  ####################
488
  # DUMP FEATURE TABLE
489
  ####################
490
  $self->print( $FH, "FH   Key             Location/Qualifiers\n" );
5✔
491

492
  my $FEATURE_TABLE = 
5✔
493
'FT   ^<<<<<<<<<<<<<<<^<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<~
494
';
495
  $self->_dump_feature_table($slice, $FH, $FEATURE_TABLE);  
5✔
496

497
  #write an XX after the feature tables
498
  $self->print( $FH, "XX\n" );
5✔
499

500
  ###################
501
  # DUMP SEQUENCE
502
  ###################
503

504
  # record position before writing sequence header, so that 
505
  # after printing the sequence and having the base counts
506
  # we can seek to this position and write the proper sequence 
507
  # header
508
  my $sq_offset = tell($FH);
5✔
509
  $sq_offset == -1 and throw "Unable to get offset for output fh";
5✔
510

511
  # print a sequence header template, to be replaced with a real
512
  # one containing the base counts
513
  $self->print($FH, "SQ   Sequence ########## BP; ########## A; ########## C; ########## G; ########## T; ########## other;\n");
5✔
514
  
515
  # dump the sequence and get the base counts
516
  my $acgt = $self->write_embl_seq($slice, $FH);
5✔
517
  
518
  # print the end of EMBL entry
519
  $self->print( $FH, "//\n" );
5✔
520
  my $end_of_entry_offset = tell($FH);
5✔
521
  $end_of_entry_offset == -1 and throw "Unable to get offset for output fh";
5✔
522

523
  # seek backwards to the position of the sequence header and 
524
  # write it with the actual base counts
525
  seek($FH, $sq_offset, SEEK_SET) 
5✔
526
    or throw "Cannot seek backward to sequence header position";
527
  $self->print($FH, sprintf "SQ   Sequence %10d BP; %10d A; %10d C; %10d G; %10d T; %10d other;", 
528
               $acgt->{tot}, $acgt->{a}, $acgt->{c}, $acgt->{g}, $acgt->{t}, $acgt->{n});
5✔
529

530
  # move forward to end of file to dump the next slice
531
  seek($FH, $end_of_entry_offset, SEEK_SET) 
5✔
532
    or throw "Cannot seek forward to end of entry";
533

534
  # Set formatting back to normal
535
  $: = " \n-";
5✔
536
}
537

538
=head2 dump_genbank
539

540
  Arg [1]    : Bio::EnsEMBL::Slice
541
  Arg [2]    : IO::File $FH
542
  Example    : $seq_dumper->dump_genbank($slice, $FH);
543
  Description: Dumps a GENBANK flat file to an open file handle
544
  Returntype : none
545
  Exceptions : none
546
  Caller     : dump
547

548
=cut
549

550
sub dump_genbank {
551
  my ($self, $slice, $FH) = @_;
1✔
552

553
  #line breaks are allowed near the end of the line on ' ', "\t", "\n", ',' 
554
  $: = " \t\n-,";
1✔
555

556
  my $GENBANK_HEADER = 
1✔
557
'^<<<<<<<<<  ^<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<~~
558
';
559

560
  my $GENBANK_SUBHEADER =
1✔
561
'  ^<<<<<<<  ^<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<~~
562
';
563

564
  my $GENBANK_FT =
1✔
565
'     ^<<<<<<<<<<<<<< ^<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<~~
566
';
567

568
  my $version;
1✔
569
  my $acc;
570

571
  my $cs = $slice->coord_system();
1✔
572

573
  my $name_str = $cs->name() . ' ' . $slice->seq_region_name();
1✔
574

575
  $name_str .= ' ' . $cs->version if($cs->version);
1✔
576

577
  #determine if this slice is the entire seq region
578
  #if it is then we just use the name as the id
579
  my $slice_adaptor = $slice->adaptor();
1✔
580
  my $full_slice =
1✔
581
    $slice->adaptor->fetch_by_region($cs->name,
582
                                    $slice->seq_region_name,
583
                                    undef,undef,undef,
584
                                    $cs->version);
585

586

587
  my $entry_name = $slice->seq_region_name();
1✔
588

589
  if($full_slice->name eq $slice->name) {
1✔
590
    $name_str .= ' full sequence';
×
591
    $acc = $slice->seq_region_name();
×
592
    my @acc_ver = split(/\./, $acc);
×
593
    if(@acc_ver == 2) {
×
594
      $acc = $acc_ver[0];
×
595
      $version = $acc_ver[0] . $acc_ver[1];
×
596
    } elsif(@acc_ver == 1 && $cs->version()) {
597
      $version = $acc . $cs->version();
×
598
    } else {
599
      $version = $acc;
×
600
    }
601
  } else {
602
    $name_str .= ' partial sequence';
1✔
603
    $acc = $slice->name();
1✔
604
    $version = $acc;
1✔
605
  }
606

607
  $acc = $slice->name();     # to keep format consistent for all
1✔
608

609
  my $length = $slice->length;
1✔
610
  my $start = $slice->start();
1✔
611
  my $end   = $slice->end();
1✔
612

613
  my $date = $self->_date_string;
1✔
614

615
  my $meta_container = $slice->adaptor()->db()->get_MetaContainer();
1✔
616

617
  # move at the end of file in case the file
618
  # is open more than once (i.e. human chromosome Y, 
619
  # two chromosome slices
620
  #
621
  # WARNING
622
  #
623
  # When the file is open not for the first time,
624
  # it must be in read/write mode
625
  #
626
  seek($FH, 0, SEEK_END);
1✔
627

628
  #LOCUS
629
  my $tag   = 'LOCUS';
1✔
630
  my $value = "$entry_name $length bp DNA HTG $date";
1✔
631
  $self->write($FH, $GENBANK_HEADER, $tag, $value);
1✔
632

633
  #DEFINITION
634
  $tag   = "DEFINITION";
1✔
635
  $value = $meta_container->get_scientific_name() . 
1✔
636
    " $name_str $start..$end reannotated via EnsEMBL";
637
  $self->write($FH, $GENBANK_HEADER, $tag, $value);
1✔
638

639
  #ACCESSION
640
  $self->write($FH, $GENBANK_HEADER, 'ACCESSION', $acc);
1✔
641

642
  #VERSION
643
  $self->write($FH, $GENBANK_HEADER, 'VERSION', $version);
1✔
644

645
  # KEYWORDS
646
  $self->write($FH, $GENBANK_HEADER, 'KEYWORDS', '.');
1✔
647

648
  # SOURCE
649
  my $common_name = $meta_container->get_common_name();
1✔
650
  $common_name = $meta_container->get_scientific_name() unless $common_name;
1✔
651
  $self->write($FH, $GENBANK_HEADER, 'SOURCE', $common_name);
1✔
652

653
  #organism
654
  my @cls = @{$meta_container->get_classification()};
1✔
655
  shift @cls;
1✔
656
  $self->write($FH, $GENBANK_SUBHEADER, 'ORGANISM', $meta_container->get_scientific_name());
1✔
657
  $self->write($FH, $GENBANK_SUBHEADER, '', join('; ', reverse @cls) . ".");
1✔
658

659
  #refereneces
660

661
  #comments
662
  my $annotation_source = $self->annotation_source($meta_container);
1✔
663
  foreach my $comment (@COMMENTS) {
1✔
664
    $comment =~ s/\#\#\#SOURCE\#\#\#/$annotation_source/;
4✔
665
    $self->write($FH, $GENBANK_HEADER, 'COMMENT', $comment);
4✔
666
  }
667

668
  ####################
669
  # DUMP FEATURE TABLE
670
  ####################
671
  $self->print( $FH, "FEATURES             Location/Qualifiers\n" );
1✔
672
  $self->_dump_feature_table($slice, $FH, $GENBANK_FT);
1✔
673

674
  ####################
675
  # DUMP SEQUENCE
676
  ####################
677

678
  # record position before writing sequence header, so that 
679
  # after printing the sequence and having the base counts
680
  # we can seek to this position and write the proper sequence 
681
  # header
682
  my $sq_offset = tell($FH);
1✔
683
  $sq_offset == -1 and throw "Unable to get offset for output fh";
1✔
684

685
  # print a sequence header template, to be replaced with a real
686
  # one containing the base counts
687
  $self->print($FH, "BASE COUNT  ########## a ########## c ########## g ########## t ########## n\nORIGIN\n");
1✔
688
  
689
  # dump the sequence and get the base counts
690
  my $acgt = $self->write_genbank_seq($slice, $FH);
1✔
691
  
692
  # print the end of genbank entry
693
  $self->print( $FH, "//\n" );
1✔
694
  my $end_of_entry_offset = tell($FH);
1✔
695
  $end_of_entry_offset == -1 and throw "Unable to get offset for output fh";
1✔
696

697
  # seek backwards to the position of the sequence header and 
698
  # write it with the actual base counts
699
  seek($FH, $sq_offset, SEEK_SET) 
1✔
700
    or throw "Cannot seek backward to sequence header position";
701
  $self->print($FH, sprintf "BASE COUNT  %10d a %10d c %10d g %10d t %10d n", 
702
               $acgt->{a}, $acgt->{c}, $acgt->{g}, $acgt->{t}, $acgt->{n});
1✔
703

704
  seek($FH, $end_of_entry_offset, SEEK_SET) 
1✔
705
    or throw "Cannot seek forward to end of entry";
706

707
  # Set formatting back to normal
708
  $: = " \n-";
1✔
709
}
710

711

712

713
=head2 _dump_feature_table
714

715
  Arg [1]    : Bio::EnsEMBL::Slice slice
716
  Example    : none
717
  Description: Helper method used to dump feature tables used in EMBL, FASTA,
718
               GENBANK.  Assumes formating of file handle has been setup
719
               already to use $FEAT and $VALUE values.
720
  Returntype : none
721
  Exceptions : none
722
  Caller     : internal
723

724
=cut
725

726
sub _dump_feature_table {
727
  my $self   = shift;
4✔
728
  my $slice  = shift;
4✔
729
  my $FH     = shift;
4✔
730
  my $FORMAT = shift;
4✔
731

732
  #use only the core database to dump features (except for bloody snps)
733
  my $lite = $slice->adaptor->db->remove_db_adaptor('lite');
4✔
734

735
  my $meta = $slice->adaptor->db->get_MetaContainer;
4✔
736

737
  #lump file handle and format string together for simpler method calls
738
  my @ff = ($FH, $FORMAT);
4✔
739
  my $value;
4✔
740

741
  #source
742
  my $classification = join(', ', @{$meta->get_classification()});
4✔
743
  $self->write(@ff,'source', "1.." . $slice->length());
4✔
744
  $self->write(@ff,''      , '/organism="'.$meta->get_scientific_name(). '"');
4✔
745
  $self->write(@ff,''      , '/db_xref="taxon:'.$meta->get_taxonomy_id().'"');
4✔
746

747
  #
748
  # Transcripts & Genes
749
  #
750
  my @gene_slices;
2✔
751
  if($self->is_enabled('gene')) {
2✔
752
    push @gene_slices, $slice;
2✔
753
  }
754

755
  # Retrieve slices of other database where we need to pull genes from
756

757
  my $gene_dbs = {'vegagene' => 'vega',
2✔
758
                  'estgene'  => 'estgene'};
759

760
  foreach my $gene_type (keys %$gene_dbs) {
1✔
761
    if($self->is_enabled($gene_type)) {
2✔
762
      my $db = $self->get_database($gene_dbs->{$gene_type});
×
763
      if($db) {
1✔
764
        my $sa = $db->get_SliceAdaptor();
2✔
765
        push @gene_slices, $sa->fetch_by_name($slice->name());
2✔
766
      } else {
767
        warning("A [". $gene_dbs->{$gene_type} ."] database must be " .
2✔
768
                "attached to this SeqDumper\n(via a call to " .
769
                "attach_database) to retrieve genes of type [$gene_type]");
770
      }
771
    }
772
  }
773

774
  foreach my $gene_slice (@gene_slices) {
4✔
775
    my @genes = @{$gene_slice->get_all_Genes(undef,undef, 1)};
4✔
776
    while(my $gene = shift @genes) {
4✔
777
      $value = $self->features2location( [$gene] );
7✔
778
      $self->write( @ff, 'gene', $value );
7✔
779
      $self->write( @ff, "", '/gene='.$gene->stable_id_version() );
7✔
780

781

782
      if(defined($gene->display_xref)){
7✔
783
        $self->write( @ff, "",'/locus_tag="'.$gene->display_xref->display_id.'"');
7✔
784
      }
785
      my $desc = $gene->description;
7✔
786
      if(defined($desc) and $desc ne ""){
7✔
787
        $desc =~ s/\"//; 
7✔
788
        $self->write( @ff, "", '/note="'.$gene->description.'"');
7✔
789
      }
790

791

792

793
      foreach my $transcript (@{$gene->get_all_Transcripts}) {
7✔
794
        my $translation = $transcript->translation;
8✔
795

796
        # normal transcripts get dumped differently than pseudogenes
797
        if($translation) {
8✔
798
          #normal transcript
799
          $value = $self->features2location($transcript->get_all_Exons);
8✔
800
          $self->write(@ff, 'mRNA', $value);
8✔
801
          $self->write(@ff,''   , '/gene="'.$gene->stable_id_version().'"');
8✔
802
          $self->write(@ff,''
8✔
803
                       ,'/standard_name="'.$transcript->stable_id_version().'"');
804

805
          # ...and a CDS section
806
          $value = 
8✔
807
            $self->features2location($transcript->get_all_translateable_Exons);
808
          $self->write(@ff,'CDS', $value);
8✔
809
          my $codon_start = $self->transcript_to_codon_start($transcript);
8✔
810
          $self->write(@ff,''   , qq{/codon_start="${codon_start}"}) if $codon_start > 1;
8✔
811

812
          my $codon_table_id = $self->_get_codon_table_id($slice);
8✔
813
          if($codon_table_id > 1){
17✔
814
            $self->write(@ff,''   , '/transl_table='.$codon_table_id);
12✔
815
          }
816
          $self->write(@ff,''   , '/gene="'.$gene->stable_id_version().'"'); 
8✔
817
          $self->write(@ff,''   , '/protein_id="'.$translation->stable_id_version().'"');
8✔
818
          $self->write(@ff,''   ,'/note="transcript_id='.$transcript->stable_id_version().'"');
8✔
819

820
          foreach my $dbl (@{$transcript->get_all_DBLinks}) {
8✔
821
            my $db_xref    = '/db_xref="'.$dbl->dbname().':'.$dbl->primary_id().'"';
55✔
822
            my $go_db_xref = '/db_xref="'.$dbl->primary_id().'"';
55✔
823
            $value  = ($dbl->dbname()=~/GO/) ? $go_db_xref : $db_xref; 
55✔
824
            $self->write(@ff, '', $value);
55✔
825
          }
826

827
          my $pep = $transcript->translate();
8✔
828
          if ($pep) {
8✔
829
            $value = '/translation="'.$pep->seq().'"';
8✔
830
            $self->write(@ff, '', $value);
8✔
831
          }
832
        } else {
833
          #pseudogene
834
          $value = $self->features2location($transcript->get_all_Exons);
×
835
          $self->write(@ff, 'misc_RNA', $value);
×
836
          $self->write(@ff,''   , '/gene="'.$gene->stable_id_version().'"');
×
837
          foreach my $dbl (@{$transcript->get_all_DBLinks}) {
×
838
            $value = '/db_xref="'.$dbl->dbname().':'.$dbl->primary_id().'"';
×
839
            $self->write(@ff, '', $value);
×
840
          }
841
          $self->write(@ff,''   , '/note="'.$transcript->biotype().'"');
×
842
          $self->write(@ff,''
×
843
                       ,'/standard_name="'.$transcript->stable_id_version().'"');
844
        }
845
      }
846
    }
847

848
    # exons
849
    foreach my $gene (@{$gene_slice->get_all_Genes(undef,undef,1)}) {
1✔
850
      foreach my $exon (@{$gene->get_all_Exons}) {
4✔
851
        $self->write(@ff,'exon', $self->features2location([$exon]));
30✔
852
        $self->write(@ff,''    , '/note="exon_id='.$exon->stable_id_version().'"');
30✔
853
      }
854
    }
855
  }
856

857
  #
858
  # genscans
859
  #
860
  if($self->is_enabled('genscan')) {
1✔
861
    my @genscan_exons;
×
862
    my @transcripts = @{$slice->get_all_PredictionTranscripts(undef,1)};
×
863
    while(my $transcript = shift @transcripts) {
×
864
      my $exons = $transcript->get_all_Exons();
×
865
      push @genscan_exons, @$exons;
×
866
      $self->write(@ff, 'mRNA', $self->features2location($exons));
×
867
      $self->write(@ff, '', '/product="'.$transcript->translate()->seq().'"');
×
868
      $self->write(@ff, '', '/note="identifier='.$transcript->stable_id_version.'"');
×
869
      $self->write(@ff, '', '/note="Derived by automated computational' .
×
870
                   ' analysis using gene prediction method:' .
871
                   $transcript->analysis->logic_name . '"');
872
    }
873
  }
874

875
  #
876
  # snps
877
  #
878
  if($self->is_enabled('variation') && $slice->can('get_all_VariationFeatures')) {
1✔
879
#    $slice->adaptor->db->add_db_adaptor('lite', $lite) if $lite;
880

881
    my @variations = @{$slice->get_all_VariationFeatures()};
×
882
    while(my $snp = shift @variations) {
×
883
      my $ss = $snp->start;
×
884
      my $se = $snp->end;
×
885
      #skip snps that hang off edge of slice
886
      next if($ss < 1 || $se > $slice->length); 
×
887

888
      $self->write(@ff, 'variation', "$ss..$se");
×
889
      $self->write(@ff, ''         , '/replace="'.$snp->allele_string.'"'); 
×
890
      #$self->write(@ff, ''         , '/evidence="'.$snp->status.'"'); 
891
      my $rs_id = $snp->variation_name();
×
892
      my $db = $snp->source_name();
×
893
#      foreach my $link ($snp->each_DBLink) {
894
#        my $id = $link->primary_id;
895
#        my $db = $link->database;
896
        $self->write(@ff, '', "/db_xref=\"$db:$rs_id\""); 
×
897
#      }
898
    }
899

900
#    $slice->adaptor->db->remove_db_adaptor('lite') if $lite;
901
  }
902

903
  #
904
  # similarity features
905
  #
906
  if($self->is_enabled('similarity')) {
1✔
907
    foreach my $sim (@{$slice->get_all_SimilarityFeatures}) {
×
908
      $self->write(@ff, 'misc_feature', $self->features2location([$sim]));
×
909
      $self->write(@ff, ''       , '/note="match: '.$sim->hseqname.
×
910
                  ' : '.$sim->hstart.'..'.$sim->hend.'('.$sim->hstrand.')"');
911
    }
912
  }
913

914
  #
915
  # repeats
916
  #
917
  if($self->is_enabled('repeat')) {
1✔
918
    my @rfs = @{$slice->get_all_RepeatFeatures()};
×
919

920
    while(my $repeat = shift @rfs) {
×
921
      $self->write(@ff, 'repeat_region', $self->features2location([$repeat]));
×
922
      $self->write(@ff, ''    , '/note="' . $repeat->repeat_consensus->name.
×
923
                   ' repeat: matches ' . $repeat->hstart.'..'.$repeat->hend .
924
                   '('.$repeat->hstrand.') of consensus"');
925
    }
926

927
  }
928

929
  #
930
  # markers
931
  #
932
  if($self->is_enabled('marker') && $slice->can('get_all_MarkerFeatures')) {
1✔
933
    my @markers = @{$slice->get_all_MarkerFeatures()};
×
934
    while(my $mf = shift @markers) {
×
935
      $self->write(@ff, 'STS', $self->features2location([$mf]));
×
936
      if($mf->marker->display_MarkerSynonym) {
×
937
        $self->write(@ff, ''   , '/standard_name="' .
×
938
                     $mf->marker->display_MarkerSynonym->name . '"');
939
      }
940

941

942
      #grep out synonyms without a source
943
      my @synonyms = @{$mf->marker->get_all_MarkerSynonyms};
×
944
      @synonyms = grep {$_->source } @synonyms;
×
945
      foreach my $synonym (@synonyms) {
×
946
        $self->write(@ff, '', '/db_xref="'.$synonym->source.
×
947
                     ':'.$synonym->name.'"');
948
      }
949
      $self->write(@ff, '', '/note="map_weight='.$mf->map_weight.'"');
×
950
    }
951
  }
952

953
  #
954
  # contigs
955
  #
956
  if($self->is_enabled('contig')) {
1✔
957
    foreach my $segment (@{$slice->project('seqlevel')}) {
×
958
      my ($start, $end, $slice) = @$segment;
×
959
      $self->write(@ff, 'misc_feature',
×
960
                   $start .'..'. $end);
961
      $self->write(@ff, '', '/note="contig '.$slice->seq_region_name .
×
962
                   ' ' . $slice->start . '..' . $slice->end .
963
                    '(' . $slice->strand . ')"');
964
    }
965
  }
966

967
  $slice->adaptor->db->add_db_adaptor('lite', $lite) if $lite;
1✔
968

969
}
970

971
# /codon_start= is the first base to start translating from. This maps 
972
# Ensembl start exon phase to this concept. Here we present the usage
973
# of phase in this concept where each row shows the sequence the 
974
# spliced_seq() method will return 
975

976
#               123456789
977
#               ATTATGACG
978
# Phase == 0    ...+++###   codon_start=0   // start from 1st A
979
# Phase == 1     ..+++###   codon_start=3   // start from 2nd A (base 3 in the given spliced sequence)
980
# Phase == 2      .+++###   codon_start=2   // start from 2nd A (base 2 in the spliced sequence)
981
#
982
# In the case of the final 2 phases we will generate a X codon
983
#
984

985
sub transcript_to_codon_start {
986
  my ($self, $transcript) = @_;
13✔
987
  my $start_phase = $transcript->start_Exon()->phase();
13✔
988
  return  ( $start_phase == 1 ) ? 3 : 
13✔
989
          ( $start_phase == 2 ) ? 2 :
990
          ( $start_phase == 0 ) ? 1 :
991
          -1;
992
}
993

994

995
=head2 _get_codon_table_id
996

997
  Arg [1]    : Bio::EnsEMBL::Slice slice
998
  Example    : none
999
  Description: Helper method to get codon_table seq region attribute
1000
               codon_table defines the genetic code table used if other than the universal genetic code table (1)
1001
               By default it is 1 and is not shown in flat files.
1002
               If it is not equal to 1, then it is shown as a transl_table qualifier on the CDS feature.
1003
  Returntype : int
1004
  Caller     : internal
1005

1006
=cut
1007

1008
sub _get_codon_table_id{
1009
  my ($self, $slice) = @_;
48✔
1010
  my $codon_table_id = 1;
48✔
1011
  my $codon_table_attributes = $slice->get_all_Attributes("codon_table");
48✔
1012
  if (@{$codon_table_attributes}) {
13✔
1013
    $codon_table_id = $codon_table_attributes->[0]->value;
8✔
1014
  }
1015
  return $codon_table_id;
13✔
1016
}
1017

1018
=head2 dump_fasta
1019

1020
  Arg [1]    : Bio::EnsEMBL::Slice
1021
  Arg [2]    : IO::File $FH
1022
  Example    : $seq_dumper->dump_fasta($slice, $FH);
1023
  Description: Dumps an FASTA flat file to an open file handle
1024
  Returntype : none
1025
  Exceptions : none
1026
  Caller     : dump
1027

1028
=cut
1029

1030
sub dump_fasta {
1031
  my $self = shift;
45✔
1032
  my $slice = shift;
45✔
1033
  my $FH   = shift;
45✔
1034

1035
  my $id       = $slice->seq_region_name;
45✔
1036
  my $seqtype  = 'dna';
45✔
1037
  my $idtype   = $slice->coord_system->name;
10✔
1038
  my $location = $slice->name;
45✔
1039
  my $start = 1;
10✔
1040
  my $end = $slice->length();
18✔
1041

1042
  my $header = ">$id $seqtype:$idtype $location\n";
2✔
1043
  $self->print( $FH, $header );
2✔
1044

1045
  #set the formatting to FASTA
1046
  my $FORMAT = '^<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
2✔
1047
';
1048

1049
  #chunk the sequence in 60kb chunks to use less memory
1050
  my $cur = $start;
2✔
1051
  while($cur <= $end) {
2✔
1052
    my $to = $cur + 59_999;
13✔
1053
    $to = $end if($to > $end); 
12✔
1054
    my $seq = $slice->subseq($cur, $to);
12✔
1055
    $cur = $to + 1;
12✔
1056
    $self->write($FH, $FORMAT, $seq);
32✔
1057
  }
1058
}
1059

1060

1061

1062
=head2 features2location
1063

1064
  Arg [1]    : listref of Bio::EnsEMBL::SeqFeatures
1065
  Example    : $location = $self->features2location(\@features);
1066
  Description: Constructs an EMBL location string from a list of features
1067
  Returntype : string
1068
  Exceptions : none
1069
  Caller     : internal
1070

1071
=cut
1072

1073
sub features2location {
1074
  my $self = shift;
71✔
1075
  my $features = shift;
71✔
1076

1077
  my @join = ();
71✔
1078

1079
  foreach my $f (@$features) {
68✔
1080
    my $slice = $f->slice;
127✔
1081
    my $start = $f->start();
127✔
1082
    my $end   = $f->end();
120✔
1083
    my $strand = $f->strand();
120✔
1084

1085
    if($start >= 1 && $end <= $slice->length) {
127✔
1086
      #this feature in on a slice and doesn't lie outside the boundary
1087
        
1088
      if($strand == 1) {
114✔
1089
        push @join, "$start..$end";
109✔
1090
      } else {
1091
        push @join, "complement($start..$end)";
48✔
1092
      }
1093
    } else {
1094
      my @fs = ();
48✔
1095
      #this feature is outside the boundary of the dump,
1096
      # yet implemented and 'seqlevel' is guaranteed to be 1step
1097
      my $projection = $f->project('seqlevel');
48✔
1098
      foreach my $segment (@$projection) {
48✔
1099
        my $slice = $segment->[2];
48✔
1100
        my $slc_start = $slice->start();
48✔
1101
        my $slc_end   = $slice->end();
48✔
1102
        my $seq_reg   = $slice->seq_region_name();
48✔
1103
        if($slice->strand == 1) {
48✔
1104
          push @join, "$seq_reg:$slc_start..$slc_end";
48✔
1105
        } else {
1106
          push @join, "complement($seq_reg:$slc_start..$slc_end)";
35✔
1107
        }
1108
      }
1109
    }
1110
  }
1111

1112
  my $out = join ',', @join;
45✔
1113

1114
  if(scalar @join > 1) {
79✔
1115
    $out = "join($out)";
43✔
1116
  }
1117

1118
  return $out;
79✔
1119
}
1120

1121
=head2 annotation_source
1122

1123
  Arg [1]    : Bio::EnsEMBL::DBSQL::MetaContainer
1124
  Example    : $seq_dumper->annotation_source($meta_container);
1125
  Description: Constructs a string with gene annotation sources
1126
  Returntype : string
1127
  Exceptions : none
1128
  Caller     : dump_embl, dump_genbank
1129

1130
=cut
1131

1132
sub annotation_source {
1133
  my ($self, $meta_container) = @_;
40,939✔
1134

1135
  my @providers = ();
40,939✔
1136
  my @provider_urls = ();
41,176✔
1137

1138
  foreach ( @{$meta_container->list_value_by_key('annotation.provider_name')} ) {
41,176✔
1139
    push @providers, $_ if $_ ne '';
41,638✔
1140
  }
1141
  foreach ( @{$meta_container->list_value_by_key('annotation.provider_url')} ) {
41,401✔
1142
    push @provider_urls, $_ if $_ ne '';
41,401✔
1143
  }
1144
  if ( ! scalar(@providers) ) {
41,337✔
1145
    foreach ( @{$meta_container->list_value_by_key('assembly.provider_name')} ) {
32,550✔
1146
      push @providers, $_ if $_ ne '';
64✔
1147
    }
1148
    foreach ( @{$meta_container->list_value_by_key('assembly.provider_url')} ) {
64✔
1149
      push @provider_urls, $_ if $_ ne '';
64✔
1150
    }
1151
  }
1152
  if ( ! scalar(@providers) ) {
72✔
1153
    push @providers, 'Ensembl';
64✔
1154
  }
1155

1156
  my $annotation_source = join(' and ', @providers);
72✔
1157
  if (@provider_urls) {
79✔
1158
    $annotation_source .= ' ' . sprintf(q{(%s)}, join(', ', @provider_urls));
77✔
1159
  }
1160

1161
  return $annotation_source;
37✔
1162
}
1163

1164
sub _date_string {
1165
  my $self = shift;
40,932✔
1166

1167
  my ($sec, $min, $hour, $mday,$mon, $year) = localtime(time());
41,106✔
1168

1169
  my $month = ('JAN', 'FEB', 'MAR', 'APR', 'MAY', 'JUN', 'JUL',
284✔
1170
               'AUG', 'SEP', 'OCT', 'NOV', 'DEC')[$mon];
1171
  $year += 1900;
40,913✔
1172
  
1173
  return "$mday-$month-$year";
13✔
1174
}
1175

1176
sub write {
1177
  my ($self, $FH, $FORMAT, @values) = @_;
82,538✔
1178
  
1179
  #while the last value still contains something
1180
  while(defined($values[-1]) and $values[-1] ne '') {
82,538✔
1181
    formline($FORMAT, @values);
87,226✔
1182
    $self->print( $FH, $^A );
87,238✔
1183
    $^A = '';
87,238✔
1184
  }
1185
}
1186

1187
sub write_genbank_seq {
1188
  my $self = shift;
1189
  my $FH  = shift;
1190
  my $seq = shift;
1191
  my $base_total = shift;
1192

1193
  $base_total ||= 0;
1194

1195
  my $GENBANK_SEQ = 
1196
'@>>>>>>>> ^<<<<<<<<< ^<<<<<<<<< ^<<<<<<<<< ^<<<<<<<<< ^<<<<<<<<< ^<<<<<<<<<~
1197
';
1198

1199
  my $total = -59 + $base_total;
1200
  #keep track of total and print lines of 60 bases with spaces every 10bp
1201
  while($$seq) {
1202
    $total += 60; 
1203
    formline($GENBANK_SEQ,$total, $$seq, $$seq, $$seq, $$seq, $$seq, $$seq);
1204
    $self->print( $FH, $^A );
1205
    $^A = '';
1206
  }
1207
}
1208

1209
sub write_genbank_seq {
1210
  my $self = shift;
201✔
1211
  my $slice = shift;
201✔
1212
  my $FH   = shift;
201✔
1213

1214
  my $width = 60;
201✔
1215

1216
  # set buffer size
1217
  my $chunk_size = $self->{'chunk_factor'} * $width;
193✔
1218
  $chunk_size > 0 or throw "Invalid chunk size: check chunk_factor parameter";
2✔
1219

1220
  my $start = 1;
2✔
1221
  my $end = $slice->length;
1✔
1222
  my $total = -59;
2✔
1223

1224
  # chunk the sequence to conserve memory, and print
1225
  my $here = $start;
2✔
1226
  my $GENBANK_SEQ = 
2✔
1227
'@>>>>>>>> ^<<<<<<<<< ^<<<<<<<<< ^<<<<<<<<< ^<<<<<<<<< ^<<<<<<<<< ^<<<<<<<<<~
1228
';
1229
  my $acgt;
2✔
1230

1231
  while($here <= $end) {
1✔
1232
    my $there = $here + $chunk_size - 1;
2✔
1233
    $there = $end if($there > $end);
2✔
1234
    my $sseq = $slice->subseq($here, $there);
2✔
1235

1236
    $acgt->{a} += $sseq =~ tr/Aa//;
10✔
1237
    $acgt->{c} += $sseq =~ tr/Cc//;
6✔
1238
    $acgt->{g} += $sseq =~ tr/Gg//;
6✔
1239
    $acgt->{t} += $sseq =~ tr/Tt//;
39,296✔
1240

1241
    while($sseq) {
39,296✔
1242
      $total += 60;
1,675✔
1243
      formline($GENBANK_SEQ, $total, $sseq, $sseq, $sseq, $sseq, $sseq, $sseq);
1,671✔
1244
      $self->print($FH, $^A);
1,671✔
1245
      $^A = '';
1,671✔
1246
    }
1247

1248
    $here = $there + 1;
1,252✔
1249
  }
1250

1251
  $acgt->{n} = $end - ($acgt->{a} + $acgt->{c} + $acgt->{g} + $acgt->{t});
1,251✔
1252
  $acgt->{tot} = $end;
9✔
1253

1254
  return $acgt;
5✔
1255
}
1256

1257

1258
sub write_embl_seq {
1259
  my $self = shift;
7✔
1260
  my $slice = shift;
7✔
1261
  my $FH   = shift;
41✔
1262

1263
  my $width = 60;
41✔
1264

1265
  # set buffer size
1266
  my $chunk_size = $self->{'chunk_factor'} * $width;
3✔
1267
  $chunk_size > 0 or throw "Invalid chunk size: check chunk_factor parameter";
41✔
1268

1269
  my $start = 1;
41✔
1270
  my $end = $slice->length;
41✔
1271
  my $total = $end;
423✔
1272

1273
  # chunk the sequence to conserve memory, and print
1274
  my $here = $start;
41✔
1275
  my $EMBL_SEQ = 
117✔
1276
    '     ^<<<<<<<<< ^<<<<<<<<< ^<<<<<<<<< ^<<<<<<<<< ^<<<<<<<<< ^<<<<<<<<<@>>>>>>>>>~
1277
';
1278
  my $acgt;
41✔
1279

1280
  while($here <= $end) {
11✔
1281
    my $there = $here + $chunk_size - 1;
11✔
1282
    $there = $end if($there > $end);
11✔
1283
    my $sseq = $slice->subseq($here, $there);
13✔
1284

1285
    $acgt->{a} += $sseq =~ tr/Aa//;
13✔
1286
    $acgt->{c} += $sseq =~ tr/Cc//;
13✔
1287
    $acgt->{g} += $sseq =~ tr/Gg//;
15✔
1288
    $acgt->{t} += $sseq =~ tr/Tt//;
7✔
1289

1290
    while($sseq) {
7✔
1291
      $total -= 60;
6,051✔
1292
      $total = 0 if $total < 0;
6,051✔
1293
      formline($EMBL_SEQ, 
6,051✔
1294
               $sseq, $sseq, $sseq, $sseq, $sseq, $sseq, 
1295
               $end - $total);
1296
      $self->print($FH, $^A);
6,051✔
1297
      $^A = '';
7✔
1298
    }
1299

1300
    $here = $there + 1;
3✔
1301
  }
1302

1303
  $acgt->{n} = $end - ($acgt->{a} + $acgt->{c} + $acgt->{g} + $acgt->{t});
3✔
1304
  $acgt->{tot} = $end;
3✔
1305

1306
  return $acgt;
×
1307
}
1308

1309
sub print {
1310
  my( $self, $FH, $string ) = @_;
1,859✔
1311
  if(!print $FH $string){
1,859✔
1312
    print STDERR "Problem writing to disk\n";
5✔
1313
    print STDERR "the string is $string\n";
5✔
1314
    die "Could not write to file handle";
5✔
1315
  }
1316
}
1317

1318

1319
1;
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2025 Coveralls, Inc