• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

openvax / pyensembl / 25711309577

12 May 2026 03:21AM UTC coverage: 83.547% (-0.1%) from 83.649%
25711309577

Pull #340

github

iskandr
SequenceData ensures cache directory exists before pickling

Now that Genome hands the resolved download-cache dir (rather than the
FASTA's parent dir) to SequenceData, the target directory may not exist
yet when no remote files have been downloaded. Mirror Database.create's
ensure_dir call before dump_pickle.

Fixes test_missing_genome_sources.{test_transcript_fasta_only,test_protein_fasta_only}.
Pull Request #340: Fix #219: Database cache dir defaults to download cache, not GTF directory

4 of 4 new or added lines in 2 files covered. (100.0%)

2 existing lines in 2 files now uncovered.

1371 of 1641 relevant lines covered (83.55%)

3.34 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

81.01
/pyensembl/sequence_data.py
1
# Licensed under the Apache License, Version 2.0 (the "License");
2
# you may not use this file except in compliance with the License.
3
# You may obtain a copy of the License at
4
#
5
#     http://www.apache.org/licenses/LICENSE-2.0
6
#
7
# Unless required by applicable law or agreed to in writing, software
8
# distributed under the License is distributed on an "AS IS" BASIS,
9
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
10
# See the License for the specific language governing permissions and
11
# limitations under the License.
12

13
from os import remove
4✔
14
from os.path import dirname, exists, abspath, split, join
4✔
15

16
import datacache
4✔
17
import logging
4✔
18
from collections import Counter
4✔
19
import pickle
4✔
20
from .common import load_pickle, dump_pickle
4✔
21
from .fasta import parse_fasta_dictionary
4✔
22

23

24
# Module-level logger named after this module; SequenceData uses it for
# progress (info) and warning messages while loading sequences.
logger = logging.getLogger(__name__)
4✔
25

26

27
class SequenceData(object):
    """
    Container for reference nucleotide and amino acid sequences.

    Sequences are parsed from one or more FASTA files into a single
    dictionary mapping sequence identifier -> sequence. Each parsed FASTA
    is also written out with `dump_pickle` (one file per FASTA, named
    "<fasta filename>.pickle") so later loads can skip parsing.
    """

    def __init__(self, fasta_paths, cache_directory_path=None):
        """
        Parameters
        ----------
        fasta_paths : str or list of str
            Path(s) to FASTA files; a single string is treated as a
            one-element list.

        cache_directory_path : str, optional
            Directory in which the pickled dictionaries are stored. When
            omitted (or empty), each pickle is placed in the directory of
            its FASTA file.

        Raises
        ------
        ValueError
            If any of the FASTA files does not exist.
        """
        # isinstance (rather than an exact type check) also accepts
        # subclasses of str as a single path
        if isinstance(fasta_paths, str):
            fasta_paths = [fasta_paths]

        self.fasta_paths = [abspath(path) for path in fasta_paths]
        self.fasta_directory_paths = [split(path)[0] for path in self.fasta_paths]
        self.fasta_filenames = [split(path)[1] for path in self.fasta_paths]
        if cache_directory_path:
            self.cache_directory_paths = [cache_directory_path] * len(self.fasta_paths)
        else:
            self.cache_directory_paths = self.fasta_directory_paths
        for path in self.fasta_paths:
            if not exists(path):
                raise ValueError("Couldn't find FASTA file %s" % (path,))
        self.fasta_dictionary_filenames = [
            filename + ".pickle" for filename in self.fasta_filenames
        ]
        self.fasta_dictionary_pickle_paths = [
            join(cache_path, filename)
            for cache_path, filename in zip(
                self.cache_directory_paths, self.fasta_dictionary_filenames
            )
        ]
        self._init_lazy_fields()

    def _init_lazy_fields(self):
        """Reset the in-memory state so it is rebuilt on next access."""
        # None means "not loaded yet"; an empty dict is a valid loaded state
        self._fasta_dictionary = None
        self._fasta_keys = None

    def clear_cache(self):
        """Drop the in-memory dictionary and delete the pickle files."""
        self._init_lazy_fields()
        for path in self.fasta_dictionary_pickle_paths:
            # EAFP: the file may be absent, or removed by someone else
            # between a check and the removal
            try:
                remove(path)
            except FileNotFoundError:
                pass

    def __str__(self):
        return "SequenceData(fasta_paths=%s)" % (self.fasta_paths,)

    def __repr__(self):
        return str(self)

    def __contains__(self, sequence_id):
        """Return True if `sequence_id` is a known sequence identifier."""
        if self._fasta_keys is None:
            self._fasta_keys = set(self.fasta_dictionary.keys())
        return sequence_id in self._fasta_keys

    def __eq__(self, other):
        # test to see if self.fasta_paths and other.fasta_paths contain
        # the same list of paths, regardless of order
        return (other.__class__ is SequenceData) and Counter(
            self.fasta_paths
        ) == Counter(other.fasta_paths)

    def __hash__(self):
        # fasta_paths is a list, which is unhashable, so hash a tuple
        # instead. Sorting makes the hash independent of path order so
        # that objects which compare equal via __eq__ hash identically.
        return hash(tuple(sorted(self.fasta_paths)))

    def _add_to_fasta_dictionary(self, fasta_dictionary_tmp):
        """
        Merge `fasta_dictionary_tmp` into the combined dictionary.

        An identifier which is already present keeps its first sequence;
        the duplicate is skipped with a warning.
        """
        for identifier, sequence in fasta_dictionary_tmp.items():
            if identifier in self._fasta_dictionary:
                logger.warning(
                    "Sequence identifier %s is duplicated in your FASTA files!",
                    identifier,
                )
                continue
            self._fasta_dictionary[identifier] = sequence

    def _load_cached_fasta_dictionary(self, pickle_path):
        """
        Try to merge the pickled dictionary at `pickle_path` into the
        combined dictionary. Returns True on success, False if the pickle
        is missing or could not be loaded.
        """
        if not exists(pickle_path):
            return False
        # NOTE(review): unpickling can execute arbitrary code, so the
        # cache directory must only contain files written by a trusted
        # source.
        try:
            self._add_to_fasta_dictionary(load_pickle(pickle_path))
        except (
            pickle.UnpicklingError,
            AttributeError,
            EOFError,
            ImportError,
            IndexError,
        ):
            # A corrupt or truncated pickle, or one referring to classes
            # or modules that no longer exist, can raise any of these;
            # the caller falls back on parsing the FASTA file.
            logger.warning(
                "Failed to load %s, attempting to read FASTA directly",
                pickle_path,
            )
            return False
        logger.info("Loaded sequence dictionary from %s", pickle_path)
        return True

    def _parse_and_cache_fasta(self, fasta_path, pickle_path):
        """
        Parse `fasta_path`, merge its sequences into the combined
        dictionary and save the parsed dictionary to `pickle_path`.
        """
        logger.info("Parsing sequences from FASTA file at %s", fasta_path)

        fasta_dictionary_tmp = parse_fasta_dictionary(fasta_path)
        self._add_to_fasta_dictionary(fasta_dictionary_tmp)
        logger.info("Saving sequence dictionary to %s", pickle_path)
        # the cache directory may not exist yet, so create it before
        # writing the pickle
        datacache.ensure_dir(dirname(pickle_path))
        dump_pickle(fasta_dictionary_tmp, pickle_path)

    def _load_or_create_fasta_dictionary_pickle(self):
        """
        Rebuild the combined dictionary from all FASTA files, using each
        file's pickle when it can be loaded and otherwise parsing the
        FASTA and writing a fresh pickle.
        """
        self._fasta_dictionary = dict()
        # the key set cached by __contains__ belongs to the old dictionary
        self._fasta_keys = None
        finished = False
        try:
            for fasta_path, pickle_path in zip(
                self.fasta_paths, self.fasta_dictionary_pickle_paths
            ):
                if not self._load_cached_fasta_dictionary(pickle_path):
                    self._parse_and_cache_fasta(fasta_path, pickle_path)
            finished = True
        finally:
            if not finished:
                # don't leave a partially filled dictionary behind: the
                # next access should retry instead of silently serving
                # incomplete data
                self._init_lazy_fields()

    def index(self, overwrite=False):
        """
        Load (or create) the sequence dictionary. If `overwrite` is true
        the cached pickle files are deleted first, forcing the FASTA
        files to be parsed again.
        """
        if overwrite:
            self.clear_cache()
        self._load_or_create_fasta_dictionary_pickle()

    @property
    def fasta_dictionary(self):
        """Dictionary of sequence identifier -> sequence, loaded lazily."""
        # compare against None rather than truthiness so that an empty
        # (but already loaded) dictionary is not reloaded on every access
        if self._fasta_dictionary is None:
            self._load_or_create_fasta_dictionary_pickle()
        return self._fasta_dictionary

    def get(self, sequence_id):
        """Get sequence associated with given ID or return None if missing"""
        return self.fasta_dictionary.get(sequence_id)
4✔
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2026 Coveralls, Inc