• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

openvax / gtfparse / 13576134035

27 Feb 2025 09:23PM UTC coverage: 86.391% (-13.6%) from 100.0%
13576134035

push

github

timodonnell
Fix linter error

1 of 1 new or added line in 1 file covered. (100.0%)

23 existing lines in 3 files now uncovered.

146 of 169 relevant lines covered (86.39%)

2.59 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

89.19
/gtfparse/create_missing_features.py
1
# Licensed under the Apache License, Version 2.0 (the "License");
2
# you may not use this file except in compliance with the License.
3
# You may obtain a copy of the License at
4
#
5
#     http://www.apache.org/licenses/LICENSE-2.0
6
#
7
# Unless required by applicable law or agreed to in writing, software
8
# distributed under the License is distributed on an "AS IS" BASIS,
9
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
10
# See the License for the specific language governing permissions and
11
# limitations under the License.
12

13
import logging
3✔
14
from collections import OrderedDict
3✔
15

16
import pandas as pd
3✔
17

18
# NOTE(review): calling basicConfig at import time configures the *root* logger,
# which is unusual for library code (it can clobber the host application's
# logging setup) — confirm downstream consumers rely on this before removing.
logging.basicConfig(level=logging.INFO)
# Module-level logger; note the function bodies below call logging.info
# (the root logger) instead of this one — presumably unintentional.
logger = logging.getLogger(__name__)
20

21

22
def create_missing_features(
        dataframe,
        unique_keys=None,
        extra_columns=None,
        missing_value=None):
    """
    Construct rows for features missing from GTF data.

    Some GTF files only have 'exon' and 'CDS' entries, but have
    transcript_id and gene_id annotations which allow us to construct
    missing features such as 'transcript' or 'gene'.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Should contain at least the core GTF columns, such as "seqname",
        "start", and "end". Objects exposing a `to_pandas()` method
        (e.g. Polars DataFrames) are converted first.

    unique_keys : dict, optional
        Mapping from feature names to the name of the column which should
        act as a unique key for that feature. Example: {"gene": "gene_id"}

    extra_columns : dict, optional
        By default the constructed feature row will include only the 8
        core columns and its unique key. Any other columns that should
        be included should be associated with the feature name in this
        dict.

    missing_value : any
        Which value to fill in for columns that we don't infer values for.

    Returns
    -------
    pandas.DataFrame
        Original dataframe (converted to pandas if necessary) along with
        all extra rows created for missing features.

    Raises
    ------
    ValueError
        If a column requested via `extra_columns` does not exist in the
        input dataframe.
    """
    # Avoid the mutable-default-argument pitfall: fresh dicts per call.
    if unique_keys is None:
        unique_keys = {}
    if extra_columns is None:
        extra_columns = {}

    # Use the module's named logger rather than the root logger.
    log = logging.getLogger(__name__)

    # Accept non-pandas dataframes (e.g. Polars) by duck-typing.
    if hasattr(dataframe, "to_pandas"):
        dataframe = dataframe.to_pandas()

    extra_dataframes = []

    existing_features = set(dataframe["feature"])
    existing_columns = set(dataframe.columns)

    for (feature_name, groupby_key) in unique_keys.items():
        if feature_name in existing_features:
            log.info("Feature '%s' already exists in GTF data", feature_name)
            continue
        log.info("Creating rows for missing feature '%s'", feature_name)

        # User specifies which non-required columns we should try to infer
        # values for. Validate once, up front — this check is loop-invariant,
        # so it does not belong inside the per-group loop.
        feature_columns = list(extra_columns.get(feature_name, []))
        for column_name in feature_columns:
            if column_name not in existing_columns:
                raise ValueError(
                    "Column '%s' does not exist in GTF, columns = %s" % (
                        column_name, existing_columns))

        # Don't include rows where the groupby key was missing. Use a plain
        # positional list mask instead of an unindexed pd.Series — a Series
        # mask aligns by index and would misbehave on dataframes with a
        # non-default index.
        keep_mask = [
            not (x is None or x == "")
            for x in dataframe[groupby_key]]
        row_groups = dataframe[keep_mask].groupby(groupby_key)

        # Each group corresponds to a unique feature entry for which the
        # other columns may or may not be uniquely defined. Start off by
        # assuming the values for every column are missing and fill them in
        # where possible.
        feature_values = OrderedDict([
            (column_name, [missing_value] * row_groups.ngroups)
            for column_name in dataframe.keys()
        ])

        for i, (feature_id, group) in enumerate(row_groups):
            # Fill in the required columns by assuming that this feature
            # is the union of all intervals of other features that were
            # tagged with its unique ID (e.g. union of exons which had a
            # particular gene_id).
            feature_values["feature"][i] = feature_name
            feature_values[groupby_key][i] = feature_id
            # Set the source to 'gtfparse' to indicate that we made this
            # entry up from other data.
            feature_values["source"][i] = "gtfparse"
            feature_values["start"][i] = group["start"].min()
            feature_values["end"][i] = group["end"].max()

            # Assume that seqname and strand are the same for all other
            # entries in the GTF which shared this unique ID.
            feature_values["seqname"][i] = group["seqname"].iat[0]
            feature_values["strand"][i] = group["strand"].iat[0]

            # There's probably no rigorous way to set the values of the
            # 'score' or 'frame' columns, so leave them as missing_value.
            for column_name in feature_columns:
                # Expect that all entries related to a reconstructed feature
                # agree on this column; fill it in only when unambiguous.
                unique_values = group[column_name].dropna().unique()
                if len(unique_values) == 1:
                    feature_values[column_name][i] = unique_values[0]
        extra_dataframes.append(pd.DataFrame(feature_values))
    return pd.concat([dataframe] + extra_dataframes, ignore_index=True)
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2025 Coveralls, Inc