• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

openvax / gtfparse / 13576134035

27 Feb 2025 09:23PM UTC coverage: 86.391% (-13.6%) from 100.0%
13576134035

push

github

timodonnell
Fix linter error

1 of 1 new or added line in 1 file covered. (100.0%)

23 existing lines in 3 files now uncovered.

146 of 169 relevant lines covered (86.39%)

2.59 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

89.19
/gtfparse/create_missing_features.py
1
# Licensed under the Apache License, Version 2.0 (the "License");
2
# you may not use this file except in compliance with the License.
3
# You may obtain a copy of the License at
4
#
5
#     http://www.apache.org/licenses/LICENSE-2.0
6
#
7
# Unless required by applicable law or agreed to in writing, software
8
# distributed under the License is distributed on an "AS IS" BASIS,
9
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
10
# See the License for the specific language governing permissions and
11
# limitations under the License.
12

13
import logging
3✔
14
from collections import OrderedDict
3✔
15

16
import pandas as pd
3✔
17

18
# NOTE(review): calling basicConfig at import time configures the *root* logger,
# which is unusual for library code (it can clobber the host application's
# logging setup) — confirm downstream consumers rely on this before removing.
logging.basicConfig(level=logging.INFO)
# Module-level logger; note the function bodies below call logging.info
# (the root logger) instead of this one — presumably unintentional.
logger = logging.getLogger(__name__)
20

21

22
def create_missing_features(
        dataframe,
        unique_keys=None,
        extra_columns=None,
        missing_value=None):
    """
    Construct rows for features missing from GTF data.

    Some GTF files only have 'exon' and 'CDS' entries, but have
    transcript_id and gene_id annotations which allow us to construct
    missing features such as 'transcript' or 'gene'.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Should contain at least the core GTF columns, such as "seqname",
        "start", and "end". Objects exposing a `to_pandas()` method
        (e.g. Polars DataFrames) are converted first.

    unique_keys : dict, optional
        Mapping from feature names to the name of the column which should
        act as a unique key for that feature. Example: {"gene": "gene_id"}

    extra_columns : dict, optional
        By default the constructed feature row will include only the 8
        core columns and its unique key. Any other columns that should
        be included should be associated with the feature name in this
        dict.

    missing_value : any
        Which value to fill in for columns that we don't infer values for.

    Returns
    -------
    pandas.DataFrame
        Original dataframe (converted to pandas if necessary) along with
        all extra rows created for missing features.

    Raises
    ------
    ValueError
        If a column requested via `extra_columns` does not exist in the
        input dataframe.
    """
    # Avoid the mutable-default-argument pitfall: fresh dicts per call.
    if unique_keys is None:
        unique_keys = {}
    if extra_columns is None:
        extra_columns = {}

    # Use the module's named logger rather than the root logger.
    log = logging.getLogger(__name__)

    # Accept non-pandas dataframes (e.g. Polars) by duck-typing.
    if hasattr(dataframe, "to_pandas"):
        dataframe = dataframe.to_pandas()

    extra_dataframes = []

    existing_features = set(dataframe["feature"])
    existing_columns = set(dataframe.columns)

    for (feature_name, groupby_key) in unique_keys.items():
        if feature_name in existing_features:
            log.info("Feature '%s' already exists in GTF data", feature_name)
            continue
        log.info("Creating rows for missing feature '%s'", feature_name)

        # User specifies which non-required columns we should try to infer
        # values for. Validate once, up front — this check is loop-invariant,
        # so it does not belong inside the per-group loop.
        feature_columns = list(extra_columns.get(feature_name, []))
        for column_name in feature_columns:
            if column_name not in existing_columns:
                raise ValueError(
                    "Column '%s' does not exist in GTF, columns = %s" % (
                        column_name, existing_columns))

        # Don't include rows where the groupby key was missing. Use a plain
        # positional list mask instead of an unindexed pd.Series — a Series
        # mask aligns by index and would misbehave on dataframes with a
        # non-default index.
        keep_mask = [
            not (x is None or x == "")
            for x in dataframe[groupby_key]]
        row_groups = dataframe[keep_mask].groupby(groupby_key)

        # Each group corresponds to a unique feature entry for which the
        # other columns may or may not be uniquely defined. Start off by
        # assuming the values for every column are missing and fill them in
        # where possible.
        feature_values = OrderedDict([
            (column_name, [missing_value] * row_groups.ngroups)
            for column_name in dataframe.keys()
        ])

        for i, (feature_id, group) in enumerate(row_groups):
            # Fill in the required columns by assuming that this feature
            # is the union of all intervals of other features that were
            # tagged with its unique ID (e.g. union of exons which had a
            # particular gene_id).
            feature_values["feature"][i] = feature_name
            feature_values[groupby_key][i] = feature_id
            # Set the source to 'gtfparse' to indicate that we made this
            # entry up from other data.
            feature_values["source"][i] = "gtfparse"
            feature_values["start"][i] = group["start"].min()
            feature_values["end"][i] = group["end"].max()

            # Assume that seqname and strand are the same for all other
            # entries in the GTF which shared this unique ID.
            feature_values["seqname"][i] = group["seqname"].iat[0]
            feature_values["strand"][i] = group["strand"].iat[0]

            # There's probably no rigorous way to set the values of the
            # 'score' or 'frame' columns, so leave them as missing_value.
            for column_name in feature_columns:
                # Expect that all entries related to a reconstructed feature
                # agree on this column; fill it in only when unambiguous.
                unique_values = group[column_name].dropna().unique()
                if len(unique_values) == 1:
                    feature_values[column_name][i] = unique_values[0]
        extra_dataframes.append(pd.DataFrame(feature_values))
    return pd.concat([dataframe] + extra_dataframes, ignore_index=True)
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2025 Coveralls, Inc