• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

neurobagel / api / 10085126638

24 Jul 2024 10:50PM UTC coverage: 95.972% (+0.05%) from 95.927%
10085126638

Pull #326

github

web-flow
Merge a4697b1ca into 0ac0d4297
Pull Request #326: [FIX] Exclude sessions missing a queried property from matches

7 of 7 new or added lines in 3 files covered. (100.0%)

1 existing line in 1 file now uncovered.

691 of 720 relevant lines covered (95.97%)

1.89 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

86.21
/app/api/utility.py
1
"""Constants for graph server connection and utility functions for writing the SPARQL query."""
1✔
2

3
import json
2✔
4
import os
2✔
5
import textwrap
2✔
6
import warnings
2✔
7
from collections import namedtuple
2✔
8
from pathlib import Path
2✔
9
from typing import Optional
2✔
10

11
import httpx
2✔
12

13
# Request constants
# Pairs an environment variable's name with its resolved value, so callers can
# report which variable a bad value came from.
EnvVar = namedtuple("EnvVar", ["name", "val"])

# Space-separated list of origins allowed for CORS; empty string when unset.
ALLOWED_ORIGINS = EnvVar(
    "NB_API_ALLOWED_ORIGINS", os.environ.get("NB_API_ALLOWED_ORIGINS", "")
)

# Graph store credentials; no defaults, so .val is None when the env var is unset.
GRAPH_USERNAME = EnvVar(
    "NB_GRAPH_USERNAME", os.environ.get("NB_GRAPH_USERNAME")
)
GRAPH_PASSWORD = EnvVar(
    "NB_GRAPH_PASSWORD", os.environ.get("NB_GRAPH_PASSWORD")
)
GRAPH_ADDRESS = EnvVar(
    "NB_GRAPH_ADDRESS", os.environ.get("NB_GRAPH_ADDRESS", "206.12.99.17")
)
GRAPH_DB = EnvVar(
    "NB_GRAPH_DB", os.environ.get("NB_GRAPH_DB", "test_data/query")
)
# NOTE: the default is the int 5820 while an env-provided value is a str;
# both interpolate identically into QUERY_URL below.
GRAPH_PORT = EnvVar("NB_GRAPH_PORT", os.environ.get("NB_GRAPH_PORT", 5820))
# TODO: Environment variables can't be parsed as bool so this is a workaround but isn't ideal.
# Another option is to switch this to a command-line argument, but that would require changing the
# Dockerfile also since Uvicorn can't accept custom command-line args.
RETURN_AGG = EnvVar(
    "NB_RETURN_AGG", os.environ.get("NB_RETURN_AGG", "True").lower() == "true"
)

# SPARQL endpoint URL and request headers for querying the graph store.
QUERY_URL = f"http://{GRAPH_ADDRESS.val}:{GRAPH_PORT.val}/{GRAPH_DB.val}"
QUERY_HEADER = {
    "Content-Type": "application/sparql-query",
    "Accept": "application/sparql-results+json",
}

# Namespace prefix -> URI mapping; used to build SPARQL PREFIX declarations and
# to strip or substitute namespaces in term URIs elsewhere in this module.
CONTEXT = {
    "cogatlas": "https://www.cognitiveatlas.org/task/id/",
    "nb": "http://neurobagel.org/vocab/",
    "nbg": "http://neurobagel.org/graph/",  # TODO: Check if we still need this namespace.
    "ncit": "http://ncicb.nci.nih.gov/xml/owl/EVS/Thesaurus.owl#",
    "nidm": "http://purl.org/nidash/nidm#",
    "snomed": "http://purl.bioontology.org/ontology/SNOMEDCT/",
}

# Store domains in named tuples
# A queryable attribute: the SPARQL variable name it binds to and the predicate
# that links it to a session/subject.
Domain = namedtuple("Domain", ["var", "pred"])
# Core domains
AGE = Domain("age", "nb:hasAge")
SEX = Domain("sex", "nb:hasSex")
DIAGNOSIS = Domain("diagnosis", "nb:hasDiagnosis")
IS_CONTROL = Domain("subject_group", "nb:isSubjectGroup")
ASSESSMENT = Domain("assessment", "nb:hasAssessment")
IMAGE_MODAL = Domain("image_modal", "nb:hasContrastType")
PROJECT = Domain("project", "nb:hasSamples")


# Domains whose values come from controlled vocabularies (as opposed to numeric ranges).
CATEGORICAL_DOMAINS = [SEX, DIAGNOSIS, IMAGE_MODAL, ASSESSMENT]

# Controlled term denoting a healthy-control subject group.
IS_CONTROL_TERM = "ncit:C94342"

# Location of bundled backup copies of external vocabularies, resolved relative
# to this file (two directories up, then vocab/backup_external).
BACKUP_VOCAB_DIR = (
    Path(__file__).absolute().parents[2] / "vocab/backup_external"
)
74

75

76
def parse_origins_as_list(allowed_origins: str) -> list:
    """
    Return user-defined allowed origins as a list.

    Parameters
    ----------
    allowed_origins : str
        Space-separated string of allowed origins.

    Returns
    -------
    list
        The individual origins. Note that an empty input string yields [""]
        (str.split with an explicit separator never returns an empty list).
    """
    # str.split already returns a list; the previous list() wrapper was redundant.
    return allowed_origins.split(" ")
79

80

81
def create_context() -> str:
    """Return the SPARQL PREFIX declarations built from the CONTEXT namespace mapping, one per line."""
    prefix_declarations = []
    for prefix, namespace_uri in CONTEXT.items():
        prefix_declarations.append(f"PREFIX {prefix}: <{namespace_uri}>")
    return "\n".join(prefix_declarations)
86

87

88
def unpack_http_response_json_to_dicts(response: dict) -> list[dict]:
    """
    Reformats a nested dictionary object from a SPARQL query response JSON into a more human-readable list of dictionaries,
    where the keys are the variables selected in the SPARQL query and the values correspond to the variable values.
    The number of dictionaries should correspond to the number of query matches.
    """
    rows = []
    for binding in response["results"]["bindings"]:
        # Each binding maps a selected variable to a metadata dict; keep only the raw value.
        rows.append({variable: entry["value"] for variable, entry in binding.items()})
    return rows
98

99

100
def create_bound_filter(var: str) -> str:
    """
    Create a SPARQL filter substring for checking if a variable is bound
    (meaning the variable actually has a corresponding value, e.g., the property exists).

    NOTE: the returned string is deliberately left with an unbalanced opening
    parenthesis; callers append additional conditions and the closing ")".
    """
    return "FILTER (BOUND(?" + var + ")"
106

107

108
def create_query(
    return_agg: bool,
    age: Optional[tuple] = (None, None),
    sex: Optional[str] = None,
    diagnosis: Optional[str] = None,
    is_control: Optional[bool] = None,
    min_num_imaging_sessions: Optional[int] = None,
    min_num_phenotypic_sessions: Optional[int] = None,
    assessment: Optional[str] = None,
    image_modal: Optional[str] = None,
) -> str:
    """
    Creates a SPARQL query using a query template and filters it using the input parameters.

    Parameters
    ----------
    return_agg : bool
        Whether to return only aggregate query results (and not subject-level attributes besides file paths).
    age : tuple, optional
        Minimum and maximum age of subject, by default (None, None).
    sex : str, optional
        Subject sex, by default None.
    diagnosis : str, optional
        Subject diagnosis, by default None.
    is_control : bool, optional
        Whether or not subject is a control, by default None.
    min_num_imaging_sessions : int, optional
        Subject minimum number of imaging sessions, by default None.
    min_num_phenotypic_sessions : int, optional
        Subject minimum number of phenotypic sessions, by default None.
    assessment : str, optional
        Non-imaging assessment completed by subjects, by default None.
    image_modal : str, optional
        Imaging modality of subject scans, by default None.

    Returns
    -------
    str
        The SPARQL query.
    """
    # The annotation advertises Optional[tuple]; normalize an explicit None to the
    # "no bounds" sentinel so the age[0]/age[1] accesses below cannot raise TypeError.
    if age is None:
        age = (None, None)

    # Filters applied at the subject level (on per-subject session counts).
    subject_level_filters = ""
    if min_num_phenotypic_sessions is not None:
        subject_level_filters += (
            "\n"
            + f"FILTER (?num_matching_phenotypic_sessions >= {min_num_phenotypic_sessions})."
        )
    if min_num_imaging_sessions is not None:
        subject_level_filters += (
            "\n"
            + f"FILTER (?num_matching_imaging_sessions >= {min_num_imaging_sessions})."
        )

    # Filters applied within the phenotypic-session subquery. Each uses a BOUND
    # check so sessions missing the queried property are excluded from matches.
    phenotypic_session_level_filters = ""

    if age[0] is not None:
        phenotypic_session_level_filters += (
            "\n"
            + f"{create_bound_filter(AGE.var)} && ?{AGE.var} >= {age[0]})."
        )
    if age[1] is not None:
        phenotypic_session_level_filters += (
            "\n"
            + f"{create_bound_filter(AGE.var)} && ?{AGE.var} <= {age[1]})."
        )

    if sex is not None:
        phenotypic_session_level_filters += (
            "\n" + f"{create_bound_filter(SEX.var)} && ?{SEX.var} = {sex})."
        )

    if diagnosis is not None:
        phenotypic_session_level_filters += (
            "\n"
            + f"{create_bound_filter(DIAGNOSIS.var)} && ?{DIAGNOSIS.var} = {diagnosis})."
        )

    if is_control is not None:
        if is_control:
            phenotypic_session_level_filters += (
                "\n"
                + f"{create_bound_filter(IS_CONTROL.var)} && ?{IS_CONTROL.var} = {IS_CONTROL_TERM})."
            )
        else:
            # TODO: Revisit - this logic seems odd, since in our current data model the session should not have this edge if it's not a control.
            phenotypic_session_level_filters += (
                "\n"
                + f"{create_bound_filter(IS_CONTROL.var)} && ?{IS_CONTROL.var} != {IS_CONTROL_TERM})."
            )

    if assessment is not None:
        phenotypic_session_level_filters += (
            "\n"
            + f"{create_bound_filter(ASSESSMENT.var)} && ?{ASSESSMENT.var} = {assessment})."
        )

    # Filters applied within the imaging-session subquery.
    imaging_session_level_filters = ""
    if image_modal is not None:
        imaging_session_level_filters += (
            "\n" + f"FILTER (?{IMAGE_MODAL.var} = {image_modal})."
        )

    query_string = textwrap.dedent(
        f"""
        SELECT DISTINCT ?dataset_uuid ?dataset_name ?dataset_portal_uri ?sub_id ?age ?sex
        ?diagnosis ?subject_group ?num_matching_phenotypic_sessions ?num_matching_imaging_sessions ?session_id ?session_type ?assessment ?image_modal ?session_file_path
        WHERE {{
            ?dataset_uuid a nb:Dataset;
                nb:hasLabel ?dataset_name;
                nb:hasSamples ?subject.
            ?subject a nb:Subject;
                nb:hasLabel ?sub_id;
                nb:hasSession ?session.
            ?session a ?session_type;
                nb:hasLabel ?session_id.
            OPTIONAL {{
                ?session nb:hasAcquisition/nb:hasContrastType ?image_modal.
                OPTIONAL {{?session nb:hasFilePath ?session_file_path.}}
            }}
            OPTIONAL {{?dataset_uuid nb:hasPortalURI ?dataset_portal_uri.}}
            OPTIONAL {{?session nb:hasAge ?age.}}
            OPTIONAL {{?session nb:hasSex ?sex.}}
            OPTIONAL {{?session nb:hasDiagnosis ?diagnosis.}}
            OPTIONAL {{?session nb:isSubjectGroup ?subject_group.}}
            OPTIONAL {{?session nb:hasAssessment ?assessment.}}
            {{
                SELECT ?subject (count(distinct ?phenotypic_session) as ?num_matching_phenotypic_sessions)
                WHERE {{
                    ?subject nb:hasSession ?phenotypic_session.
                    ?phenotypic_session a nb:PhenotypicSession.

                    OPTIONAL {{?phenotypic_session nb:hasAge ?age.}}
                    OPTIONAL {{?phenotypic_session nb:hasSex ?sex.}}
                    OPTIONAL {{?phenotypic_session nb:hasDiagnosis ?diagnosis.}}
                    OPTIONAL {{?phenotypic_session nb:isSubjectGroup ?subject_group.}}
                    OPTIONAL {{?phenotypic_session nb:hasAssessment ?assessment.}}

                    {phenotypic_session_level_filters}
                }} GROUP BY ?subject
            }}
            {{
                SELECT ?subject (count(distinct ?imaging_session) as ?num_matching_imaging_sessions)
                WHERE {{
                    OPTIONAL {{
                        ?subject nb:hasSession ?imaging_session.
                        ?imaging_session a nb:ImagingSession;
                            nb:hasAcquisition/nb:hasContrastType ?image_modal.
                    }}
                    {imaging_session_level_filters}
                }} GROUP BY ?subject
            }}
            {subject_level_filters}
        }}
    """
    )

    # The query defined above will return all subject-level attributes from the graph. If RETURN_AGG variable has been set to true,
    # wrap query in an aggregating statement so data returned from graph include only attributes needed for dataset-level aggregate metadata.
    if return_agg:
        query_string = (
            textwrap.dedent(
                """
            SELECT ?dataset_uuid ?dataset_name ?dataset_portal_uri ?sub_id ?image_modal
            WHERE {"""
            )
            + textwrap.indent(query_string, "    ")
            + "} GROUP BY ?dataset_uuid ?dataset_name ?dataset_portal_uri ?sub_id ?image_modal"
        )

    # NOTE: a leftover debug `print(query_string)` was removed here; it dumped the
    # full query to stdout on every request.
    return "\n".join([create_context(), query_string])
278

279

280
def create_multidataset_size_query(dataset_uuids: list) -> str:
    """Build a SPARQL query that counts the distinct subjects in each of the given datasets."""
    # One VALUES entry per dataset, each wrapped in angle brackets as a full IRI.
    formatted_uuids = [f"<{uuid}>" for uuid in dataset_uuids]
    dataset_uuids_string = "\n".join(formatted_uuids)
    query_string = f"""
        SELECT ?dataset_uuid (COUNT(DISTINCT ?subject) as ?total_subjects)
        WHERE {{
            VALUES ?dataset_uuid {{
                {dataset_uuids_string}
            }}
            ?dataset_uuid nb:hasSamples ?subject.
            ?subject a nb:Subject.
        }} GROUP BY ?dataset_uuid
    """

    return create_context() + "\n" + query_string
295

296

297
def create_terms_query(data_element_URI: str) -> str:
    """
    Build a SPARQL query that retrieves the URLs of every term attached to a given data element.

    Parameters
    ----------
    data_element_URI : str
        The URI of the data element for which to retrieve the URIs of all connected term.

    Returns
    -------
    str
        The SPARQL query.

    Examples
    --------
    get_terms_query("nb:Assessment")
    """

    query_string = f"""
    SELECT DISTINCT ?termURL
    WHERE {{
        ?termURL a {data_element_URI} .
        {data_element_URI} rdfs:subClassOf nb:ControlledTerm .
    }}
    """

    return create_context() + "\n" + query_string
325

326

327
def is_term_namespace_in_context(term_url: str) -> bool:
    """
    Performs basic check for if a term URL contains a namespace URI from the context.

    Parameters
    ----------
    term_url : str
        A controlled term URI.

    Returns
    -------
    bool
        True if the term URL contains a namespace URI from the context, False otherwise.
    """
    return any(namespace_uri in term_url for namespace_uri in CONTEXT.values())
345

346

347
def strip_namespace_from_term_uri(term: str, has_prefix: bool = False) -> str:
    """
    Removes namespace URI or prefix from a term URI if the namespace is recognized.

    Parameters
    ----------
    term : str
        A controlled term URI.
    has_prefix : bool, optional
        Whether the term URI includes a namespace prefix (as opposed to the full namespace URI), by default False.

    Returns
    -------
    str
        The unique term ID.
    """
    if has_prefix:
        # Split on the last colon: everything before it should be a known prefix.
        parts = term.rsplit(":", 1)
        if parts[0] in CONTEXT:
            return parts[1]
    else:
        matching_uri = next(
            (uri for uri in CONTEXT.values() if uri in term), None
        )
        if matching_uri is not None:
            return term.replace(matching_uri, "")

    # If no match found within the context, return original term
    return term
374

375

376
def replace_namespace_uri_with_prefix(url: str) -> str:
    """
    Replaces namespace URIs in term URLs with corresponding prefixes from the context.

    Parameters
    ----------
    url : str
        A controlled term URL.

    Returns
    -------
    str
        The term with namespace URIs replaced with prefixes if found in the context, or the original URL.
    """
    match = next(
        ((prefix, uri) for prefix, uri in CONTEXT.items() if uri in url), None
    )
    if match is None:
        # If no match found within the context, return original URL
        return url
    prefix, namespace_uri = match
    return url.replace(namespace_uri, f"{prefix}:")
396

397

398
def load_json(path: Path) -> dict:
    """
    Loads a user-specified JSON file.

    Parameters
    ----------
    path : Path
        Path to JSON file.
    """
    with open(path, "r") as json_file:
        contents = json_file.read()
    return json.loads(contents)
409

410

411
def fetch_and_save_cogatlas(output_path: Path):
    """
    Fetches the Cognitive Atlas vocabulary using its native Task API and writes term ID-label mappings to a temporary lookup file.
    If the API request fails, a backup copy of the vocabulary is used instead.

    Saves a JSON with keys corresponding to Cognitive Atlas task IDs and values corresponding to human-readable task names.

    Parameters
    ----------
    output_path : Path
        File path to store output vocabulary lookup file.
    """
    api_url = "https://www.cognitiveatlas.org/api/v-alpha/task?format=json"

    try:
        response = httpx.get(url=api_url)
        if response.is_success:
            vocab = response.json()
        else:
            warnings.warn(
                f"""
                The API was unable to fetch the Cognitive Atlas task vocabulary (https://www.cognitiveatlas.org/tasks/a/) from the source and will default to using a local backup copy of the vocabulary instead.

                Details of the response from the source:
                Status code {response.status_code}
                {response.reason_phrase}: {response.text}
                """
            )
            # Use backup copy of the raw vocabulary JSON
            vocab = load_json(BACKUP_VOCAB_DIR / "cogatlas_task.json")
    except httpx.NetworkError as exc:
        # BUG FIX: this warning previously opened with f"""" (four quote chars),
        # which prepended a stray '"' to the user-visible message.
        warnings.warn(
            f"""
            Fetching of the Cognitive Atlas task vocabulary (https://www.cognitiveatlas.org/tasks/a/) from the source failed due to a network error.
            The API will default to using a local backup copy of the vocabulary instead.

            Error: {exc}
            """
        )
        # Use backup copy of the raw vocabulary JSON
        vocab = load_json(BACKUP_VOCAB_DIR / "cogatlas_task.json")

    # Reduce the raw vocabulary to an ID -> human-readable name lookup and persist it.
    term_labels = {term["id"]: term["name"] for term in vocab}
    with open(output_path, "w") as f:
        f.write(json.dumps(term_labels, indent=2))
456

457

458
def create_snomed_term_lookup(output_path: Path):
    """
    Reads in a file of disorder terms from the SNOMED CT vocabulary and writes term ID-label mappings to a temporary lookup file.

    Saves a JSON with keys corresponding to SNOMED CT IDs and values corresponding to human-readable term names.

    Parameters
    ----------
    output_path : Path
        File path to store output vocabulary lookup file.
    """
    vocab = load_json(BACKUP_VOCAB_DIR / "snomedct_disorder.json")

    # Map each SNOMED CT ID to its preferred (human-readable) name.
    term_labels = {
        entry["sctid"]: entry["preferred_name"] for entry in vocab
    }
    with open(output_path, "w") as lookup_file:
        json.dump(term_labels, lookup_file, indent=2)
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2025 Coveralls, Inc