22648019224

Committed 03 Mar 2026 11:44PM UTC coverage: 78.369% (-10.6%) from 88.948%

Build # 22648019224

Build Type

push

github

Committed by

web-flow

Commit Message

Merge pull request #15 from CBIIT/develop

merge develop into main

Coverage Stats

127 of 289 new or added lines in 8 files covered. (43.94%)

1 existing line in 1 file now uncovered.

913 of 1165 relevant lines covered (78.37%)

0.78 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

64.86

/core/processor/post_processor.py

import copy
import logging
import re
from datetime import datetime, timezone
from typing import Callable, Any

from html2text import HTML2Text

from utils.post_processor_utils import deep_merge_additive

logger = logging.getLogger(__name__)


def post_processor(fn: Callable[..., Any]):
    """Tag a callable as a post-processor.

    The callable is not wrapped; it is only marked so that other code can
    recognise it by checking for the '_is_post_processor' attribute.

    Args:
        fn (Callable[..., Any]): Callable to tag.

    Returns:
        Callable[..., Any]: The same callable object, now carrying an
        '_is_post_processor' attribute set to True.
    """
    setattr(fn, "_is_post_processor", True)
    return fn


def transform_html(html: str) -> str:
    """Render an HTML string as a single line of plain text.

    The HTML is passed through an HTML2Text converter configured with
    ignore_links, ignore_emphasis and single_line_break enabled and
    body_width set to 0. Every run of whitespace in the converted text is
    then collapsed to one space and the ends are trimmed.

    Args:
        html (str): HTML string to transform.

    Returns:
        str: Plain-text version of the HTML string.
    """
    converter = HTML2Text()
    # Apply the converter options in one place.
    for option, value in (
        ("ignore_links", True),
        ("body_width", 0),
        ("ignore_emphasis", True),
        ("single_line_break", True),
    ):
        setattr(converter, option, value)

    collapsed = re.sub(r"\s+", " ", converter.handle(html))
    return collapsed.strip()


@post_processor
def clean_idc_metadata(metadata_list: list[dict]) -> list[dict]:
    """Convert the HTML 'description' of each IDC metadata record to plain text.

    Records are updated in place. A record without a 'description' key is
    left unchanged and a warning is logged for it.

    Args:
        metadata_list (list[dict]): List of IDC metadata dicts.

    Returns:
        list[dict]: The same list that was passed in, with 'description'
        values replaced by their plain-text form.
    """
    for record in metadata_list:
        # Guard clause: nothing to transform for this record.
        if "description" not in record:
            logger.warning("'description' key not found in metadata.")
            continue

        record["description"] = transform_html(record["description"])
        logger.info("Transformed HTML in 'description' field of metadata")

    return metadata_list


@post_processor
def aggregate_tcia_series_data(
    data: list, entity: dict, collection_id: str, entity_id_key: str
) -> dict:
    """Summarise TCIA series records for one collection.

    Sums 'ImageCount' and collects the distinct 'PatientID', 'Modality' and
    'BodyPartExamined' values across all records. When the entity's ID has
    an entry in the hard-coded override table, a copy of that entry is
    combined with the summary through deep_merge_additive (merge semantics
    are defined in utils.post_processor_utils).

    Args:
        data (list[dict]): Array of TCIA metadata dicts. Each one must
            provide 'ImageCount', 'PatientID', 'Modality' and
            'BodyPartExamined'.
        entity (dict): Entity record being processed.
        collection_id (str): ID of TCIA data collection.
        entity_id_key (str): Key used to identify entity in project metadata.

    Returns:
        dict: Summary with keys 'Collection', 'Aggregate_PatientID'
        (distinct patient count), 'Aggregate_Modality',
        'Aggregate_BodyPartExamined' (lists of distinct values) and
        'Aggregate_ImageCount'.

    Raises:
        KeyError: If a record lacks one of the required keys.
    """
    # Extra per-entity values, keyed by entity ID.
    ENTITY_OVERRIDES = {
        "GLIOMA01": {
            "Aggregate_ImageCount": 84,
            "Aggregate_Modality": ["Histopathology"],
        }
    }

    image_count = 0
    patient_ids, modalities, body_parts = set(), set(), set()

    # Single pass over the records, reading the fields in a fixed order.
    for series in data:
        image_count += series["ImageCount"]
        patient_ids.add(series["PatientID"])
        modalities.add(series["Modality"])
        body_parts.add(series["BodyPartExamined"])

    result = {
        "Collection": collection_id,
        "Aggregate_PatientID": len(patient_ids),
        "Aggregate_Modality": [*modalities],
        "Aggregate_BodyPartExamined": [*body_parts],
        "Aggregate_ImageCount": image_count,
    }

    entity_id = entity.get(entity_id_key)
    extra = ENTITY_OVERRIDES.get(entity_id)
    if extra is not None:
        # Merge a copy so the override table itself is never handed out.
        result = deep_merge_additive(result, copy.deepcopy(extra))
        logger.info(f"Additional TCIA data for {entity_id} entity added to totals.")

    logger.info(
        f"Completed aggregation of TCIA series data for collection '{collection_id}': "
        f"{result['Aggregate_PatientID']} patients, {result['Aggregate_ImageCount']} images, "
        f"modalities: {sorted(result['Aggregate_Modality'])}, body parts: {sorted(result['Aggregate_BodyPartExamined'])}"
    )

    return result


@post_processor
def format_for_icdc(data: list[dict]) -> list[dict]:
    """Formats fetched and processed data for ICDC ingestion.

    Each input document yields one record with the keys 'timestamp',
    'clinical_study_designation' (the document's 'entity_id'), 'CRDCLinks',
    'numberOfImageCollections' (number of links) and 'numberOfCRDCNodes'
    (number of distinct 'repository' values among the links).

    Args:
        data (list[dict]): List of fetched and processed data dicts.

    Returns:
        list[dict]: Formatted data ready for ICDC ingestion.
    """
    # Stamp the whole batch once, the same way format_for_ccdi does, so all
    # records produced by one call share an identical UTC timestamp.
    timestamp = (
        datetime.now(timezone.utc)
        .isoformat(timespec="milliseconds")
        .replace("+00:00", "Z")
    )

    formatted_results = []
    for document in data:
        # 'or []' also covers a key that is present but null; a plain
        # .get(key, []) would return None and break the iteration below.
        links = document.get("CRDCLinks") or []

        repositories = {link.get("repository") for link in links}
        # A link with no repository must not count as a CRDC node.
        repositories.discard(None)

        formatted_results.append(
            {
                "timestamp": timestamp,
                "clinical_study_designation": document.get("entity_id"),
                "CRDCLinks": links,
                "numberOfImageCollections": len(links),
                "numberOfCRDCNodes": len(repositories),
            }
        )

    return formatted_results


@post_processor
def format_for_ccdi(data: list[dict]) -> list[dict]:
    """Wrap fetched documents in the envelope used for CCDI ingestion.

    A single UTC timestamp (millisecond precision, 'Z' suffix) is taken
    once per call and shared by every record in the result.

    Args:
        data (list[dict]): List of fetched data dicts.

    Returns:
        list[dict]: One dict per input document with keys 'timestamp',
        'repository' (the document's 'repository', or "unknown" when the
        key is absent) and 'data' (the original document).
    """
    stamp = (
        datetime.now(timezone.utc)
        .isoformat(timespec="milliseconds")
        .replace("+00:00", "Z")
    )

    return [
        {
            "timestamp": stamp,
            "repository": doc.get("repository", "unknown"),
            "data": doc,
        }
        for doc in data
    ]

1	import copy	1✔
2	import logging	1✔
3	import re	1✔
4	from datetime import datetime, timezone	1✔
5	from typing import Callable, Any	1✔
6
7	from html2text import HTML2Text	1✔
8
9	from utils.post_processor_utils import deep_merge_additive	1✔
10
11	logger = logging.getLogger(__name__)	1✔
12
13
14	def post_processor(fn: Callable[..., Any]):	1✔
15	"""Labels a function as a post-processor by setting attribute.
16
17	Args:
18	fn (Callable[..., Any]): Function to be labeled post-processor.
19
20	Returns:
21	Callable[..., Any]: Original function with an '_is_post_processor'
22	attribute added.
23	"""
24	fn._is_post_processor = True	1✔
25	return fn	1✔
26
27
28	def transform_html(html: str) -> str:	1✔
29	"""Transforms HTML to plain text.
30
31	Args:
32	html (str): HTML string to transform.
33
34	Returns:
35	str: Plain-text version of HTML string.
36	"""
37	converter = HTML2Text()	1✔
38	converter.ignore_links = True	1✔
39	converter.body_width = 0	1✔
40	converter.ignore_emphasis = True	1✔
41	converter.single_line_break = True	1✔
42
43	text = converter.handle(html)	1✔
44	text = re.sub(r"\s+", " ", text).strip()	1✔
45
46	return text	1✔
47
48
49	@post_processor	1✔
50	def clean_idc_metadata(metadata_list: list[dict]) -> list[dict]:	1✔
51	"""Transforms 'description' fields in IDC metadata from HTML to plain text.
52
53	Args:
54	metadata_list (list[dict]): List of IDC metadata dicts.
55
56	Returns:
57	list[dict]: Updated metadata with transformed 'description' values.
58	"""
59	for metadata in metadata_list:	1✔
60	if "description" in metadata:	1✔
61	metadata["description"] = transform_html(metadata["description"])	1✔
62	logger.info("Transformed HTML in 'description' field of metadata")	1✔
63	else:
64	logger.warning("'description' key not found in metadata.")	×
65	return metadata_list	1✔
66
67
68	@post_processor	1✔
69	def aggregate_tcia_series_data(	1✔
70	data: list, entity: dict, collection_id: str, entity_id_key: str
71	) -> dict:
72	"""Aggregates TCIA metadata fields for a given entity.
73
74	Args:
75	data (list[dict]): Array of TCIA metadata dicts.
76	entity (dict): Entity record being processed.
77	collection_id (str): ID of TCIA data collection.
78	entity_id_key (str): Key used to identify entity in project metadata.
79
80	Returns:
81	dict: A dict of aggregated metadata fields for the collection.
82	"""
83	ENTITY_OVERRIDES = {	1✔
84	"GLIOMA01": {
85	"Aggregate_ImageCount": 84,
86	"Aggregate_Modality": ["Histopathology"],
87	}
88	}
89
90	total_images = 0	1✔
91	total_patients = set()	1✔
92	unique_modalities = set()	1✔
93	unique_bodyparts = set()	1✔
94
95	for item in data:	1✔
96	total_images += item["ImageCount"]	1✔
97	total_patients.add(item["PatientID"])	1✔
98	unique_modalities.add(item["Modality"])	1✔
99	unique_bodyparts.add(item["BodyPartExamined"])	1✔
100
101	result = {	1✔
102	"Collection": collection_id,
103	"Aggregate_PatientID": len(total_patients),
104	"Aggregate_Modality": list(unique_modalities),
105	"Aggregate_BodyPartExamined": list(unique_bodyparts),
106	"Aggregate_ImageCount": total_images,
107	}
108
109	entity_id = entity.get(entity_id_key)	1✔
110	if entity_id in ENTITY_OVERRIDES:	1✔
111	override = copy.deepcopy(ENTITY_OVERRIDES[entity_id])	×
112	result = deep_merge_additive(result, override)	×
113	logger.info(f"Additional TCIA data for {entity_id} entity added to totals.")	×
114
115	logger.info(	1✔
116	f"Completed aggregation of TCIA series data for collection '{collection_id}': "
117	f"{result['Aggregate_PatientID']} patients, {result['Aggregate_ImageCount']} images, "
118	f"modalities: {sorted(result['Aggregate_Modality'])}, body parts: {sorted(result['Aggregate_BodyPartExamined'])}"
119	)
120
121	return result	1✔
122
123
124	@post_processor	1✔
125	def format_for_icdc(data: list[dict]) -> list[dict]:	1✔
126	"""Formats fetched and processed data for ICDC ingestion.
127
128	Args:
129	data (list[dict]): List of fetched and processed data dicts.
130
131	Returns:
132	list[dict]: Formatted data ready for ICDC ingestion.
133	"""
NEW 134	formatted_results = []	×
135
NEW 136	for document in data:	×
NEW 137	external_dataset = {}	×
NEW 138	image_collections = 0	×
NEW 139	external_repos = []	×
140
NEW 141	now_utc = datetime.now(timezone.utc)	×
NEW 142	external_dataset["timestamp"] = now_utc.isoformat(	×
143	timespec="milliseconds"
144	).replace("+00:00", "Z")
145
NEW 146	external_dataset["clinical_study_designation"] = document.get("entity_id")	×
NEW 147	external_dataset["CRDCLinks"] = document.get("CRDCLinks", [])	×
148
NEW 149	for link in external_dataset["CRDCLinks"]:	×
NEW 150	image_collections += 1	×
NEW 151	external_repos.append(link.get("repository"))	×
152
NEW 153	external_dataset["numberOfImageCollections"] = image_collections	×
NEW 154	external_dataset["numberOfCRDCNodes"] = len(set(external_repos))	×
NEW 155	formatted_results.append(external_dataset)	×
156
NEW 157	return formatted_results	×
158
159
160	@post_processor	1✔
161	def format_for_ccdi(data: list[dict]) -> list[dict]:	1✔
162	"""Formats fetched data for CCDI ingestion.
163
164	Args:
165	data (list[dict]): List of fetched data dicts.
166
167	Returns:
168	list[dict]: Formatted data ready for CCDI ingestion.
169	"""
NEW 170	formatted_results = []	×
171
NEW 172	now_utc = datetime.now(timezone.utc)	×
NEW 173	timestamp = now_utc.isoformat(timespec="milliseconds").replace("+00:00", "Z")	×
174
NEW 175	for document in data:	×
NEW 176	formatted_results.append(	×
177	{
178	"timestamp": timestamp,
179	"repository": document.get("repository", "unknown"),
180	"data": document,
181	}
182	)
183
NEW 184	return formatted_results	×

CBIIT / crdc-icdc-data-retriever / 22648019224

Source File Press 'n' to go to next uncovered line, 'b' for previous

Source File
Press 'n' to go to next uncovered line, 'b' for previous