• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

CBIIT / crdc-icdc-data-retriever / 22648019224

03 Mar 2026 11:44PM UTC coverage: 78.369% (-10.6%) from 88.948%
22648019224

push

github

web-flow
Merge pull request #15 from CBIIT/develop

merge develop into main

127 of 289 new or added lines in 8 files covered. (43.94%)

1 existing line in 1 file now uncovered.

913 of 1165 relevant lines covered (78.37%)

0.78 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

64.86
/core/processor/post_processor.py
1
import copy
1✔
2
import logging
1✔
3
import re
1✔
4
from datetime import datetime, timezone
1✔
5
from typing import Callable, Any
1✔
6

7
from html2text import HTML2Text
1✔
8

9
from utils.post_processor_utils import deep_merge_additive
1✔
10

11
# Module-level logger, named after this module via __name__.
logger = logging.getLogger(__name__)
1✔
12

13

14
def post_processor(fn: Callable[..., Any]):
    """Marks a callable as a post-processor.

    Args:
        fn (Callable[..., Any]): Callable to tag.

    Returns:
        Callable[..., Any]: The very same callable object, now carrying an
        '_is_post_processor' attribute set to True.
    """
    # Tag in place; the callable itself is handed back unwrapped.
    setattr(fn, "_is_post_processor", True)
    return fn
1✔
26

27

28
def transform_html(html: str) -> str:
    """Renders an HTML string as whitespace-normalized plain text.

    Args:
        html (str): HTML markup to convert.

    Returns:
        str: Output of the HTML2Text converter with every run of whitespace
        collapsed to a single space and surrounding whitespace stripped.
    """
    html_to_text = HTML2Text()

    # Converter options, applied in the same order as they are listed.
    for option, value in (
        ("ignore_links", True),
        ("body_width", 0),
        ("ignore_emphasis", True),
        ("single_line_break", True),
    ):
        setattr(html_to_text, option, value)

    # Convert first, then flatten all whitespace (including newlines).
    return re.sub(r"\s+", " ", html_to_text.handle(html)).strip()
1✔
47

48

49
@post_processor
def clean_idc_metadata(metadata_list: list[dict]) -> list[dict]:
    """Converts the 'description' field of each IDC metadata dict to plain text.

    Each dict is updated in place; dicts without a 'description' key are
    left untouched and a warning is logged for them.

    Args:
        metadata_list (list[dict]): IDC metadata dicts to clean.

    Returns:
        list[dict]: The same list that was passed in, with 'description'
        values run through transform_html.
    """
    for record in metadata_list:
        if "description" not in record:
            logger.warning("'description' key not found in metadata.")
            continue

        record["description"] = transform_html(record["description"])
        logger.info("Transformed HTML in 'description' field of metadata")

    return metadata_list
1✔
66

67

68
@post_processor
def aggregate_tcia_series_data(
    data: list, entity: dict, collection_id: str, entity_id_key: str
) -> dict:
    """Rolls TCIA series records up into one summary dict for a collection.

    Args:
        data (list[dict]): TCIA series dicts; each must provide 'ImageCount',
            'PatientID', 'Modality' and 'BodyPartExamined'.
        entity (dict): Entity record being processed.
        collection_id (str): ID of the TCIA data collection.
        entity_id_key (str): Key under which the entity's ID is stored in
            'entity'.

    Returns:
        dict: Summary with 'Collection', 'Aggregate_PatientID' (number of
        distinct patient IDs), 'Aggregate_Modality', 'Aggregate_BodyPartExamined'
        (distinct values as lists) and 'Aggregate_ImageCount' (sum of image
        counts). For entities with a hard-coded override, the summary is the
        result of deep_merge_additive(summary, override).
    """
    # Hard-coded per-entity values that are merged into the computed totals.
    overrides_by_entity = {
        "GLIOMA01": {
            "Aggregate_ImageCount": 84,
            "Aggregate_Modality": ["Histopathology"],
        }
    }

    image_count = 0
    patient_ids, modalities, body_parts = set(), set(), set()

    for series in data:
        image_count += series["ImageCount"]
        patient_ids.add(series["PatientID"])
        modalities.add(series["Modality"])
        body_parts.add(series["BodyPartExamined"])

    aggregated = {
        "Collection": collection_id,
        "Aggregate_PatientID": len(patient_ids),
        "Aggregate_Modality": list(modalities),
        "Aggregate_BodyPartExamined": list(body_parts),
        "Aggregate_ImageCount": image_count,
    }

    entity_id = entity.get(entity_id_key)
    entity_override = overrides_by_entity.get(entity_id)
    if entity_override is not None:
        # Merge a copy so the override table itself is never handed out.
        aggregated = deep_merge_additive(aggregated, copy.deepcopy(entity_override))
        logger.info(f"Additional TCIA data for {entity_id} entity added to totals.")

    logger.info(
        f"Completed aggregation of TCIA series data for collection '{collection_id}': "
        f"{aggregated['Aggregate_PatientID']} patients, {aggregated['Aggregate_ImageCount']} images, "
        f"modalities: {sorted(aggregated['Aggregate_Modality'])}, body parts: {sorted(aggregated['Aggregate_BodyPartExamined'])}"
    )

    return aggregated
1✔
122

123

124
@post_processor
def format_for_icdc(data: list[dict]) -> list[dict]:
    """Formats fetched and processed data for ICDC ingestion.

    Every output record carries the same UTC timestamp (ISO 8601,
    millisecond precision, 'Z' suffix), taken once per call.

    Args:
        data (list[dict]): List of fetched and processed data dicts. Each may
            provide 'entity_id' and 'CRDCLinks' (a list of link dicts).

    Returns:
        list[dict]: One dict per input document with the keys 'timestamp',
        'clinical_study_designation', 'CRDCLinks', 'numberOfImageCollections'
        (number of links) and 'numberOfCRDCNodes' (number of distinct
        'repository' values among the links).
    """
    # Computed once for the whole batch, the same way format_for_ccdi does,
    # so all records produced by one call share a timestamp.
    timestamp = (
        datetime.now(timezone.utc)
        .isoformat(timespec="milliseconds")
        .replace("+00:00", "Z")
    )

    formatted_results = []
    for document in data:
        # 'CRDCLinks' may be missing or explicitly null; both mean "no links".
        links = document.get("CRDCLinks") or []

        # Distinct repositories; a link without a 'repository' value is not
        # counted as a node.
        repositories = {link.get("repository") for link in links}
        repositories.discard(None)

        formatted_results.append(
            {
                "timestamp": timestamp,
                "clinical_study_designation": document.get("entity_id"),
                "CRDCLinks": links,
                "numberOfImageCollections": len(links),
                "numberOfCRDCNodes": len(repositories),
            }
        )

    return formatted_results
×
158

159

160
@post_processor
def format_for_ccdi(data: list[dict]) -> list[dict]:
    """Wraps each fetched document in a CCDI ingestion envelope.

    Args:
        data (list[dict]): Fetched data dicts to wrap.

    Returns:
        list[dict]: One dict per input document with a shared UTC
        'timestamp' (millisecond precision, 'Z' suffix), the document's
        'repository' value (or "unknown" when absent) and the original
        document under 'data'.
    """
    # Taken once so every record from this call carries the same timestamp.
    timestamp = (
        datetime.now(timezone.utc)
        .isoformat(timespec="milliseconds")
        .replace("+00:00", "Z")
    )

    return [
        {
            "timestamp": timestamp,
            "repository": document.get("repository", "unknown"),
            "data": document,
        }
        for document in data
    ]
×
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2026 Coveralls, Inc