• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

Clinical-Genomics / demultiplexing / 4627485829

pending completion
4627485829

push

github-actions

karlnyr
remova conda stuff, expand aliases

501 of 941 relevant lines covered (53.24%)

0.53 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

98.06
/demux/utils/indexreport.py
1
import logging
import re
import xml.etree.cElementTree as Et
from pathlib import Path
from typing import Optional, Tuple

import bs4

from demux.constants.indexreport import FLOWCELL_VERSION_LANE_COUNT
from demux.exc import IndexReportError
from demux.utils.html import (
    get_html_content,
    parse_html_header,
    parse_html_project_cluster_counts,
)
1✔
18

19
LOG = logging.getLogger(__name__)
1✔
20

21

22
class IndexReport:
    """Indexcheck report class, able to hold and process information out of bcl2fastq html reports.

    Constructing an instance parses the HTML report and stores:

    * ``flowcell_id`` / ``flowcell_version`` - read from the run parameters file.
    * ``html_content`` - the parsed HTML document.
    * ``report_tables`` - every ``<table id="ReportTable">`` in the document.
    * ``sample_table_header`` - mapping of column header -> column position
      for the cluster count table.
    * ``low_cluster_counts`` - rows of the cluster count table whose count is
      below the requested threshold.
    * ``top_unknown_barcodes`` - the top unknown barcodes table.
    """

    def __init__(
        self,
        cluster_counts: int,
        index_report_path: Path,
        out_dir: Path,
        report_tables_index: dict,
        run_parameters_path: Path,
    ):
        """Parse the index report and collect the parts needed for the summary.

        Args:
            cluster_counts: threshold; samples with a cluster count strictly
                below this value are reported as low.
            index_report_path: path to the bcl2fastq HTML report.
            out_dir: directory the summary is written to by ``write_summary``.
            report_tables_index: positions of the tables of interest, looked up
                with the keys ``"cluster_count_table"`` and
                ``"top_unknown_barcode_table"``.
            run_parameters_path: path to the RunParameters.xml file of the run.
        """
        self.flowcell_id = find_flowcell_id(run_parameters_path=run_parameters_path)
        self.flowcell_version = find_flowcell_version(
            run_parameters_path=run_parameters_path
        )
        self.index_report_path = index_report_path
        self.out_dir = out_dir
        self.run_parameters = run_parameters_path

        # The two literals are concatenated, so the first one must end with a
        # space to keep "unknown" and "barcodes" apart in the log output.
        LOG.info(
            f"Parsing file index report for {self.flowcell_version} FC {self.flowcell_id}, extracting top unknown "
            f"barcodes and samples with cluster counts lower than {cluster_counts}."
        )
        self.html_content = get_html_content(index_report_path=index_report_path)
        self.report_tables = self.get_report_tables(html_content=self.html_content)
        self.sample_table_header = self.get_sample_table_header(
            report_tables=self.report_tables, report_tables_index=report_tables_index
        )
        self.low_cluster_counts = self.get_low_cluster_counts(
            report_tables=self.report_tables,
            sample_table_header=self.sample_table_header,
            report_tables_index=report_tables_index,
            cluster_counts=cluster_counts,
        )
        self.top_unknown_barcodes = self.get_top_unknown_barcodes_table(
            report_tables=self.report_tables, report_tables_index=report_tables_index
        )
        LOG.info("Parsing complete!")

    @staticmethod
    def get_report_tables(html_content: bs4.BeautifulSoup) -> bs4.ResultSet:
        """Get the ReportTables inside the html report.

        Returns every ``<table>`` element whose ``id`` is ``"ReportTable"``.
        """
        return html_content.find_all("table", id="ReportTable")

    @staticmethod
    def get_sample_table_header(
        report_tables: bs4.ResultSet, report_tables_index: dict
    ) -> dict:
        """Get the header from the large table with all sample clusters.

        Returns:
            Mapping of parsed column header -> zero-based column position,
            built from the ``<th>`` cells of the table's first row.
        """
        cluster_count_table = report_tables[report_tables_index["cluster_count_table"]]
        return {
            parse_html_header(html_column_header): index
            for index, html_column_header in enumerate(
                cluster_count_table.tr.find_all("th")
            )
        }

    @staticmethod
    def get_low_cluster_counts(
        cluster_counts: int,
        report_tables: bs4.ResultSet,
        report_tables_index: dict,
        sample_table_header: dict,
    ) -> list:
        """Find samples with low cluster counts.

        Args:
            cluster_counts: threshold; rows with a count strictly below it are kept.
            report_tables: the report tables of the HTML report.
            report_tables_index: must provide the key ``"cluster_count_table"``.
            sample_table_header: column header -> position mapping for the table.

        Returns:
            The table rows (as parsed HTML elements) whose cluster count is
            below the threshold. Rows belonging to the project
            ``"indexcheck"`` are never included.
        """
        cluster_count_table = report_tables[report_tables_index["cluster_count_table"]]

        low_cluster_counts = []
        # The first <tr> is the header row, so only the rows after it are inspected.
        for sample_row in cluster_count_table.find_all("tr")[1:]:
            project, cluster_count = parse_html_project_cluster_counts(
                project_row=sample_row,
                header_index=sample_table_header,
            )
            if project != "indexcheck" and cluster_count < cluster_counts:
                low_cluster_counts.append(sample_row)

        return low_cluster_counts

    @staticmethod
    def get_top_unknown_barcodes_table(
        report_tables: bs4.ResultSet, report_tables_index: dict
    ) -> bs4.element.Tag:
        """Get the table with the top unknown barcodes.

        The table is picked by position, using the index stored under the key
        ``"top_unknown_barcode_table"``.
        """
        return report_tables[report_tables_index["top_unknown_barcode_table"]]

    def validate(self, reference_report_header: list):
        """Validate report structure.

        Runs the three structural checks (number of report tables, sample
        table header, top unknown barcodes table). All checks are evaluated
        before any result is inspected; results are then logged in order.

        Args:
            reference_report_header: the expected column headers, in order.

        Raises:
            IndexReportError: on the first check that did not pass.
        """
        LOG.info("Validating report")
        validation_results = [
            validate_report_tables(report_tables=self.report_tables),
            validate_index_report_header(
                reference_header=reference_report_header,
                sample_table_header=self.sample_table_header,
            ),
            validate_top_unknown_barcodes_table(
                top_unknown_barcodes_table=self.top_unknown_barcodes,
                flowcell_version=self.flowcell_version,
            ),
        ]
        for valid, message in validation_results:
            if not valid:
                LOG.error(message)
                raise IndexReportError
            LOG.info(message)
        LOG.info("Validation passed")

    def write_summary(self, report_tables_index: dict):
        """Compile a summary report of the bcl2fastq report.

        Writes ``laneBarcode_summary.html`` into ``self.out_dir``. The file
        contains the header row of the cluster count table followed by the
        low cluster count rows, then the ``<h2>`` heading found at the
        ``"top_unknown_barcode_table"`` index and the top unknown barcodes
        table itself.

        Args:
            report_tables_index: must provide the keys
                ``"cluster_count_table"`` and ``"top_unknown_barcode_table"``.
        """
        summary_path = Path(self.out_dir) / "laneBarcode_summary.html"

        # Resolve everything that can fail on a lookup before the file is
        # opened, so a bad index does not leave a truncated summary behind.
        cluster_count_header_row = self.report_tables[
            report_tables_index["cluster_count_table"]
        ].tr
        top_unknown_heading = self.html_content.find_all("h2")[
            report_tables_index["top_unknown_barcode_table"]
        ]

        # The file is only written, never read back; the encoding is pinned so
        # the output does not depend on the locale of the machine.
        with open(summary_path, "w", encoding="utf-8") as fo:
            fo.write(
                f"<h1>Flowcell summary: {self.flowcell_id}</h1>"
                "<h2>Low cluster counts</h2>"
            )
            fo.write('<table border="1" ID="ReportTable">')
            fo.write(str(cluster_count_header_row))
            for row in self.low_cluster_counts:
                fo.write(str(row))
            fo.write("</table>")
            fo.write(str(top_unknown_heading))
            fo.write(str(self.top_unknown_barcodes))
        LOG.info(
            f"Wrote indexcheck report summary to {self.out_dir}/laneBarcode_summary.html"
        )
168

169

170
def find_flowcell_id(run_parameters_path: Path) -> str:
    """Parse the RunParameters.xml file and retrieve flowcell ID.

    The ID is the text of the ``ExperimentName`` element located directly
    under the document root.
    """
    run_parameters = Et.parse(run_parameters_path)
    experiment_name = run_parameters.getroot().find("ExperimentName")
    return experiment_name.text
1✔
177

178

179
def find_flowcell_version(run_parameters_path: Path) -> Optional[str]:
    """Parse the RunParameters.xml file and retrieve flowcell version, e.g. S4, S1.

    The version is the text of the ``FlowCellMode`` child of the first
    ``RfidsInfo`` element found anywhere in the document.

    Returns:
        The flowcell version, or ``None`` when the file contains no
        ``RfidsInfo`` element.
    """
    root = Et.parse(run_parameters_path).getroot()

    # Only the first RfidsInfo element is of interest.
    rfids_info = next(root.iter("RfidsInfo"), None)
    if rfids_info is None:
        return None

    return rfids_info.find("FlowCellMode").text
1✔
189

190

191
def validate_top_unknown_barcodes_table(
    top_unknown_barcodes_table: bs4.element.Tag, flowcell_version: str
) -> Tuple[bool, str]:
    """Validate the top unknown barcodes table, checking that all lanes are present.

    The markup is stripped from the first row of the table and the number of
    times the word "Lane" occurs in the remaining text is compared with the
    lane count registered for the flowcell version.

    Args:
        top_unknown_barcodes_table: the top unknown barcodes table.
        flowcell_version: key into ``FLOWCELL_VERSION_LANE_COUNT``.

    Returns:
        ``(True, message)`` when the counts agree, ``(False, message)``
        otherwise.

    Raises:
        KeyError: if ``flowcell_version`` has no entry in
            ``FLOWCELL_VERSION_LANE_COUNT``.
    """
    header_row_text = re.sub(r"<.*?>", "", str(top_unknown_barcodes_table.tr))
    lane_header_count = header_row_text.count("Lane")

    # An explicit comparison is used instead of ``assert`` so the check still
    # runs when Python is started with -O.
    if lane_header_count != FLOWCELL_VERSION_LANE_COUNT[flowcell_version]:
        message = "Top unknown barcode table is not matching the reference, please check the report"
        return False, message

    message = "Top Unknown Barcodes table: Passed!"
    return True, message
1✔
209

210

211
def validate_report_tables(report_tables: bs4.ResultSet) -> Tuple[bool, str]:
    """Validate the number of report tables.

    Returns:
        ``(True, message)`` when exactly three report tables are present,
        ``(False, message)`` otherwise.
    """
    expected_table_count = 3

    # An explicit comparison is used instead of ``assert`` so the check still
    # runs when Python is started with -O.
    if len(report_tables) != expected_table_count:
        message = "The number of Report Tables are not matching the reference, please check the report"
        return False, message

    message = "Number of report tables: Passed!"
    return True, message
1✔
220

221

222
def validate_index_report_header(
    reference_header: list, sample_table_header: dict
) -> Tuple[bool, str]:
    """Validate the index report headers.

    Args:
        reference_header: the expected column headers, in order.
        sample_table_header: mapping whose keys, in insertion order, are the
            column headers found in the report.

    Returns:
        ``(True, message)`` when the keys of ``sample_table_header`` equal
        ``reference_header`` (same names, same order), ``(False, message)``
        otherwise.
    """
    # An explicit comparison is used instead of ``assert`` so the check still
    # runs when Python is started with -O.
    if reference_header != list(sample_table_header.keys()):
        message = (
            "The header in the cluster count sample table is not matching the\n"
            "control headers. Check if they need correction"
        )
        return False, message

    message = "Sample cluster count headers: Passed!"
    return True, message
1✔
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2025 Coveralls, Inc