/demux/utils/samplesheet.py
import re

from collections import OrderedDict
from copy import deepcopy
from pathlib import Path
from typing import Set, Tuple

from demux.constants.constants import COMMA, NEW_LINE


class SampleSheetValidationException(Exception):
    def __init__(self, section, msg, line_nr):
        self.section = section
        self.msg = msg
        self.line_nr = line_nr

    def __str__(self):
        return repr(
            "Section '{}', Line '{}': {}".format(self.section, self.line_nr, self.msg)
        )


class SampleSheetParseException(Exception):
    pass


class Line(dict):
    @staticmethod
    def _reverse_complement(dna):
        complement = {"A": "T", "C": "G", "G": "C", "T": "A"}
        return "".join([complement[base] for base in dna[::-1]])

    # NOTE: exposed as a property, so 'delim' and 'revcomp' always keep their defaults.
    @property
    def dualindex(self, delim="-", revcomp=False):
        if "index2" in self and len(self["index2"]):
            index2 = (
                self._reverse_complement(self["index2"]) if revcomp else self["index2"]
            )
            return self["index"] + delim + index2
        return self["index"]


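# Illustrative sketch (not part of the original module): how a parsed Line composes
# its dual index. The column names follow the base Samplesheet header_map; the index
# sequences below are made up for the example.
#
#   line = Line({"index": "ATTACTCG", "index2": "TATAGCCT"})
#   line.dualindex                           # -> "ATTACTCG-TATAGCCT"
#   Line({"index": "ATTACTCG"}).dualindex    # -> "ATTACTCG"
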
class Samplesheet(object):
    """SampleSheet.

    Stores the samplesheet in sections: self.section
    e.g. the [Data] section will be stored in self.section['[Data]']

    The [Data] section is the actual samplesheet. It consists of a '[Data]' section marker,
    a column header, and the rows of samplesheet data.

    The section is split into lines; each data line is turned into a dictionary
    (universal column names as keys) and stored in self.samplesheet.

    e.g.:

    [Data],,
    FCID,SampleID,index
    HHGGFFSS,ADM1123A1,ACTGACTG

    self.samplesheet = [
        {'fcid': 'HHGGFFSS', 'sample_id': 'ADM1123A1', 'index': 'ACTGACTG'}
    ]

    The original split line with the section marker is stored in
    self.section_markers['[Data]'] = ['[Data]', '', ''].
    """

    # known sections
    HEADER = "[Header]"
    DATA = "[Data]"

    # For the [Data] section: provide a universal header line.
    # The mapping is universal: expected, e.g. for NIPT the expected column Sample_ID
    # is mapped to the more universal 'sample_id', i.e. 'sample_id': 'Sample_ID'.
    # Universal keys follow the Python variable naming rules:
    # lowercase with words separated by underscores as necessary to improve readability.
    # One header map can be defined for each samplesheet type.
    header_map = {
        "fcid": "FCID",
        "lane": "Lane",
        "sample_id": "SampleID",
        "sample_ref": "SampleRef",
        "index": "index",
        "index2": "index2",
        "sample_name": "SampleName",
        "control": "Control",
        "recipe": "Recipe",
        "operator": "Operator",
        "project": "Project",
    }

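    # Illustrative sketch (not part of the original module): how header_map is used.
    # _get_data_header() reverses the mapping to translate the sheet's own header row
    # into the universal keys, so downstream code can use a single vocabulary.
    #
    #   header_map_r = {v: k for k, v in Samplesheet.header_map.items()}
    #   [header_map_r[col] for col in ["FCID", "Lane", "SampleID", "index"]]
    #   # -> ['fcid', 'lane', 'sample_id', 'index']
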
    def _get_flowcell(self):
        # look the flowcell up in the rows keyed by the original header names
        for line in self.samplesheet_r:
            if "Flowcell" in line:
                return line["Flowcell"]
        return None

    def __init__(self, samplesheet_path):
        self.samplesheet_path = samplesheet_path
        self.original_sheet = []  # all lines of the samplesheet
        self.section_markers = dict()  # [Name]: line; does this section have a named section
        self.parse(samplesheet_path)

    def _get_data_header(self):
        try:
            header_r = self._get_data_header_r()
            header_map_r = dict((v, k) for k, v in self.header_map.items())
            header = [header_map_r[k] for k in header_r]
        except KeyError as e:
            msg = f"Incorrect column found - {e}"
            raise SampleSheetValidationException(section="header", msg=msg, line_nr=0)
        return header

    def _get_data_header_r(self):
        return self.section[self.DATA][0]

    def _get_header_key(self, key):
        if key not in self.header_map:
            raise KeyError("'{}' not in header_map!".format(key))
        return self.header_map[key]

    def parse(self, samplesheet_path, delim=","):
        """
        Parses a samplesheet in its pseudo-CSV format.

        Called with the samplesheet path; fills self.section with one entry per
        section: {'[Section]': [split lines]}.
        """

        name = "[Data]"
        self.section = OrderedDict()
        with open(samplesheet_path) as csvfile:
            for line in csvfile:
                line = line.strip()
                line = line.split(delim)
                if not any(line):
                    # skip empty lines (strip/split turns them into [''])
                    continue
                self.original_sheet.append(line)
                if line[0].startswith("["):
                    name = line[0]
                    self.section_markers[name] = line
                    # skip the actual section header
                    continue

                if name not in self.section:
                    self.section[name] = []

                self.section[name].append(line)

        if self.DATA not in self.section:
            raise SampleSheetParseException("No data found!")

        header = self._get_data_header()
        self.samplesheet = [
            Line(dict(zip(header, line))) for line in self.section[self.DATA][1:]
        ]

        header_r = self._get_data_header_r()
        self.samplesheet_r = [
            dict(zip(header_r, line)) for line in self.section[self.DATA][1:]
        ]

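    # Illustrative sketch (not part of the original module): what parse() produces for
    # a minimal sheet (the path and values below are made up).
    #
    #   [Data],,,
    #   FCID,Lane,SampleID,index
    #   HHGGFFSS,1,ADM1123A1,ACTGACTG
    #
    #   sheet = Samplesheet("SampleSheet.csv")   # hypothetical path
    #   sheet.section["[Data]"][0]   # -> ['FCID', 'Lane', 'SampleID', 'index']
    #   sheet.samplesheet[0]         # -> {'fcid': 'HHGGFFSS', 'lane': '1',
    #                                #     'sample_id': 'ADM1123A1', 'index': 'ACTGACTG'}
    #   sheet.samplesheet_r[0]       # -> same row keyed by the original header names
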
    def lines(self):
        """Yields all lines of the [Data] section."""
        for line in self.samplesheet:
            yield line

    def lines_r(self):
        """Yields all lines of the [Data] section based on the original header"""
        for line in self.samplesheet_r:
            yield line

    def raw(self, delim=",", end="\n"):
        """Reconstructs the sample sheet."""
        rs = []
        for line in self.original_sheet:
            rs.append(delim.join(line))
        return end.join(rs)

    def samples(self, column="sample_id"):
        """Return all samples in the samplesheet"""
        return self.column(column)

    def samples_r(self, column="SampleID"):
        """Return all samples in the samplesheet based on the original header"""
        return self.column_r(column)

    def column(self, column):
        """Return all values from a column in the samplesheet"""
        for line in self.samplesheet:
            yield line[column]

    def column_r(self, column):
        """Return all values from a column in the samplesheet based on the original header"""
        for line in self.samplesheet_r:
            yield line[column]

    def cell(self, line, column):
        """return the contents of a column in a line"""

        return line[self._get_header_key(column)]

    def lines_per_column(self, column, content):
        """Return all lines with the same column content
        e.g. return all lines of column='lane' content='1'"""
        for line in self.samplesheet:
            if line[column] == content:
                yield line

    def lines_per_column_r(self, column, content):
        """Return all lines with the same column content
        e.g. return all lines of column='Lane' content='1'"""
        for line in self.samplesheet_r:
            if line[column] == content:
                yield line

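    # Illustrative sketch (not part of the original module): the accessors above are
    # generators, so wrap them in list() when the full column is needed.
    #
    #   list(sheet.samples())                      # universal 'sample_id' column
    #   list(sheet.column_r("SampleID"))           # same column via the original header
    #   list(sheet.lines_per_column("lane", "1"))  # all rows on lane 1
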
    def is_pooled_lane(self, lane: int, column="lane"):
        """Return True if lane contains multiple samples"""
        lane_count = 0
        lane = str(lane)
        for line in self.samplesheet:
            if line[column] == lane:
                lane_count += 1

            if lane_count > 1:
                return True

        return False

    def is_pooled_lane_r(self, lane, column="lane"):
        """Return True if lane contains multiple samples based on the original header"""
        lane_count = 0
        lane = str(lane)
        for line in self.samplesheet_r:
            if line[column] == lane:
                lane_count += 1

            if lane_count > 1:
                return True

        return False

    def pooled_lanes(self) -> Set[int]:
        """Return set of pooled lanes"""
        pooled_lanes = set()
        for lane in self.column("lane"):
            if self.is_pooled_lane(lane):
                pooled_lanes.add(int(lane))
        return pooled_lanes

    def sample_in_pooled_lane(self, sample: str) -> bool:
        """Return True if sample is in pooled lane"""

        for line in self.lines():
            if sample == line["sample_id"]:
                if self.pooled_lanes().intersection({int(line["lane"])}):
                    return True
        return False

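    # Illustrative sketch (not part of the original module): pooled-lane checks on a
    # hypothetical sheet where two samples share lane 1.
    #
    #   sheet.is_pooled_lane(1)                    # -> True (two rows with lane '1')
    #   sheet.pooled_lanes()                       # -> {1}
    #   sheet.sample_in_pooled_lane("ADM1123A1")   # -> True
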
    def validate(self):
        """General validation of a samplesheet"""

        def _validate_length(section):
            if len(section) > 2:
                header = section[0]
                lines = section[1:]
                for i, line in enumerate(lines):
                    if len(header) != len(line):
                        msg = "'{}': #fields != #fields in header".format(line)
                        # add i + 2 as it makes it easier to spot the 'wrong' line
                        return (msg, i + 2)
            return True

        def _validate_unique_index(samplesheet):
            lanes = list(set(self.column("lane")))
            for lane in lanes:
                if self.is_pooled_lane(lane, column="lane"):
                    sample_of = dict()
                    for line in self.lines_per_column("lane", lane):
                        index = line["index"]
                        if index not in sample_of:
                            sample_of[index] = set()
                        sample_of[index].add(line["sample_id"])
                    for index, samples in sample_of.items():
                        if len(samples) > 1:
                            samples_list = list(samples)
                            samples_list.sort()
                            return (
                                "Same index for {} on lane {}".format(
                                    " , ".join(samples_list), lane
                                ),
                                index,
                            )

            return True

        def _validate_sample_name(samplesheet):
            for i, line in enumerate(samplesheet):
                forbidden_chars = set(" ")
                if any((c in forbidden_chars) for c in line["sample_id"]):
                    return (
                        "Sample contains forbidden chars ({}): {}".format(
                            forbidden_chars, line["sample_id"]
                        ),
                        i + 2,
                    )

        rs = _validate_unique_index(self.samplesheet)
        if isinstance(rs, tuple):
            raise SampleSheetValidationException(self.DATA, rs[0], rs[1])

        for section_marker, section in self.section.items():
            validation_section = section[:]  # copy; section markers were stripped during parsing
            rs = _validate_length(validation_section)
            if isinstance(rs, tuple):
                raise SampleSheetValidationException(section_marker, rs[0], rs[1])

        rs = _validate_sample_name(self.samplesheet)
        if isinstance(rs, tuple):
            raise SampleSheetValidationException(self.DATA, rs[0], rs[1])

        return True


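# Illustrative sketch (not part of the original module): a failed validation raises
# SampleSheetValidationException, e.g. when two samples share an index on a pooled
# lane the rendered message reads roughly
#
#   "Section '[Data]', Line 'ACTGACTG': Same index for S1 , S2 on lane 1"
#
# (the 'Line' slot carries the offending index, since _validate_unique_index returns it).
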
class HiSeqXSamplesheet(Samplesheet):
    def unparse(self, delim=","):
        """Reconstruct the sample sheet based on the (modified) parsed values."""

        yield "[Data]"
        yield delim.join(self._get_data_header_r())
        for line in self.samplesheet:
            line_r = []
            for part in self._get_data_header():
                line_r.append(line[part])
            yield delim.join(line_r)

    def validate(self):
        Samplesheet.validate(self)

        def _validate_project_samplename() -> Tuple[str, int]:
            for i, line in enumerate(self.lines()):
                if line["project"] != line["sample_name"]:
                    # add i + 2 as it makes it easier to spot the 'wrong' line
                    line_nr = i + 2
                    msg = "Project and SampleName cannot be different!"
                    return msg, line_nr

        def _validate_index() -> Tuple[str, int]:
            for i, line in enumerate(self.lines()):
                if line["index"] == "":
                    # add i + 2 as it makes it easier to spot the 'wrong' line
                    line_nr = i + 2
                    msg = "Missing index!"
                    return msg, line_nr

        def _validate_index_types() -> Tuple[str, None]:
            """Check if there are multiple types of indexes, meaning single, dual, or both"""

            indexes = []
            try:
                for index in zip(self.column("index"), self.column("index2")):
                    new_index_type = (len(index[0]), len(index[1]))
                    if not any(
                        [index_type == new_index_type for index_type in indexes]
                    ):
                        indexes.append(new_index_type)
            except KeyError:
                # single-index sheets have no 'index2' column
                for index in self.column("index"):
                    new_index_type = (len(index), 0)
                    if not any(
                        [index_type == new_index_type for index_type in indexes]
                    ):
                        indexes.append(new_index_type)
            if len(indexes) > 1:
                msg = "Multiple index types in SampleSheet!"
                line_nr = None
                return msg, line_nr

        for rs in [
            _validate_index(),
            _validate_project_samplename(),
            _validate_index_types(),
        ]:
            if isinstance(rs, tuple):
                raise SampleSheetValidationException(self.DATA, rs[0], rs[1])

        return True


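# Illustrative sketch (not part of the original module): unparse() regenerates a
# [Data]-only sheet from the parsed rows, so edits made on self.samplesheet are kept.
#
#   sheet = HiSeqXSamplesheet("SampleSheet.csv")   # hypothetical path
#   for row in sheet.lines():
#       row["index"] = row["index"].upper()
#   text = "\n".join(sheet.unparse())
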
class iseqSampleSheet(Samplesheet):
    header_map = {
        "fcid": "FCID",
        "sample_id": "Sample_ID",
        "sample_name": "Sample_Name",
        "description": "Description",
        "index": "index",
        "index2": "index2",
        "project": "Sample_Project",
    }


class MiseqSamplesheet(Samplesheet):
    header_map = {
        "lane": "Lane",
        "sample_id": "Sample_ID",
        "sample_name": "Sample_Name",
        "sample_plate": "Sample_Plate",
        "sample_well": "Sample_Well",
        "i7_index_id": "I7_Index_ID",
        "index": "index",
        "sample_project": "Sample_Project",
        "index2": "index2",
        "i5_index_id": "I5_Index_ID",
        "genome_folder": "GenomeFolder",
        "description": "Description",
    }

    si5 = {
        "TAGATCGC": "S501",
        "CTCTCTAT": "S502",
        "TATCCTCT": "S503",
        "AGAGTAGA": "S504",
        "GTAAGGAG": "S505",
        "ACTGCATA": "S506",
        "AAGGAGTA": "S507",
        "CTAAGCCT": "S508",
        "CGTCTAAT": "S510",
        "TCTCTCCG": "S511",
        "TCGACTAG": "S513",
        "TTCTAGCT": "S515",
        "CCTAGAGT": "S516",
        "GCGTAAGA": "S517",
        "CTATTAAG": "S518",
        "AAGGCTAT": "S520",
        "GAGCCTTA": "S521",
        "TTATGCGA": "S522",
        #'TAGATCGC': 'N501',
        #'CTCTCTAT': 'N502',
        #'TATCCTCT': 'N503',
        #'AGAGTAGA': 'N504',
        #'GTAAGGAG': 'N505',
        #'ACTGCATA': 'N506',
        #'AAGGAGTA': 'N507',
        #'CTAAGCCT': 'N508',
    }
    ni7 = {
        "TAAGGCGA": "N701",
        "CGTACTAG": "N702",
        "AGGCAGAA": "N703",
        "TCCTGAGC": "N704",
        "GGACTCCT": "N705",
        "TAGGCATG": "N706",
        "CTCTCTAC": "N707",
        "CAGAGAGG": "N708",
        "CGAGGCTG": "N710",
        "AAGAGGCA": "N711",
        "GTAGAGGA": "N712",
        "GCTCATGA": "N714",
        "ATCTCAGG": "N715",
        "ACTCGCTA": "N716",
        "GGAGCTAC": "N718",
        "GCGTAGTA": "N719",
        "CGGAGCCT": "N720",
        "TACGCTGC": "N721",
        "ATGCGCAG": "N722",
        "TAGCGCTC": "N723",
        "ACTGAGCG": "N724",
        "CCTAAGAC": "N726",
        "CGATCAGT": "N727",
        "TGCAGCTA": "N728",
        "TCGACGTC": "N729",
    }
    di7 = {
        "ATTACTCG": "D701",
        "TCCGGAGA": "D702",
        "CGCTCATT": "D703",
        "GAGATTCC": "D704",
        "ATTCAGAA": "D705",
        "GAATTCGT": "D706",
        "CTGAAGCT": "D707",
        "TAATGCGC": "D708",
        "CGGCTATG": "D709",
        "TCCGCGAA": "D710",
        "TCTCGCGC": "D711",
        "AGCGATAG": "D712",
    }
    di5 = {
        "TATAGCCT": "D501",
        "ATAGAGGC": "D502",
        "CCTATCCT": "D503",
        "GGCTCTGA": "D504",
        "AGGCGAAG": "D505",
        "TAATCTTA": "D506",
        "CAGGACGT": "D507",
        "GTACTGAC": "D508",
    }

    def __init__(self, samplesheet_path, flowcell=None, sequencing_date=None):
        Samplesheet.__init__(self, samplesheet_path)
        # derive flowcell and sequencing date from the run directory name (split on '_')
        if flowcell is None:
            flowcell = Path(samplesheet_path).parent.name.split("_")[-1]
        if sequencing_date is None:
            sequencing_date = Path(samplesheet_path).parent.name.split("_")[0]
        self.flowcell = flowcell
        self.sequencing_date = sequencing_date

    def _get_flowcell(self):
        return self.flowcell

    def to_demux(self, delim=",", end="\n"):
        """Convert miseq to hiseq style samplesheet for demultiplexing."""

        checked_indexes = {}  # the indexes in the SampleSheet

        def clean(value):
            return re.sub(r"[ _/]+", "", value)

        def check_index(index):
            checked_indexes[index] = 1

        def get_undetermined_indexes():
            # combine the D indexes
            for di7_index, di7_name in self.di7.items():
                for di5_index, di5_name in self.di5.items():
                    d_index = di7_index + "-" + di5_index
                    if d_index not in checked_indexes:
                        yield d_index, str(di7_name + "-" + di5_name)

            # combine the other indexes
            for ni7_index, ni7_name in self.ni7.items():
                for si5_index, si5_name in self.si5.items():
                    ns_index = ni7_index + "-" + si5_index
                    if ns_index not in checked_indexes:
                        yield ns_index, str(ni7_name + "-" + si5_name)

        expected_header = [
            "FCID",
            "Lane",
            "SampleID",
            "SampleRef",
            "Index",
            "Description",
            "Control",
            "Recipe",
            "Operator",
            "SampleProject",
        ]

        # get the experiment name
        flowcell_id = self._get_flowcell()
        cur_date = self.sequencing_date

        header = self.section[self.DATA][0]  # '0' is the csv header
        data_lines = []  # the new data section. Each line holds a dict with the right header keys
        data_lines.append(expected_header)
        for line in self.samplesheet:
            data_line = {}
            data_line["FCID"] = flowcell_id
            data_line["Lane"] = "1"
            data_line["SampleID"] = cur_date + "-" + clean(line["sample_id"])
            data_line["SampleRef"] = "hg19"
            data_line["Index"] = clean(line["index"]) + "-" + clean(line["index2"])
            data_line["Description"] = line["description"]
            data_line["Control"] = "N"
            data_line["Recipe"] = "R1"
            data_line["Operator"] = "MS"
            data_line["SampleProject"] = clean(line["sample_project"])

            check_index(clean(line["index"]) + "-" + clean(line["index2"]))

            ordered_line = []
            for head in expected_header:
                ordered_line.append(data_line[head])
            data_lines.append(ordered_line)

        # add the undetermined indexes
        for undetermined_index, undetermined_index_name in get_undetermined_indexes():
            data_line = {}
            data_line["FCID"] = flowcell_id
            data_line["Lane"] = "1"
            data_line["SampleID"] = cur_date + "-" + undetermined_index_name
            data_line["SampleRef"] = "hg19"
            data_line["Index"] = undetermined_index
            data_line["Description"] = "ctmr"
            data_line["Control"] = "N"
            data_line["Recipe"] = "R1"
            data_line["Operator"] = "script"
            data_line["SampleProject"] = "Undetermined"

            ordered_line = []
            for head in expected_header:
                ordered_line.append(data_line[head])
            data_lines.append(ordered_line)

        rs = []
        for line in data_lines:
            rs.append(delim.join(line))

        return end.join(rs)

    def validate(self):
        Samplesheet.validate(self)

        def _validate_missing_index():
            """We know what indexes to expect in the samplesheet. Check which ones are missing
            or have been added."""

            def is_readymade_index(index):
                for di7_index, di7_name in self.di7.items():
                    for di5_index, di5_name in self.di5.items():
                        d_index = di7_index + "-" + di5_index
                        if index == d_index:
                            return index

                # combine the other indexes
                for ni7_index, ni7_name in self.ni7.items():
                    for si5_index, si5_name in self.si5.items():
                        ns_index = ni7_index + "-" + si5_index
                        if index == ns_index:
                            return index

            for i, line in enumerate(self.lines()):
                readymade_index = is_readymade_index(line["index"])

                if not readymade_index:
                    return (
                        "Index {} not in readymade indexes!".format(line["index"]),
                        i,
                    )

        rs = _validate_missing_index()
        if isinstance(rs, tuple):
            raise SampleSheetValidationException(self.DATA, rs[0], rs[1])


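# Illustrative sketch (not part of the original module): to_demux() flattens a MiSeq
# sheet into the HiSeq-style header
# FCID,Lane,SampleID,SampleRef,Index,Description,Control,Recipe,Operator,SampleProject
# with every sample forced onto lane 1 and all unused ready-made index combinations
# appended as 'Undetermined' rows.
#
#   sheet = MiseqSamplesheet("190301_M00123_0001_000000000-ABCDE/SampleSheet.csv")  # hypothetical path
#   print(sheet.to_demux())
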
class HiSeq2500Samplesheet(Samplesheet):
    header_map = {
        "fcid": "FCID",
        "lane": "Lane",
        "sample_id": "SampleID",
        "sample_ref": "SampleRef",
        "index": "Index",
        "index2": "Index2",
        "sample_name": "SampleName",
        "control": "Control",
        "recipe": "Recipe",
        "operator": "Operator",
        "description": "Description",
        "project": "SampleProject",
    }

    def convert(self, delim: str = COMMA, end: str = NEW_LINE) -> str:
        """Converts an old HiSeq2500 sample sheet for use on Hasta"""

        def _is_dual_index(line: list) -> bool:
            return "Index2" in line

        def _insert_empty_index(line: list) -> None:
            line.insert(5, "")

        converted_samplesheet: list = list()
        converted_samplesheet.append(self.DATA)
        header: list = self.section[self.DATA][0]
        sample_rows: list = self.section[self.DATA][1:]
        new_header: list = list(Samplesheet.header_map.values())
        converted_header: str = delim.join(new_header)
        converted_samplesheet.append(converted_header)

        for row in sample_rows:
            if not _is_dual_index(header):
                _insert_empty_index(row)
            converted_row = delim.join(row)
            converted_samplesheet.append(converted_row)

        return end.join(converted_samplesheet)


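# Illustrative sketch (not part of the original module): convert() rewrites the header
# to the base Samplesheet column names and, for single-index sheets, pads each row with
# an empty index2 field so every row matches the new header width.
#
#   sheet = HiSeq2500Samplesheet("SampleSheet.csv")   # hypothetical path
#   print(sheet.convert())
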
class NIPTSamplesheet(Samplesheet):
    header_map = {
        "lane": "Lane",
        "sample_id": "Sample_ID",
        "sample_name": "Sample_Name",
        "sample_plate": "Sample_Plate",
        "sample_well": "Sample_Well",
        "i7_index_id": "I7_Index_ID",
        "index": "index",
        "sample_project": "Sample_Project",
        "description": "Description",
        "sample_type": "SampleType",
        "library_nm": "Library_nM",
    }

    def _get_flowcell(self):
        # get the experiment name
        for line in self.section[self.HEADER]:
            if line[0] == "Experiment Name":
                return line[1]
        return None

    def _get_project_id(self):
        # get the investigator name
        for line in self.section[self.HEADER]:
            if line[0] == "Investigator Name":
                return line[1].split("_")[1]
        return None

    def massage(self, delim=",", end="\n"):
        """Abuses the Investigator Name field to store information about the run.

        Reshuffles the [Data] section so that it becomes a valid sample sheet.

        Returns a massaged SampleSheet.
        """
        # get the experiment name
        flowcell_id = self._get_flowcell()

        section_copy = deepcopy(self.section)

        for i, line in enumerate(section_copy[self.HEADER]):
            if line[0] == "Investigator Name":
                investigator_name = re.split(" |_", line[1])
                if investigator_name[-1] != flowcell_id:
                    investigator_name.append(flowcell_id)
                line[1] = "_".join(investigator_name)
                section_copy[self.HEADER][i] = line

        rs = []
        for section_marker, section in section_copy.items():
            if section_marker in self.section_markers:
                rs.append(delim.join(self.section_markers[section_marker]))
            for line in section:
                rs.append(delim.join(line))
        return end.join(rs)

    def to_demux(self, delim=",", end="\n"):
        """Replaces the [Data] section with a demuxable [Data] section.

        This is non-destructive and will only return a demuxable samplesheet.

        Converts the Data section from
            Lane,Sample_ID,Sample_Name,Sample_Plate,Sample_Well,I7_Index_ID,index,Sample_Project,Description,SampleType
        to
            FCID,Lane,SampleID,SampleRef,Index,Description,Control,Recipe,Operator,SampleProject
        """

        expected_header = [
            "FCID",
            "Lane",
            "SampleID",
            "SampleRef",
            "Index",
            "Description",
            "Control",
            "Recipe",
            "Operator",
            "SampleProject",
        ]

        # get the experiment name
        flowcell_id = self._get_flowcell()
        project_id = self._get_project_id()

        header = self.section[self.DATA][0]  # '0' is the csv header
        data_lines = []  # the new data section. Each line holds a dict with the right header keys
        data_lines.append(expected_header)
        for i, line in enumerate(self.section[self.DATA][1:]):
            data_line = dict(zip(header, line))

            data_line["FCID"] = flowcell_id
            data_line["SampleID"] = data_line["Sample_ID"]
            data_line["SampleRef"] = "hg19"
            data_line["Index"] = data_line["index"]
            data_line["Description"] = data_line["SampleType"]
            data_line["Control"] = project_id
            data_line["Recipe"] = "R1"
            data_line["Operator"] = "NN"
            data_line["SampleProject"] = project_id

            ordered_line = []
            for head in expected_header:
                ordered_line.append(data_line[head])
            data_lines.append(ordered_line)

        rs = []
        for line in data_lines:
            rs.append(delim.join(line))
        return end.join(rs)
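
# Illustrative sketch (not part of the original module): the typical NIPT flow is to
# massage() the sheet first (appending the flowcell to the Investigator Name field),
# then build the demultiplexing sheet with to_demux().
#
#   sheet = NIPTSamplesheet("SampleSheet.csv")   # hypothetical path
#   fixed_csv = sheet.massage()
#   demux_csv = sheet.to_demux()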