4627485829

Build Type

push

github-actions

Committed by karlnyr

Commit Message

remove conda stuff, expand aliases

Run Details

501 of 941 relevant lines covered (53.24%)

0.53 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

35.45

/demux/utils/novaseq_samplesheet.py

"""
    Create a samplesheet for NovaSeq flowcells
"""
import csv
import sys
from distutils.version import StrictVersion, LooseVersion
from typing import Any, Dict, List, Union

from cglims.api import ClinicalLims
from demux.constants.constants import COMMA, DASH, SPACE
from demux.constants.samplesheet import (
    NIPT_INDEX_LENGTH,
    PARAMETER_TO_VERSION,
    REV_COMP_CONTROL_SOFTWARE_VERSION,
    REV_COMP_REAGENT_KIT_VERSION,
)
from demux.exc import NoValidReagentKitFound

from .runparameters import NovaseqRunParameters
from .samplesheet import Samplesheet


class CreateNovaseqSamplesheet:
    """Create a raw sample sheet for NovaSeq flowcells

    The sample rows of one flowcell are fetched from LIMS and turned into
    delimiter separated text by ``construct_samplesheet``.
    """

    # Keys read from every sample row, in this order, when
    # ``construct_samplesheet`` builds the lines of the sample sheet
    LIMS_KEYS = [
        "fcid",
        "lane",
        "sample_id",
        "sample_ref",
        "index",
        "index2",
        "sample_name",
        "control",
        "recipe",
        "operator",
        "project",
    ]

    def __init__(
        self,
        dummy_indexes_file: str,
        flowcell: str,
        lims_config: dict,
        pad: bool,
        runs_dir: str,
    ):
        """Store the run settings and create the LIMS and run parameters helpers

        :param dummy_indexes_file: path of the CSV file read by ``add_dummy_indexes``
        :param flowcell: flowcell the sample sheet is created for
        :param lims_config: keyword arguments forwarded to ``ClinicalLims``
        :param pad: when true, 8 base indexes are handed to ``pad_and_rc_indexes``
        :param runs_dir: handed to ``NovaseqRunParameters`` together with the flowcell
        """
        # plain settings
        self.dummy_indexes_file = dummy_indexes_file
        self.flowcell = flowcell
        self.pad = pad

        # helper objects; the LIMS connection is still created before the run parameters
        self.lims_api = ClinicalLims(**lims_config)
        self.runparameters = NovaseqRunParameters(flowcell, runs_dir)

    @property
    def control_software_version(self) -> StrictVersion:
        """Returns control software version in StrictVersion format"""
        return StrictVersion(self.runparameters.control_software_version)

    @property
    def reagent_kit_version(self) -> LooseVersion:
        """Translate the reagent kit entry of the run parameters into a LooseVersion

        The entry has to be one of the keys of PARAMETER_TO_VERSION, otherwise
        NoValidReagentKitFound is raised.
        """
        parameter = self.runparameters.reagent_kit_version
        if parameter in PARAMETER_TO_VERSION:
            return LooseVersion(PARAMETER_TO_VERSION[parameter])

        expected = ", ".join(PARAMETER_TO_VERSION)
        raise NoValidReagentKitFound(
            f"Expected reagent kit version {expected}. Found {parameter} instead. Exiting",
        )

    @property
    def header(self) -> list:
        """Column names of the sample sheet: the values of Samplesheet.header_map"""
        column_names = [*Samplesheet.header_map.values()]
        return column_names

    def get_dummy_sample_information(
        self, dummy_index: str, lane: int, name: str
    ) -> Dict[Union[str, Any], Union[Union[str, int], Any]]:
        """Constructs and returns a dummy sample in novaseq samplesheet format"""

        return {
            "control": "N",
            "description": "",
            "fcid": self.flowcell,
            "index": dummy_index,
            "index2": "",
            "lane": lane,
            "operator": "script",
            "project": "indexcheck",
            "recipe": "R1",
            "sample_id": name.replace(" ", "-").replace("(", "-").replace(")", "-"),
            "sample_name": "indexcheck",
            "sample_ref": "hg19",
        }

    @staticmethod
    def get_project_name(project: str, delimiter=SPACE) -> str:
        """Return the part of the project name in front of the first delimiter"""
        # only the first piece is needed, so one split is enough
        first_part = project.split(delimiter, 1)[0]
        return first_part

    @staticmethod
    def get_reverse_complement_dna_seq(dna: str) -> str:
        """Generates the reverse complement of a DNA sequence"""
        complement = {"A": "T", "C": "G", "G": "C", "T": "A"}
        return "".join(complement[base] for base in reversed(dna))

    @staticmethod
    def is_dual_index(index: str, delimiter=DASH) -> bool:
        """Tell whether an index of the raw samplesheet is a dual index

        An index counts as dual when it contains the delimiter.
        """
        has_delimiter = delimiter in index
        return has_delimiter

    @staticmethod
    def is_dummy_sample_in_samplesheet(dummy_index: str, sample_indexes: list) -> bool:
        """Determines if a dummy sample is already present in the samplesheet"""
        return any(
            sample_index.startswith(dummy_index) for sample_index in sample_indexes
        )

    @staticmethod
    def get_sample_indexes_in_lane(lane: str, raw_samplesheet: List[Dict]) -> list:
        """Returns all sample indexes in a given lane"""
        return [sample["index"] for sample in raw_samplesheet if sample["lane"] == lane]

    def replace_project_with_lims_sample_name(
        self, raw_samplesheet: List[Dict]
    ) -> List[Dict]:
        """Replaces the project in the SampleName column with the LIMS Sample Name"""
        for sample in raw_samplesheet:
            sample["sample_name"] = self.lims_api.sample(sample["sample_id"]).name
        return raw_samplesheet

    def is_reverse_complement(self) -> bool:
        """
        Tell whether the second index should be the reverse complement: the run must have used NovaSeq control
        software version REV_COMP_CONTROL_SOFTWARE_VERSION or later and reagent kit version
        REV_COMP_REAGENT_KIT_VERSION or later
        """
        # the reagent kit version is only looked at when the control software is recent enough
        if not self.control_software_version >= REV_COMP_CONTROL_SOFTWARE_VERSION:
            return False
        return self.reagent_kit_version >= REV_COMP_REAGENT_KIT_VERSION

    def is_nipt_samplesheet(self) -> bool:
        """Tell whether the sample sheet is for NIPT demultiplexing: the index reads in the run parameters
        equal NIPT_INDEX_LENGTH"""
        index_reads = self.runparameters.index_reads
        return index_reads == NIPT_INDEX_LENGTH

    def add_dummy_indexes(self, raw_samplesheet: List[Dict]) -> List[Dict]:
        """Add the dummy indexes that are missing from a lane to the raw sample sheet.

        Dummy indexes are used to check for index contamination. The dummy
        samples are read from ``self.dummy_indexes_file``, a delimited file
        with two columns per row: sample name and index. For every lane in
        the sample sheet, each dummy index that no sample index of that lane
        starts with gets a dummy sample row.

        :param raw_samplesheet: sample rows, each with at least "lane" and "index"
        :return: the same list object, extended in place with the dummy samples
        """
        # newline="" is what the csv module asks for when it is handed a file object
        with open(f"{self.dummy_indexes_file}", newline="") as csv_file:
            # csv.reader yields an empty list for a blank line (e.g. a trailing
            # newline); skip those so the two-value unpacking below cannot fail on them
            dummy_samples = [
                row for row in csv.reader(csv_file, delimiter=COMMA) if row
            ]

        # unique lanes in order of first appearance: unlike a set, this keeps
        # the order of the added rows the same from run to run
        lanes = list(dict.fromkeys(sample["lane"] for sample in raw_samplesheet))

        new_dummy_samples = []
        for lane in lanes:
            sample_indexes = self.get_sample_indexes_in_lane(lane, raw_samplesheet)
            for sample_name, dummy_index in dummy_samples:
                if self.is_dummy_sample_in_samplesheet(dummy_index, sample_indexes):
                    continue
                new_dummy_samples.append(
                    self.get_dummy_sample_information(dummy_index, lane, sample_name)
                )

        raw_samplesheet.extend(new_dummy_samples)

        return raw_samplesheet

    def remove_unwanted_indexes(self, raw_samplesheet: List[Dict]) -> List[Dict]:
        """Filter out indexes of unwanted length and single indexes"""

        raw_samplesheet = [
            line for line in raw_samplesheet if self.is_dual_index(line["index"])
        ]
        return raw_samplesheet

    def adapt_indexes(self, raw_samplesheet: List[Dict]) -> List[Dict]:
        """Adapts the indexes: pads all indexes so that all indexes have a length equal to the number  of index reads,
        and takes the reverse complement of index 2 in case of the new novaseq software control version
        (1.7) in combination with the new reagent kit (version 1.5)"""

        is_reverse_complement = self.is_reverse_complement()

        for line in raw_samplesheet:
            index1, index2 = line["index"].split("-")
            if self.pad and len(index1) == 8:
                line["index"], line["index2"] = self.pad_and_rc_indexes(
                    index1, index2, is_reverse_complement
                )
            elif len(index2) == 10:
                line["index"] = index1
                line["index2"] = (
                    self.get_reverse_complement_dna_seq(index2)
                    if is_reverse_complement
                    else index2
                )
            else:
                line["index"], line["index2"] = index1, index2

        return raw_samplesheet

    def pad_and_rc_indexes(
        self, index1: str, index2: str, is_reverse_complement: bool
    ) -> tuple:
        """Pads and reverse complements indexes"""

        if self.runparameters.index_reads == 8:
            index2 = (
                self.get_reverse_complement_dna_seq(index2)
                if is_reverse_complement
                else index2
            )
        if self.runparameters.index_reads == 10:
            index1 += "AT"
            index2 = (
                self.get_reverse_complement_dna_seq("AC" + index2)
                if is_reverse_complement
                else index2 + "AC"
            )

        return index1, index2

    def get_raw_samplesheet(self) -> List[Dict]:
        raw_samplesheet = list(self.lims_api.samplesheet(self.flowcell))
        if not raw_samplesheet:
            sys.stderr.write(f"Samplesheet for {self.flowcell} not found in LIMS! ")
            sys.exit()
        return raw_samplesheet

    def construct_samplesheet(self, delimiter=COMMA, end="\n") -> str:
        """Build the complete sample sheet as one string

        The header line comes first, followed by one line per sample. The
        samples are fetched from LIMS, get their LIMS sample name, receive
        dummy indexes (unless this is a NIPT sample sheet), lose every sample
        without a dual index and have their indexes adapted.

        :param delimiter: separator between the columns of a line
        :param end: separator between the lines
        """

        lines = [delimiter.join(self.header)]

        samples = self.replace_project_with_lims_sample_name(
            self.get_raw_samplesheet()
        )
        if not self.is_nipt_samplesheet():
            samples = self.add_dummy_indexes(samples)
        samples = self.adapt_indexes(self.remove_unwanted_indexes(samples))

        for sample in samples:
            # fix the project content
            sample["project"] = self.get_project_name(sample["project"])

            columns = [str(sample[lims_key]) for lims_key in self.LIMS_KEYS]
            lines.append(delimiter.join(columns))

        return end.join(lines)

1	"""
2	Create a samplesheet for NovaSeq flowcells
3	"""
4	import csv	1✔
5	import sys	1✔
6	from distutils.version import StrictVersion, LooseVersion	1✔
7	from typing import Any, Dict, List, Union	1✔
8
9	from cglims.api import ClinicalLims	1✔
10	from demux.constants.constants import COMMA, DASH, SPACE	1✔
11	from demux.constants.samplesheet import (	1✔
12	NIPT_INDEX_LENGTH,
13	PARAMETER_TO_VERSION,
14	REV_COMP_CONTROL_SOFTWARE_VERSION,
15	REV_COMP_REAGENT_KIT_VERSION,
16	)
17	from demux.exc import NoValidReagentKitFound	1✔
18
19	from .runparameters import NovaseqRunParameters	1✔
20	from .samplesheet import Samplesheet	1✔
21
22
23	class CreateNovaseqSamplesheet:	1✔
24	"""Create a raw sample sheet for NovaSeq flowcells"""
25
26	LIMS_KEYS = [	1✔
27	"fcid",
28	"lane",
29	"sample_id",
30	"sample_ref",
31	"index",
32	"index2",
33	"sample_name",
34	"control",
35	"recipe",
36	"operator",
37	"project",
38	]
39
40	def __init__(	1✔
41	self,
42	dummy_indexes_file: str,
43	flowcell: str,
44	lims_config: dict,
45	pad: bool,
46	runs_dir: str,
47	):
48	self.dummy_indexes_file = dummy_indexes_file	×
49	self.flowcell = flowcell	×
50	self.lims_api = ClinicalLims(**lims_config)	×
51	self.pad = pad	×
52	self.runparameters = NovaseqRunParameters(self.flowcell, runs_dir)	×
53
54	@property	1✔
55	def control_software_version(self) -> StrictVersion:	1✔
56	"""Returns control software version in StrictVersion format"""
57	return StrictVersion(self.runparameters.control_software_version)	×
58
59	@property	1✔
60	def reagent_kit_version(self) -> LooseVersion:	1✔
61	"""Derives the reagent kit version from the run parameters"""
62
63	reagent_kit_version = self.runparameters.reagent_kit_version	×
64	if reagent_kit_version not in PARAMETER_TO_VERSION.keys():	×
65	raise NoValidReagentKitFound(	×
66	f"Expected reagent kit version {', '.join(PARAMETER_TO_VERSION.keys())}. Found {reagent_kit_version} instead. Exiting",
67	)
68
69	return LooseVersion(PARAMETER_TO_VERSION[reagent_kit_version])	×
70
71	@property	1✔
72	def header(self) -> list:	1✔
73	"""Create the sample sheet header"""
74	return list(Samplesheet.header_map.values())	×
75
76	def get_dummy_sample_information(	1✔
77	self, dummy_index: str, lane: int, name: str
78	) -> Dict[Union[str, Any], Union[Union[str, int], Any]]:
79	"""Constructs and returns a dummy sample in novaseq samplesheet format"""
80
81	return {	×
82	"control": "N",
83	"description": "",
84	"fcid": self.flowcell,
85	"index": dummy_index,
86	"index2": "",
87	"lane": lane,
88	"operator": "script",
89	"project": "indexcheck",
90	"recipe": "R1",
91	"sample_id": name.replace(" ", "-").replace("(", "-").replace(")", "-"),
92	"sample_name": "indexcheck",
93	"sample_ref": "hg19",
94	}
95
96	@staticmethod	1✔
97	def get_project_name(project: str, delimiter=SPACE) -> str:	1✔
98	"""Only keeps the first part of the project name"""
99	return project.split(delimiter)[0]	×
100
101	@staticmethod	1✔
102	def get_reverse_complement_dna_seq(dna: str) -> str:	1✔
103	"""Generates the reverse complement of a DNA sequence"""
104	complement = {"A": "T", "C": "G", "G": "C", "T": "A"}	×
105	return "".join(complement[base] for base in reversed(dna))	×
106
107	@staticmethod	1✔
108	def is_dual_index(index: str, delimiter=DASH) -> bool:	1✔
109	"""Determines if an index in the raw samplesheet is dual index or not"""
110	return delimiter in index	×
111
112	@staticmethod	1✔
113	def is_dummy_sample_in_samplesheet(dummy_index: str, sample_indexes: list) -> bool:	1✔
114	"""Determines if a dummy sample is already present in the samplesheet"""
115	return any(	×
116	sample_index.startswith(dummy_index) for sample_index in sample_indexes
117	)
118
119	@staticmethod	1✔
120	def get_sample_indexes_in_lane(lane: str, raw_samplesheet: List[Dict]) -> list:	1✔
121	"""Returns all sample indexes in a given lane"""
122	return [sample["index"] for sample in raw_samplesheet if sample["lane"] == lane]	×
123
124	def replace_project_with_lims_sample_name(	1✔
125	self, raw_samplesheet: List[Dict]
126	) -> List[Dict]:
127	"""Replaces the project in the SampleName column with the LIMS Sample Name"""
128	for sample in raw_samplesheet:	×
129	sample["sample_name"] = self.lims_api.sample(sample["sample_id"]).name	×
130	return raw_samplesheet	×
131
132	def is_reverse_complement(self) -> bool:	1✔
133	"""
134	If the run used NovaSeq control software version REV_COMP_CONTROL_SOFTWARE_VERSION or later and reagent
135	kit version REV_COMP_REAGENT_KIT_VERSION or later, the second index should be the reverse complement
136	"""
137	return (	×
138	self.control_software_version >= REV_COMP_CONTROL_SOFTWARE_VERSION
139	and self.reagent_kit_version >= REV_COMP_REAGENT_KIT_VERSION
140	)
141
142	def is_nipt_samplesheet(self) -> bool:	1✔
143	"""Determines if a sample sheet if for NIPT demultiplexing, based on the index length in the run paramaters"""
144	return self.runparameters.index_reads == NIPT_INDEX_LENGTH	×
145
146	def add_dummy_indexes(self, raw_samplesheet: List[Dict]) -> List[Dict]:	1✔
147	"""Add all dummy indexes to raw sample sheet. Dummy indexes are used to check for index
148	contamination"""
149	with open(f"{self.dummy_indexes_file}") as csv_file:	×
150	dummy_samples_csv = csv.reader(csv_file, delimiter=COMMA)	×
151	dummy_samples = [row for row in dummy_samples_csv]	×
152	new_dummy_samples = []	×
153	lanes = {sample["lane"] for sample in raw_samplesheet}	×
154
155	for lane in lanes:	×
156	sample_indexes = self.get_sample_indexes_in_lane(lane, raw_samplesheet)	×
157	for sample_name, dummy_index in dummy_samples:	×
158	if not self.is_dummy_sample_in_samplesheet(	×
159	dummy_index, sample_indexes
160	):
161	new_dummy_sample = self.get_dummy_sample_information(	×
162	dummy_index,
163	lane,
164	sample_name,
165	)
166	new_dummy_samples.append(new_dummy_sample)	×
167
168	raw_samplesheet.extend(new_dummy_samples)	×
169
170	return raw_samplesheet	×
171
172	def remove_unwanted_indexes(self, raw_samplesheet: List[Dict]) -> List[Dict]:	1✔
173	"""Filter out indexes of unwanted length and single indexes"""
174
175	raw_samplesheet = [	×
176	line for line in raw_samplesheet if self.is_dual_index(line["index"])
177	]
178	return raw_samplesheet	×
179
180	def adapt_indexes(self, raw_samplesheet: List[Dict]) -> List[Dict]:	1✔
181	"""Adapts the indexes: pads all indexes so that all indexes have a length equal to the number of index reads,
182	and takes the reverse complement of index 2 in case of the new novaseq software control version
183	(1.7) in combination with the new reagent kit (version 1.5)"""
184
185	is_reverse_complement = self.is_reverse_complement()	×
186
187	for line in raw_samplesheet:	×
188	index1, index2 = line["index"].split("-")	×
189	if self.pad and len(index1) == 8:	×
190	line["index"], line["index2"] = self.pad_and_rc_indexes(	×
191	index1, index2, is_reverse_complement
192	)
193	elif len(index2) == 10:	×
194	line["index"] = index1	×
195	line["index2"] = (	×
196	self.get_reverse_complement_dna_seq(index2)
197	if is_reverse_complement
198	else index2
199	)
200	else:
201	line["index"], line["index2"] = index1, index2	×
202
203	return raw_samplesheet	×
204
205	def pad_and_rc_indexes(	1✔
206	self, index1: str, index2: str, is_reverse_complement: bool
207	) -> tuple:
208	"""Pads and reverse complements indexes"""
209
210	if self.runparameters.index_reads == 8:	×
211	index2 = (	×
212	self.get_reverse_complement_dna_seq(index2)
213	if is_reverse_complement
214	else index2
215	)
216	if self.runparameters.index_reads == 10:	×
217	index1 += "AT"	×
218	index2 = (	×
219	self.get_reverse_complement_dna_seq("AC" + index2)
220	if is_reverse_complement
221	else index2 + "AC"
222	)
223
224	return index1, index2	×
225
226	def get_raw_samplesheet(self) -> List[Dict]:	1✔
227	raw_samplesheet = list(self.lims_api.samplesheet(self.flowcell))	×
228	if not raw_samplesheet:	×
229	sys.stderr.write(f"Samplesheet for {self.flowcell} not found in LIMS! ")	×
230	sys.exit()	×
231	return raw_samplesheet	×
232
233	def construct_samplesheet(self, delimiter=COMMA, end="\n") -> str:	1✔
234	"""Construct the sample sheet"""
235
236	demux_samplesheet = [delimiter.join(self.header)]	×
237	raw_samplesheet = self.get_raw_samplesheet()	×
238	raw_samplesheet = self.replace_project_with_lims_sample_name(raw_samplesheet)	×
239	if not self.is_nipt_samplesheet():	×
240	raw_samplesheet = self.add_dummy_indexes(raw_samplesheet)	×
241	raw_samplesheet = self.remove_unwanted_indexes(raw_samplesheet)	×
242	raw_samplesheet = self.adapt_indexes(raw_samplesheet)	×
243	for line in raw_samplesheet:	×
244	# fix the project content
245	project = self.get_project_name(line["project"])	×
246	line["project"] = project	×
247
248	demux_samplesheet.append(	×
249	delimiter.join([str(line[lims_key]) for lims_key in self.LIMS_KEYS])
250	)
251
252	return end.join(demux_samplesheet)	×

Clinical-Genomics / demultiplexing / 4627485829

Source File Press 'n' to go to next uncovered line, 'b' for previous

Source File
Press 'n' to go to next uncovered line, 'b' for previous