• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

Edinburgh-Genome-Foundry / SnapGeneReader / 14177235573

31 Mar 2025 05:07PM UTC coverage: 96.97% (+0.03%) from 96.939%
14177235573

push

github

veghp
Update build

192 of 198 relevant lines covered (96.97%)

0.97 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

96.92
/snapgene_reader/snapgene_reader.py
1
"""
2
snapgene reader main file
3
"""
4

5
import struct
1✔
6

7
# import json
8
import xmltodict
1✔
9

10
from Bio.Seq import Seq
1✔
11
from Bio.SeqRecord import SeqRecord
1✔
12

13
try:
1✔
14
    # Biopython <1.78
15
    from Bio.Alphabet import DNAAlphabet
1✔
16

17
    has_dna_alphabet = True
×
18
except ImportError:
1✔
19
    # Biopython >=1.78
20
    has_dna_alphabet = False
1✔
21
from Bio.SeqFeature import SeqFeature, FeatureLocation
1✔
22
import html2text
1✔
23

24
# Module-level html2text converter shared by parse(). The attributes set below
# are html2text options (emphasis/link handling, wrapping width, line breaks);
# see the html2text documentation for their exact semantics.
HTML_PARSER = html2text.HTML2Text()
1✔
25
HTML_PARSER.ignore_emphasis = True
1✔
26
HTML_PARSER.ignore_links = True
1✔
27
HTML_PARSER.body_width = 0
1✔
28
HTML_PARSER.single_line_break = True
1✔
29

30

31
def parse(val):
    """Convert an HTML string into single-line plain text.

    String input is passed through the module-level ``HTML_PARSER``; the
    result is stripped, its newlines are replaced by spaces and its double
    quotes by single quotes. Any non-string value is returned unchanged.
    """
    if not isinstance(val, str):
        return val
    text = HTML_PARSER.handle(val).strip()
    return text.replace("\n", " ").replace('"', "'")
×
37

38

39
def parse_dict(obj):
    """Apply ``parse`` in place to every string value of a nested dict.

    String values are replaced by their parsed form and nested dicts are
    processed recursively; values of any other type are left untouched.
    ``obj`` itself is returned (unchanged when it is not a dict).
    """
    if not isinstance(obj, dict):
        return obj
    # Only existing keys are reassigned, so the dict size never changes
    # while it is being iterated.
    for name, entry in obj.items():
        if isinstance(entry, str):
            obj[name] = parse(entry)
        elif isinstance(entry, dict):
            parse_dict(entry)
    return obj
1✔
48

49

50
def _parse_snapgene_qualifiers(qualifiers):
    """Turn the ``Q`` entries of a feature into a ``{name: value}`` dict."""
    format_dict = {"@text": parse, "@int": int}
    parsed = {}
    for qualifier in qualifiers:
        value_node = qualifier.get("V")
        if value_node is None:
            # Qualifier without any value: nothing to record.
            continue
        name = qualifier["@name"]
        if isinstance(value_node, list):
            if len(value_node[0]) == 1:
                # Several single-attribute values -> list of converted values.
                values = []
                for entry in value_node:
                    fmt, value = entry.popitem()
                    values.append(format_dict.get(fmt, parse)(value))
                parsed[name] = values
            else:
                # Two-attribute values -> mapping keyed by the second attribute.
                mapping = {}
                for entry in value_node:
                    (fmt1, value1), (_, value2) = entry.items()
                    mapping[value2] = format_dict.get(fmt1, parse)(value1)
                parsed[name] = mapping
        else:
            fmt, value = value_node.popitem()
            parsed[name] = format_dict.get(fmt, parse)(value)
    return parsed


def _parse_snapgene_feature(feature):
    """Convert one xmltodict ``Feature`` node into the feature dict."""
    strand_dict = {"0": ".", "1": "+", "2": "-", "3": "="}

    segments = feature["Segment"]
    if not isinstance(segments, list):
        segments = [segments]
    segments_ranges = [
        sorted(int(e) for e in segment["@range"].split("-"))
        for segment in segments
    ]

    qualifiers = feature.get("Q", [])
    if not isinstance(qualifiers, list):
        qualifiers = [qualifiers]
    parsed_qualifiers = _parse_snapgene_qualifiers(qualifiers)

    if "label" not in parsed_qualifiers:
        parsed_qualifiers["label"] = feature["@name"]
    if "note" not in parsed_qualifiers:
        parsed_qualifiers["note"] = []
    if not isinstance(parsed_qualifiers["note"], list):
        parsed_qualifiers["note"] = [parsed_qualifiers["note"]]
    color = segments[0]["@color"]
    parsed_qualifiers["note"].append("color: " + color)

    return {
        # ``start`` is shifted by one relative to the file's ranges; indexing
        # with [0]/[-1] also accepts a single-position range such as "5".
        "start": min(bounds[0] - 1 for bounds in segments_ranges),
        "end": max(bounds[-1] for bounds in segments_ranges),
        "strand": strand_dict[feature.get("@directionality", "0")],
        "type": feature["@type"],
        "name": feature["@name"],
        "color": color,
        "textColor": "black",
        "segments": segments,
        "row": 0,
        "isOrf": False,
        "qualifiers": parsed_qualifiers,
    }


def _read_snapgene_blocks(fileobject):
    """Read the header and every block of an open SnapGene binary stream."""

    def unpack(size, mode):
        """Read ``size`` bytes and decode them as one big-endian value."""
        return struct.unpack(">" + mode, fileobject.read(size))[0]

    if fileobject.read(1) != b"\t":
        raise ValueError("Wrong format for a SnapGene file!")

    # READ THE DOCUMENT PROPERTIES
    length = unpack(4, "I")
    title = fileobject.read(8).decode("ascii")
    if length != 14 or title != "SnapGene":
        raise ValueError("Wrong format for a SnapGene file !")

    # The three header fields are stored consecutively, so read order matters.
    is_dna = unpack(2, "H")
    export_version = unpack(2, "H")
    import_version = unpack(2, "H")
    data = {
        "isDNA": is_dna,
        "exportVersion": export_version,
        "importVersion": import_version,
        "features": [],
    }

    # Block type table (first byte of each block):
    #  0: dna sequence            1: compressed DNA
    #  5: primers                 6: notes
    #  7: history tree            8: additional sequence properties segment
    #  9: file Description       10: features
    # 11: history node           16/17: alignable sequence
    # 18: sequence trace         19: Uracil Positions
    # 20: custom DNA colors      2, 3, 13: unknown
    while True:
        next_byte = fileobject.read(1)
        if next_byte == b"":
            # END OF FILE
            break

        block_size = unpack(4, "I")
        block_type = ord(next_byte)

        if block_type == 0:
            # READ THE SEQUENCE AND ITS PROPERTIES
            props = unpack(1, "b")
            data["dna"] = {
                "topology": "circular" if props & 0x01 else "linear",
                "strandedness": "double" if props & 0x02 > 0 else "single",
                "damMethylated": props & 0x04 > 0,
                "dcmMethylated": props & 0x08 > 0,
                "ecoKIMethylated": props & 0x10 > 0,
                "length": block_size - 1,
            }
            data["seq"] = fileobject.read(block_size - 1).decode("ascii")

        elif block_type == 6:
            # READ THE NOTES
            block_content = fileobject.read(block_size).decode("utf-8")
            note_data = parse_dict(xmltodict.parse(block_content))
            data["notes"] = note_data["Notes"]

        elif block_type == 10:
            # READ THE FEATURES
            features_data = xmltodict.parse(fileobject.read(block_size))
            # A features block may contain no ``Feature`` child at all.
            features_root = features_data.get("Features") or {}
            features = features_root.get("Feature", [])
            if not isinstance(features, list):
                features = [features]
            for feature in features:
                data["features"].append(_parse_snapgene_feature(feature))

        else:
            # WE IGNORE THE WHOLE BLOCK
            fileobject.read(block_size)

    return data


def snapgene_file_to_dict(filepath=None, fileobject=None):
    """Return a dictionary containing the data from a ``*.dna`` file.

    Parameters
    ----------
    filepath
        Path to a .dna file created with SnapGene. When given, the file is
        opened in binary mode and used instead of ``fileobject``.

    fileobject
        A binary file-like object pointing to the data of a .dna file created
        with SnapGene. It is closed once it has been read successfully.

    Returns
    -------
    dict
        Always contains ``isDNA``, ``exportVersion``, ``importVersion`` and
        ``features``; ``dna``/``seq`` and ``notes`` are present only when the
        corresponding blocks exist in the file.

    Raises
    ------
    ValueError
        If neither argument is provided or the header is not a SnapGene one.
    """
    opened_here = filepath is not None
    if opened_here:
        fileobject = open(filepath, "rb")
    if fileobject is None:
        raise ValueError("Either filepath or fileobject must be provided.")

    succeeded = False
    try:
        data = _read_snapgene_blocks(fileobject)
        succeeded = True
    finally:
        # On success the stream is always closed (historical behaviour); on
        # failure only a file opened here is closed, so it is never leaked
        # while a caller-owned stream stays usable.
        if succeeded or opened_here:
            fileobject.close()
    return data
1✔
206

207

208
def snapgene_file_to_seqrecord(filepath=None, fileobject=None):
    """Return a BioPython SeqRecord from the data of a ``*.dna`` file.

    Parameters
    ----------
    filepath
        Path to a .dna file created with SnapGene.

    fileobject
        A file-like object pointing to the data of a .dna file created with
        SnapGene.

    Returns
    -------
    SeqRecord
        Record holding the sequence, one ``SeqFeature`` per parsed feature,
        and annotations made of the topology, the file notes (if any) and
        ``molecule_type="DNA"``.
    """
    data = snapgene_file_to_dict(filepath=filepath, fileobject=fileobject)
    strand_dict = {"+": 1, "-": -1, ".": 0, "=": 0}

    if has_dna_alphabet:
        seq = Seq(data["seq"], alphabet=DNAAlphabet())
    else:
        seq = Seq(data["seq"])

    # The notes block is optional and may be empty; the sequence topology
    # always takes precedence over a same-named note.
    annotations = {"topology": data["dna"]["topology"]}
    for key, value in (data.get("notes") or {}).items():
        annotations.setdefault(key, value)

    seqrecord = SeqRecord(
        seq=seq,
        features=[
            SeqFeature(
                location=FeatureLocation(
                    start=feature["start"],
                    end=feature["end"],
                    strand=strand_dict[feature["strand"]],
                ),
                type=feature["type"],
                qualifiers=feature["qualifiers"],
            )
            for feature in data["features"]
        ],
        annotations=annotations,
    )

    seqrecord.annotations["molecule_type"] = "DNA"

    return seqrecord
1✔
248

249

250
def snapgene_file_to_gbk(read_file_object, write_file_object):
    """Convert a SnapGene ``*.dna`` file object into GenBank-formatted text.

    Parameters
    ----------
    read_file_object
        Binary file-like object holding the SnapGene data; it is read with
        ``snapgene_file_to_dict``.

    write_file_object
        Text file-like object; the GenBank record is emitted through its
        ``write`` method.
    """

    def analyse_gs(dic, *args, **kwargs):
        """Follow the keys ``args`` through nested dicts.

        Returns ``kwargs["default"]`` (``None`` if not given) when a key is
        missing, when an intermediate value is not a dict, or when the value
        found is ``None`` (e.g. an empty XML element).
        """
        default = kwargs.get("default")
        for arg in args:
            if isinstance(dic, dict) and arg in dic:
                dic = dic[arg]
            else:
                return default
        return default if dic is None else dic

    data = snapgene_file_to_dict(fileobject=read_file_object)
    wfo = write_file_object
    wfo.write(
        (
            "LOCUS       Exported              {0:>6} bp ds-DNA     {1:>8} SYN "
            "15-APR-2012\n"
        ).format(len(data["seq"]), data["dna"]["topology"])
    )
    definition = analyse_gs(data, "notes", "Description", default=".").replace(
        "\n", "\n            "
    )
    wfo.write("DEFINITION  {}\n".format(definition))
    wfo.write("ACCESSION   .\n")
    wfo.write("VERSION     .\n")
    wfo.write(
        "KEYWORDS    {}\n".format(
            analyse_gs(data, "notes", "CustomMapLabel", default=".")
        )
    )
    wfo.write("SOURCE      .\n")
    wfo.write("  ORGANISM  .\n")

    references = analyse_gs(data, "notes", "References")

    reference_count = 0
    if references:
        for key in references:
            reference_count += 1
            ref = references[key]
            wfo.write(
                "REFERENCE   {}  (bases 1 to {} )\n".format(
                    reference_count, analyse_gs(data, "dna", "length")
                )
            )
            for key2 in ref:
                gb_key = key2.replace("@", "").upper()
                wfo.write("  {}   {}\n".format(gb_key, ref[key2]))

    # generate special reference
    reference_count += 1
    wfo.write(
        "REFERENCE   {}  (bases 1 to {} )\n".format(
            reference_count, analyse_gs(data, "dna", "length")
        )
    )
    wfo.write("  AUTHORS   SnapGeneReader\n")
    wfo.write("  TITLE     Direct Submission\n")
    wfo.write(
        "  JOURNAL   Exported Monday, Sep 05, 2020 from SnapGene File"
        " Reader\n"
    )
    wfo.write(
        "            https://github.com/Edinburgh-Genome-Foundry/SnapGeneReader\n"
    )

    wfo.write(
        "COMMENT     {}\n".format(
            analyse_gs(data, "notes", "Comments", default=".")
            .replace("\n", "\n            ")
            .replace("\\", "")
        )
    )
    wfo.write("FEATURES             Location/Qualifiers\n")

    for feature in analyse_gs(data, "features", default=[]):
        strand = analyse_gs(feature, "strand", default="")

        segments = analyse_gs(feature, "segments", default=[])
        segments = [x for x in segments if x["@type"] == "standard"]
        if len(segments) > 1:
            # Segment ranges come straight from the file and are 1-based.
            line = "join({})".format(
                ",".join(
                    analyse_gs(segment, "@range").replace("-", "..")
                    for segment in segments
                )
            )
        else:
            # The parsed ``start`` is 0-based while GenBank locations are
            # 1-based inclusive, so shift it back by one.
            start = analyse_gs(feature, "start")
            line = "{}..{}".format(
                " " if start is None else start + 1,
                analyse_gs(feature, "end", default=" "),
            )

        feature_type = analyse_gs(feature, "type", default=" ").ljust(15)
        if strand == "-":
            wfo.write("     {} complement({})\n".format(feature_type, line))
        else:
            wfo.write("     {} {}\n".format(feature_type, line))

        # name
        wfo.write(
            '                     /note="{}"\n'.format(
                analyse_gs(feature, "name", default="feature")
            )
        )
        # qualifiers
        for q_key in analyse_gs(feature, "qualifiers", default={}):
            # do not write label, because it has been written at first.
            if q_key == "label":
                continue
            if q_key == "note":
                for note in analyse_gs(feature, "qualifiers", q_key, default=[]):
                    # do not write color, because it will be written later
                    if not str(note).startswith("color:"):
                        wfo.write('                     /note="{}"\n'.format(note))
            else:
                wfo.write(
                    '                     /{}="{}"\n'.format(
                        q_key, analyse_gs(feature, "qualifiers", q_key, default="")
                    )
                )
        if len(segments) > 1:
            wfo.write(
                (
                    '                     /note="This feature '
                    "has {} segments:"
                ).format(len(segments))
            )
            for seg_i, seg in enumerate(segments):
                segment_name = analyse_gs(seg, "@name", default="")
                if segment_name:
                    segment_name = " / {}".format(segment_name)
                wfo.write(
                    "\n                        {}:  {} / {}{}".format(
                        seg_i,
                        seg["@range"].replace("-", " .. "),
                        seg["@color"],
                        segment_name,
                    )
                )
            wfo.write('"\n')
        else:
            # write colors and direction
            wfo.write(
                21 * " "
                + '/note="color: {}'.format(
                    analyse_gs(feature, "color", default="#ffffff")
                )
            )
            if strand == "-":
                wfo.write('; direction: LEFT"\n')
            elif strand == "+":
                wfo.write('; direction: RIGHT"\n')
            else:
                wfo.write('"\n')

    # sequence
    wfo.write("ORIGIN\n")
    seq = analyse_gs(data, "seq", default="")
    # divide rows: 60 bases per row in groups of 10; each row is prefixed
    # with the 1-based position of its first base.
    for i in range(0, len(seq), 60):
        wfo.write(str(i + 1).rjust(9))
        for j in range(i, min(i + 60, len(seq)), 10):
            wfo.write(" {}".format(seq[j : j + 10]))
        wfo.write("\n")
    wfo.write("//\n")
1✔
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2025 Coveralls, Inc