14247436103

Committed 03 Apr 2025 03:55PM UTC coverage: 87.595% (-1.0%) from 88.623%

Build # 14247436103

Build Type

push

github

Committed by

veghp

Commit Message

Update docs generation

Run Details

2182 of 2491 relevant lines covered (87.6%)

0.88 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

92.68

/dnacauldron/SequenceRepository.py

from .biotools import (
    load_records_from_files,
    set_record_topology,
    sequence_to_biopython_record,
)
from fuzzywuzzy import process


class NotInRepositoryError(Exception):
    """Error raised when requested parts are absent from a repository.

    The exception message lists every missing part name, each one followed
    by the look-alike names (if any) returned by the repository's
    ``suggest_part_names`` method.

    Parameters
    ----------

    parts
      Names of the parts which could not be found.

    repository
      The repository that was searched. Its ``name`` appears in the message
      and its ``suggest_part_names`` method provides the suggestions.
    """

    def __init__(self, parts, repository):
        # The attributes must be set before building the message, because
        # create_part_suggestion_string reads ``self.repository``.
        self.parts = parts
        self.repository = repository

        described_parts = ", ".join(
            map(self.create_part_suggestion_string, parts)
        )
        super().__init__(
            "Parts not found in %s: %s" % (repository.name, described_parts)
        )

    def create_part_suggestion_string(self, part_name):
        """Return the part name, followed by suggested names if there are any."""
        candidates = self.repository.suggest_part_names(part_name)
        if len(candidates) > 0:
            alternatives = " or ".join(candidates)
            return "%s (did you mean %s ?)" % (part_name, alternatives)
        return part_name


class RepositoryDuplicateError(Exception):
    """Error raised when part IDs are already present in a repository.

    Parameters
    ----------

    parts
      List of the duplicated part IDs (strings). It is stored unchanged in
      ``self.parts``.

    repository
      The repository in which the duplication was detected. Its ``name``, if
      not empty, is mentioned in the message.
    """

    def __init__(self, parts, repository):
        self.parts = parts
        self.repository = repository
        parts_list = ", ".join(parts)
        # Keep the message readable when many IDs are duplicated at once.
        if len(parts_list) > 150:
            parts_list = parts_list[:150] + "..."
        plural = "s" if len(parts) > 1 else ""
        # The location already carries its own " in " prefix, so the message
        # must not add a second one (this used to give "duplicated in  in x",
        # or a dangling "duplicated in " for unnamed repositories).
        location = (" in " + repository.name) if repository.name else ""
        message = "Part ID%s %s duplicated%s" % (plural, parts_list, location)
        super().__init__(message)


class SequenceRepository:
    """Sequence repositories store and provide sequence records.

    The records are organized into collections, for instance "parts" to host
    parts, "constructs" for records created during assembly plan simulation,
    or any other collection name like "emma_connectors" to store EMMA
    connectors.

    The suggested initialization of a sequence repository is:

    >>> repository = SequenceRepository()
    >>> repository.import_records(files=['part.fa', 'records.zip'])

    Parameters
    ----------

    collections
      A dict {'collection_name': {'record_id': record, ...}, ...} giving for
      each collection a dict of Biopython records.

    name
      The name of the repository as it may appear in error messages and other
      reports.
    """

    def __init__(self, collections=None, name="repo"):
        self.collections = collections or {}
        self.name = name

    def add_record(self, record, collection="parts"):
        """Add one record to a collection, using its record.id as key.

        The collection is created if it doesn't exist.

        The record can also be a pair (id, "ATGTGCC...").

        Raises a RepositoryDuplicateError if a record with the same ID is
        already present in any collection of the repository.
        """
        if isinstance(record, (tuple, list)):
            _id, _sequence = record
            record = sequence_to_biopython_record(_sequence, id=_id)
        # IDs are unique across the whole repository, not just per collection.
        if self.contains_record(record.id):
            raise RepositoryDuplicateError([record.id], repository=self)
        self.collections.setdefault(collection, {})[record.id] = record

    def add_records(self, records, collection="parts"):
        """Add several records to a collection.

        Parameters
        ----------

        records
          Any iterable (list, tuple, generator...) of records or of
          (id, sequence) pairs, as accepted by ``add_record``.

        collection
          Name of the collection receiving the records. Nothing is created
          when ``records`` is empty.
        """
        # No emptiness check on purpose: looping over an empty iterable is a
        # no-op, and not calling len() lets generators be passed in.
        for record in records:
            self.add_record(record, collection=collection)

    def contains_record(self, record_id):
        """Return whether the repo has a record corresponding to the given id"""
        collections = self.collections.values()
        return any(record_id in collection for collection in collections)

    def get_record(self, record_id):
        """Return the record with the given ID.

        The collections are searched in turn. A NotInRepositoryError is
        raised when none of them contains the ID.
        """
        for collection in self.collections.values():
            if record_id in collection:
                return collection[record_id]
        raise NotInRepositoryError([record_id], self)

    def get_records(self, record_ids):
        """Get a list of records from a list of record IDs.

        All missing IDs are collected first so that a single
        NotInRepositoryError can report them together.
        """
        records = []
        not_in_repository = []
        for name in record_ids:
            if self.contains_record(name):
                records.append(self.get_record(name))
            else:
                not_in_repository.append(name)
        if not_in_repository:
            raise NotInRepositoryError(not_in_repository, repository=self)
        return records

    def import_records(
        self,
        files=None,
        folder=None,
        collection="parts",
        use_file_names_as_ids=True,
        topology="default_to_linear",
    ):
        """Import records into the repository, from files and zips and folders.

        Parameters
        ----------

        files
          A list of file paths, either Genbank, Fasta, Snapgene (.dna), or zips
          containing any of these formats.

        folder
          Path to a folder which can be provided instead of ``files``. When
          both are given, ``folder`` takes precedence.

        collection
          Name of the collection under which to import the new records.

        use_file_names_as_ids
          If True, the file name will be used as ID for any record obtained
          from a single-record file (fasta files with many records will still
          use the internal ID).

        topology
          Can be "circular", "linear", "default_to_circular" (will default
          to circular if ``annotations['topology']`` is not already set) or
          "default_to_linear".

        Raises a ValueError if neither ``files`` nor ``folder`` is provided.
        """
        if folder is not None:
            records = load_records_from_files(
                folder=folder, use_file_names_as_ids=use_file_names_as_ids
            )
        elif files is not None:
            records = load_records_from_files(
                files=files,
                use_file_names_as_ids=use_file_names_as_ids,
            )
        else:
            raise ValueError("Provide either ``files`` or ``folder``")
        for r in records:
            set_record_topology(r, topology)

        self.add_records(records, collection=collection)

    def get_part_names_by_collection(self, format="dict"):
        """Return a dictionary or a string representing the repo's content.

        Parameters
        ----------

        format
          If "dict", the result is {'collection_name': [record_id, ...], ...}
          with IDs in insertion order. Any other value (e.g. "string") gives
          a multi-line string where each collection name is followed by its
          sorted record IDs, one per line, prefixed with "- ".
        """
        result = {
            collection_name: list(parts)
            for collection_name, parts in self.collections.items()
        }
        if format == "dict":
            return result
        return "\n".join(
            "\n".join([name] + ["- " + part for part in sorted(parts)])
            for name, parts in result.items()
        )

    def get_all_part_names(self):
        """Return the sorted list of all record IDs, across all collections."""
        parts = [
            part for collection in self.collections.values() for part in collection
        ]
        return sorted(parts)

    def suggest_part_names(self, query, cutoff=90, limit=3):
        """Suggest part names in the repo close to the given query.

        Parameters
        ----------

        query
          The name for which look-alikes are searched.

        cutoff
          Minimal score (as computed by fuzzywuzzy's ``process.extract``) for
          a name to be suggested.

        limit
          Maximal number of suggestions returned, best matches first.
        """
        # limit=None makes fuzzywuzzy score and return every name. Its default
        # (limit=5) would silently cap the suggestions at 5 even when the
        # caller asks for more.
        search = process.extract(query, self.get_all_part_names(), limit=None)
        return [
            name
            for (name, score) in sorted(search, key=lambda e: -e[1])
            if score >= cutoff
        ][:limit]

1	from .biotools import (	1✔
2	load_records_from_files,
3	set_record_topology,
4	sequence_to_biopython_record,
5	)
6	from fuzzywuzzy import process	1✔
7
8
9	class NotInRepositoryError(Exception):	1✔
10	def __init__(self, parts, repository):	1✔
11	self.parts = parts	1✔
12	self.repository = repository	1✔
13
14	# CREATE THE MESSAGE AND INITIALIZE THE EXCEPTION:
15
16	suggestions = [	1✔
17	self.create_part_suggestion_string(part_name) for part_name in parts
18	]
19	suggestions = ", ".join(suggestions)	1✔
20	message = "Parts not found in %s: %s" % (repository.name, suggestions)	1✔
21	super().__init__(message)	1✔
22
23	def create_part_suggestion_string(self, part_name):	1✔
24	suggestions = self.repository.suggest_part_names(part_name)	1✔
25	if len(suggestions) == 0:	1✔
26	return part_name	1✔
27	return "%s (did you mean %s ?)" % (part_name, " or ".join(suggestions))	×
28
29
30	class RepositoryDuplicateError(Exception):	1✔
31	def __init__(self, parts, repository):	1✔
32	self.parts = parts	1✔
33	self.repository = repository	1✔
34	parts_list = ", ".join(parts)	1✔
35	if len(parts_list) > 150:	1✔
36	parts_list = parts_list[:150] + "..."	×
37	parts = "Part ID%s %s" % ("s" if len(parts) > 1 else "", parts_list)	1✔
38	repo_name = (" in " + repository.name) if repository.name else ""	1✔
39	message = parts + " duplicated in " + repo_name	1✔
40	super().__init__(message)	1✔
41
42
43	class SequenceRepository:	1✔
44	"""Sequence repositories store and provide sequence records.
45
46	The records are organized into collections, for instance "parts" to host
47	parts, "constructs" for records created during assembly plan simulation,
48	or any other collection name like "emma_connectors" to store EMMA
49	connectors.
50
51	The suggested initialization of a sequence repository is:
52
53	>>> repository = SequenceRepository()
54	>>> repository.import_records(files=['part.fa', 'records.zip', etc.])
55
56
57
58	Parameters
59	----------
60
61	collections
62	A dict {'collection_name': {'record_id': record, ...}, ...} giving for
63	each collection a dict of Biopython records.
64
65	name
66	The name of the repository as it may appear in error messages and other
67	reports.
68	"""
69
70	def __init__(self, collections=None, name="repo"):	1✔
71	self.collections = collections or {}	1✔
72	self.name = name	1✔
73
74	def add_record(self, record, collection="parts"):	1✔
75	"""Add one record to a collection, using its record.id as key.
76
77	The collection is created if it doesn't exist.
78
79	The record can also be a pair (id, "ATGTGCC...").
80	"""
81	if isinstance(record, (tuple, list)):	1✔
82	_id, _sequence = record	1✔
83	record = sequence_to_biopython_record(_sequence, id=_id)	1✔
84	if self.contains_record(record.id):	1✔
85	raise RepositoryDuplicateError([record.id], repository=self)	1✔
86	if collection not in self.collections:	1✔
87	self.collections[collection] = {}	1✔
88	self.collections[collection][record.id] = record	1✔
89
90	def add_records(self, records, collection="parts"):	1✔
91	"""Add"""
92
93	if len(records) == 0:	1✔
94	return	×
95	for record in records:	1✔
96	self.add_record(record, collection=collection)	1✔
97
98	def contains_record(self, record_id):	1✔
99	"""Return whether the repo has a record corresponding to the given id"""
100	collections = self.collections.values()	1✔
101	return any(record_id in collection for collection in collections)	1✔
102
103	def get_record(self, record_id):	1✔
104	"""Return the record from the repository from its ID."""
105	for collection in self.collections.values():	1✔
106	if record_id in collection:	1✔
107	return collection[record_id]	1✔
108	raise NotInRepositoryError([record_id], self)	×
109
110	def get_records(self, record_ids):	1✔
111	"""Get a list of records from a list of record IDs."""
112	records = []	1✔
113	not_in_repository = []	1✔
114	for name in record_ids:	1✔
115	if self.contains_record(name):	1✔
116	records.append(self.get_record(name))	1✔
117	else:
118	not_in_repository.append(name)	1✔
119	if len(not_in_repository):	1✔
120	raise NotInRepositoryError(not_in_repository, repository=self)	1✔
121	return records	1✔
122
123	def import_records(	1✔
124	self,
125	files=None,
126	folder=None,
127	collection="parts",
128	use_file_names_as_ids=True,
129	topology="default_to_linear",
130	):
131	"""Import records into the repository, from files and zips and folders.
132
133	Parameters
134	----------
135
136	files
137	A list of file paths, either Genbank, Fasta, Snapgene (.dna), or zips
138	containing any of these formats.
139
140	folder
141	Path to a folder which can be provided instead of ``files``.
142
143	collection
144	Name of the collection under which to import the new records.
145
146	use_file_names_as_ids
147	If True, the file name will be used as ID for any record obtained
148	from a single-record file (fasta files with many records will still
149	use the internal ID).
150
151	topology
152	Can be "circular", "linear", "default_to_circular" (will default
153	to circular if ``annotations['topology']`` is not already set) or
154	"default_to_linear".
155	"""
156	if folder is not None:	1✔
157	records = load_records_from_files(	1✔
158	folder=folder, use_file_names_as_ids=use_file_names_as_ids
159	)
160	elif files is not None:	1✔
161	records = load_records_from_files(	1✔
162	files=files,
163	use_file_names_as_ids=use_file_names_as_ids,
164	)
165	else:
166	raise ValueError("Provide either ``files`` or ``folder``")	×
167	for r in records:	1✔
168	set_record_topology(r, topology)	1✔
169
170	self.add_records(records, collection=collection)	1✔
171
172	def get_part_names_by_collection(self, format="dict"):	1✔
173	"""Return a dictionnary or a string representing the repo's content.
174
175	Format: "dict" or "string"
176	"""
177	result = {	1✔
178	collection_name: list(parts.keys())
179	for collection_name, parts in self.collections.items()
180	}
181	if format == "dict":	1✔
182	return result	×
183	else:
184	return "\n".join(	1✔
185	"\n".join([name] + ["- " + part for part in sorted(parts)])
186	for name, parts in result.items()
187	)
188
189	def get_all_part_names(self):	1✔
190	"""Return the list of all part names"""
191	parts = [	1✔
192	part for collection in self.collections.values() for part in collection
193	]
194	return sorted(parts)	1✔
195
196	def suggest_part_names(self, query, cutoff=90, limit=3):	1✔
197	"""Suggest part names in the repo close to the given query."""
198	search = process.extract(query, self.get_all_part_names())	1✔
199	return [	1✔
200	name
201	for (name, score) in sorted(search, key=lambda e: -e[1])
202	if score >= cutoff
203	][:limit]

Edinburgh-Genome-Foundry / DnaCauldron / 14247436103

Source File Press 'n' to go to next uncovered line, 'b' for previous

Source File
Press 'n' to go to next uncovered line, 'b' for previous