• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

INGEOTEC / text_models / 4219223887

pending completion
4219223887

push

github

Mario Graff
test

1728 of 1844 relevant lines covered (93.71%)

0.94 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

93.28
/text_models/model_selection.py
1
# Copyright 2020 Mario Graff Guerrero
2

3
# Licensed under the Apache License, Version 2.0 (the "License");
4
# you may not use this file except in compliance with the License.
5
# You may obtain a copy of the License at
6

7
#     http://www.apache.org/licenses/LICENSE-2.0
8

9
# Unless required by applicable law or agreed to in writing, software
10
# distributed under the License is distributed on an "AS IS" BASIS,
11
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
# See the License for the specific language governing permissions and
13
# limitations under the License.
14
import logging
1✔
15
from EvoMSA.base import EvoMSA
1✔
16
from EvoMSA.utils import LabelEncoderWrapper
1✔
17
from queue import LifoQueue
1✔
18
from microtc.utils import save_model
1✔
19
from sklearn.metrics import f1_score
1✔
20
from sklearn.model_selection import KFold
1✔
21
from text_models.utils import macro_f1
1✔
22
import numpy as np
1✔
23
import os
1✔
24

25

26
class Node(object):
    """
    Base class to perform model selection on the first-stage models.

    A node represents one subset of the models dictionary; iterating a node
    yields its children, i.e., the nodes obtained by adding one more model.

    >>> from EvoMSA import base
    >>> from EvoMSA.utils import download
    >>> from text_models.model_selection import Node
    >>> from text_models.utils import macro_f1
    >>> from microtc.utils import tweet_iterator
    >>> from sklearn.model_selection import KFold
    >>> import numpy as np
    >>> import os
    >>> tweets = os.path.join(os.path.dirname(base.__file__), 'tests', 'tweets.json')
    >>> D = [[x['text'], x['klass']] for x in tweet_iterator(tweets)]
    >>> models = dict()
    >>> models[0] = [download("emo_Es.tm"), "sklearn.svm.LinearSVC"]
    >>> models[1] = ["EvoMSA.model.AggressivenessEs", "sklearn.svm.LinearSVC"]
    >>> models[2] = [download("b4msa_Es.tm"), "sklearn.svm.LinearSVC"]
    >>> X = [x for x, y in D]
    >>> y = [y for x, y in D]
    >>> kf = KFold(n_splits=3, random_state=1, shuffle=True)
    >>> node = Node([0], models=models, split_dataset=kf, aggregate=np.median, cache=os.path.join("cache", "fw"), metric=macro_f1)
    >>> perf = node.performance(X, y)
    >>> [x for x in node]
    [0-1, 0-2]
    >>> model = node.fit(X, y)


    :param model: Models used in the node - List of keys from :py:attr:`models`
    :type model: list
    :param models: Dictionary of pairs (see :py:attr:`EvoMSA.base.EvoMSA.models`)
    :type models: dict
    :param metric: Performance metric, e.g., accuracy
    :type metric: function
    :param split_dataset: Iterator to split dataset in training and validation
    :type split_dataset: instance
    :param aggregate: :math:`\\text{aggregate}: \\mathbb R^d \\rightarrow \\mathbb R`
    :type aggregate: function
    :param cache: Store the output of text models
    :type cache: str
    :param TR: EvoMSA's default model
    :type TR: bool
    :param stacked_method: Classifier or regressor used to ensemble the outputs of :attr:`EvoMSA.models`
    :type stacked_method: str or class

    """

    def __init__(self, model, models=None,
                 metric=None,
                 split_dataset=None,
                 aggregate=None,
                 cache=None,
                 TR=False,
                 stacked_method="sklearn.naive_bayes.GaussianNB",
                 **kwargs):
        assert metric is not None
        assert split_dataset is not None and hasattr(split_dataset, "split")
        assert aggregate is not None
        assert cache is not None
        self._models = models
        # Keep the caller's insertion order; the textual representation is
        # the sorted keys so that, e.g., [0, 2] and [2, 0] compare equal.
        self._model = [x for x in model]
        self._repr = "-".join(map(str, sorted(self._model)))
        self._metric = metric
        self._split_dataset = split_dataset
        self._aggregate = aggregate
        self._cache = cache
        self._TR = TR
        self._kwargs = kwargs
        self._kwargs.update(dict(stacked_method=stacked_method))

    def __repr__(self):
        return self._repr

    def __eq__(self, other):
        # Two nodes are equal when they hold the same set of model keys.
        return isinstance(other, Node) and str(self) == str(other)

    def __hash__(self):
        # Consistent with __eq__: nodes with the same keys hash alike.
        return hash(str(self))

    def __iter__(self):
        """Yield the children of this node, each extends it with one model"""

        variables = set(self._models.keys())
        model = self._model
        for x in variables - set(model):
            yield self.__class__(model + [x],
                                 models=self._models,
                                 metric=self._metric,
                                 split_dataset=self._split_dataset,
                                 aggregate=self._aggregate,
                                 cache=self._cache,
                                 TR=self._TR,
                                 **self._kwargs)

    @property
    def model(self):
        """Models as received by :py:class:`EvoMSA.base.EvoMSA`"""

        models = self._models
        return [models[x] for x in self._model]

    def fit(self, X, y):
        """Train an EvoMSA instance on the whole dataset (no cache)

        :param X: Training set - independent variables
        :type X: list
        :param y: Training set - dependent variable
        :type y: list or np.array
        :rtype: :py:class:`EvoMSA.base.EvoMSA`
        """

        return self._fit(X, y, None)

    def _fit(self, X, y, cache):
        """Create and train an EvoMSA instance

        :param X: Training set - independent variables
        :type X: list
        :param y: Training set - dependent variable
        :type y: list or np.array
        :param cache: Basename used to cache the text-model outputs
        :type cache: str or None
        :rtype: :py:class:`EvoMSA.base.EvoMSA`
        """

        return EvoMSA(TR=self._TR, models=self.model,
                      cache=cache,
                      **self._kwargs).fit(X, y)

    @property
    def perf(self):
        """Performance (only available after :py:func:`Node.performance`)"""
        return self._perf

    def performance(self, X, y):
        """Compute the performance on the dataset

        The value is computed once and memoized in ``self._perf``; the
        dataset is split with :py:attr:`split_dataset` and the per-fold
        metrics are combined with :py:attr:`aggregate`.

        :param X: Test set - independent variables
        :type X: list
        :param y: Test set - dependent variable
        :type y: list or np.array
        :rtype: float
        """

        try:
            return self._perf
        except AttributeError:
            perf = []
            cache = self._cache
            for index, (tr, vs) in enumerate(self._split_dataset.split(X)):
                evo = self._fit([X[x] for x in tr],
                                [y[x] for x in tr],
                                cache=cache + "-tr-" + str(index))
                hy = evo.predict([X[x] for x in vs],
                                 cache=cache + "-vs-" + str(index))
                perf.append(self._metric([y[x] for x in vs], hy))
            self._perf = self._aggregate(perf)
        return self._perf

    def __gt__(self, other):
        # Order nodes by their (already computed) performance.
        return self.perf > other.perf
184
class ForwardSelection(object):
    """Forward Selection on the models

    >>> from EvoMSA import base
    >>> from EvoMSA.utils import download
    >>> from text_models.model_selection import ForwardSelection
    >>> from microtc.utils import tweet_iterator
    >>> import os

    Read the dataset

    >>> tweets = os.path.join(os.path.dirname(base.__file__), 'tests', 'tweets.json')
    >>> D = [[x['text'], x['klass']] for x in tweet_iterator(tweets)]

    Models

    >>> models = dict()
    >>> models[0] = [download("emo_Es.tm"), "sklearn.svm.LinearSVC"]
    >>> models[1] = ["EvoMSA.model.AggressivenessEs", "sklearn.svm.LinearSVC"]
    >>> models[2] = [download("b4msa_Es.tm"), "sklearn.svm.LinearSVC"]
    >>> X = [x for x, y in D]
    >>> y = [y for x, y in D]
    >>> fwdSel = ForwardSelection(models)
    >>> best = fwdSel.run(X, y)

    :param models: Dictionary of pairs (see :py:attr:`EvoMSA.base.EvoMSA.models`)
    :type models: dict
    :param node: Node use to perform the search
    :type node: :py:class:`text_models.model_selection.Node`
    :param output: Filename to store intermediate models
    :type output: str
    :param verbose: Level to inform the user
    :type verbose: int
    :param metric: Performance metric
    :type metric: function
    :param split_dataset: Iterator to split dataset in training and validation
    :type split_dataset: instance
    :param aggregate: :math:`\\text{aggregate}: \\mathbb R^d \\rightarrow \\mathbb R`
    :type aggregate: function
    :param classifier: Whether the problem is classification (guides the label encoder)
    :type classifier: bool
    :param cache: Store the output of text models
    :type cache: str

    """

    def __init__(self, models, node=Node,
                 output=None, verbose=logging.INFO,
                 metric=macro_f1,
                 split_dataset=KFold(n_splits=3, random_state=1, shuffle=True),
                 aggregate=np.median,
                 classifier=True,
                 cache=os.path.join("cache", "fw"),
                 **kwargs):
        self._models = models
        # One starting node per individual model; the search grows from these.
        self._nodes = [node([k], models=models,
                            metric=metric,
                            split_dataset=split_dataset,
                            aggregate=aggregate,
                            cache=cache,
                            classifier=classifier,
                            **kwargs) for k in models.keys()]
        self._output = output
        self._logger = logging.getLogger("text_models.model_selection")
        self._logger.setLevel(verbose)
        self._le = LabelEncoderWrapper(classifier=classifier)

    def run(self, X, y):
        """Perform the search using X and y to guide it

        Greedy forward selection: start from the best single-model node and
        keep adding the model that most improves the performance, stopping
        when no extension improves on the current node.

        :param X: Dataset set - independent variables
        :type X: list
        :param y: Dataset set - dependent variable
        :type y: list or np.array
        :rtype: :py:class:`text_models.model_selection.Node`
        """

        self._logger.info("Starting the search")
        self._le = self._le.fit(y)
        y = self._le.transform(y)
        r = [(node.performance(X, y), node) for node in self._nodes]
        node = max(r, key=lambda x: x[0])[1]
        while True:
            self._logger.info("Model: %s perf: %0.4f" % (node, node.perf))
            nodes = list(node)
            # No children left: every model is already in the node.
            if len(nodes) == 0:
                if self._output:
                    save_model(node, self._output)
                return node
            r = [(xx.performance(X, y), xx) for xx in nodes]
            perf, comp = max(r, key=lambda x: x[0])
            # Stop when the best extension is strictly worse; ties continue.
            if perf < node.perf:
                break
            node = comp
        if self._output:
            save_model(node, self._output)
        return node
281
class BeamSelection(ForwardSelection):
    """
    Select the models using Beam Search.

    Uses a LIFO stack seeded with the best single-model node; each node is
    expanded into its children and the promising ones (performance not worse
    than the parent) are pushed back, best candidate on top.
    """

    def run(self, X, y, early_stopping=1000):
        """Perform the search using X and y to guide it

        :param X: Dataset - independent variables
        :type X: list
        :param y: Dataset - dependent variable
        :type y: list or np.array
        :param early_stopping: Number of rounds to perform early stopping
        :type early_stopping: int
        :rtype: :py:class:`text_models.model_selection.Node`
        """
        self._le = self._le.fit(y)
        y = self._le.transform(y)
        visited = [(node.performance(X, y), node) for
                   node in self._nodes]
        _ = max(visited, key=lambda x: x[0])[1]
        best = None
        nodes = LifoQueue()
        nodes.put(_)
        # index marks how many nodes had been visited at the last
        # improvement; len(visited) - index counts rounds without progress.
        index = len(visited)
        visited = set([x[1] for x in visited])
        while not nodes.empty() and (len(visited) - index) < early_stopping:
            node = nodes.get()
            if best is None or node > best:
                index = len(visited)
                best = node
                if self._output:
                    save_model(best, self._output)
            self._logger.info("Model: %s perf: %0.4f " % (best, best.perf) +
                              "visited: %s " % len(visited) +
                              "size: %s " % nodes.qsize() +
                              "Rounds: %s" % (len(visited) - index))
            # Expand the current node, skipping configurations already seen.
            nn = [(xx, xx.performance(X, y)) for
                  xx in node if xx not in visited]
            visited.update(xx for xx, _ in nn)
            # Keep only the children that do not degrade the performance.
            nn = [xx for xx, perf in nn if perf >= node.perf]
            if len(nn) == 0:
                continue
            # Ascending sort so the best candidate ends on top of the stack.
            nn.sort()
            for xx in nn:
                nodes.put(xx)
        return best
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2026 Coveralls, Inc