• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

INGEOTEC / text_models / 4219223887

pending completion
4219223887

push

github

Mario Graff
test

1728 of 1844 relevant lines covered (93.71%)

0.94 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

93.28
/text_models/model_selection.py
1
# Copyright 2020 Mario Graff Guerrero
2

3
# Licensed under the Apache License, Version 2.0 (the "License");
4
# you may not use this file except in compliance with the License.
5
# You may obtain a copy of the License at
6

7
#     http://www.apache.org/licenses/LICENSE-2.0
8

9
# Unless required by applicable law or agreed to in writing, software
10
# distributed under the License is distributed on an "AS IS" BASIS,
11
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
# See the License for the specific language governing permissions and
13
# limitations under the License.
14
import logging
1✔
15
from EvoMSA.base import EvoMSA
1✔
16
from EvoMSA.utils import LabelEncoderWrapper
1✔
17
from queue import LifoQueue
1✔
18
from microtc.utils import save_model
1✔
19
from sklearn.metrics import f1_score
1✔
20
from sklearn.model_selection import KFold
1✔
21
from text_models.utils import macro_f1
1✔
22
import numpy as np
1✔
23
import os
1✔
24

25

26
class Node(object):
    """
    Base class to perform model selection on the first-stage models.

    A node represents one subset of the models dictionary; iterating a node
    yields its children, i.e., the nodes obtained by adding one more model.

    >>> from EvoMSA import base
    >>> from EvoMSA.utils import download
    >>> from text_models.model_selection import Node
    >>> from text_models.utils import macro_f1
    >>> from microtc.utils import tweet_iterator
    >>> from sklearn.model_selection import KFold
    >>> import numpy as np
    >>> import os
    >>> tweets = os.path.join(os.path.dirname(base.__file__), 'tests', 'tweets.json')
    >>> D = [[x['text'], x['klass']] for x in tweet_iterator(tweets)]
    >>> models = dict()
    >>> models[0] = [download("emo_Es.tm"), "sklearn.svm.LinearSVC"]
    >>> models[1] = ["EvoMSA.model.AggressivenessEs", "sklearn.svm.LinearSVC"]
    >>> models[2] = [download("b4msa_Es.tm"), "sklearn.svm.LinearSVC"]
    >>> X = [x for x, y in D]
    >>> y = [y for x, y in D]
    >>> kf = KFold(n_splits=3, random_state=1, shuffle=True)
    >>> node = Node([0], models=models, split_dataset=kf, aggregate=np.median, cache=os.path.join("cache", "fw"), metric=macro_f1)
    >>> perf = node.performance(X, y)
    >>> [x for x in node]
    [0-1, 0-2]
    >>> model = node.fit(X, y)


    :param model: Models used in the node - List of keys from :py:attr:`models`
    :type model: list
    :param models: Dictionary of pairs (see :py:attr:`EvoMSA.base.EvoMSA.models`)
    :type models: dict
    :param metric: Performance metric, e.g., accuracy
    :type metric: function
    :param split_dataset: Iterator to split dataset in training and validation
    :type split_dataset: instance
    :param aggregate: :math:`\\text{aggregate}: \\mathbb R^d \\rightarrow \\mathbb R`
    :type aggregate: function
    :param cache: Store the output of text models
    :type cache: str
    :param TR: EvoMSA's default model
    :type TR: bool
    :param stacked_method: Classifier or regressor used to ensemble the outputs of :attr:`EvoMSA.models`
    :type stacked_method: str or class

    """

    def __init__(self, model, models=None,
                 metric=None,
                 split_dataset=None,
                 aggregate=None,
                 cache=None,
                 TR=False,
                 stacked_method="sklearn.naive_bayes.GaussianNB",
                 **kwargs):
        assert metric is not None
        assert split_dataset is not None and hasattr(split_dataset, "split")
        assert aggregate is not None
        assert cache is not None
        self._models = models
        # Keep the caller's insertion order; the textual representation is
        # the sorted keys so that, e.g., [0, 2] and [2, 0] compare equal.
        self._model = [x for x in model]
        self._repr = "-".join(map(str, sorted(self._model)))
        self._metric = metric
        self._split_dataset = split_dataset
        self._aggregate = aggregate
        self._cache = cache
        self._TR = TR
        self._kwargs = kwargs
        self._kwargs.update(dict(stacked_method=stacked_method))

    def __repr__(self):
        return self._repr

    def __eq__(self, other):
        # Two nodes are equal when they hold the same set of model keys.
        return isinstance(other, Node) and str(self) == str(other)

    def __hash__(self):
        # Consistent with __eq__: nodes with the same keys hash alike.
        return hash(str(self))

    def __iter__(self):
        """Yield the children of this node, each extends it with one model"""

        variables = set(self._models.keys())
        model = self._model
        for x in variables - set(model):
            yield self.__class__(model + [x],
                                 models=self._models,
                                 metric=self._metric,
                                 split_dataset=self._split_dataset,
                                 aggregate=self._aggregate,
                                 cache=self._cache,
                                 TR=self._TR,
                                 **self._kwargs)

    @property
    def model(self):
        """Models as received by :py:class:`EvoMSA.base.EvoMSA`"""

        models = self._models
        return [models[x] for x in self._model]

    def fit(self, X, y):
        """Train an EvoMSA instance on the whole dataset (no cache)

        :param X: Training set - independent variables
        :type X: list
        :param y: Training set - dependent variable
        :type y: list or np.array
        :rtype: :py:class:`EvoMSA.base.EvoMSA`
        """

        return self._fit(X, y, None)

    def _fit(self, X, y, cache):
        """Create and train an EvoMSA instance

        :param X: Training set - independent variables
        :type X: list
        :param y: Training set - dependent variable
        :type y: list or np.array
        :param cache: Basename used to cache the text-model outputs
        :type cache: str or None
        :rtype: :py:class:`EvoMSA.base.EvoMSA`
        """

        return EvoMSA(TR=self._TR, models=self.model,
                      cache=cache,
                      **self._kwargs).fit(X, y)

    @property
    def perf(self):
        """Performance (only available after :py:func:`Node.performance`)"""
        return self._perf

    def performance(self, X, y):
        """Compute the performance on the dataset

        The value is computed once and memoized in ``self._perf``; the
        dataset is split with :py:attr:`split_dataset` and the per-fold
        metrics are combined with :py:attr:`aggregate`.

        :param X: Test set - independent variables
        :type X: list
        :param y: Test set - dependent variable
        :type y: list or np.array
        :rtype: float
        """

        try:
            return self._perf
        except AttributeError:
            perf = []
            cache = self._cache
            for index, (tr, vs) in enumerate(self._split_dataset.split(X)):
                evo = self._fit([X[x] for x in tr],
                                [y[x] for x in tr],
                                cache=cache + "-tr-" + str(index))
                hy = evo.predict([X[x] for x in vs],
                                 cache=cache + "-vs-" + str(index))
                perf.append(self._metric([y[x] for x in vs], hy))
            self._perf = self._aggregate(perf)
        return self._perf

    def __gt__(self, other):
        # Order nodes by their (already computed) performance.
        return self.perf > other.perf
184
class ForwardSelection(object):
    """Forward Selection on the models

    >>> from EvoMSA import base
    >>> from EvoMSA.utils import download
    >>> from text_models.model_selection import ForwardSelection
    >>> from microtc.utils import tweet_iterator
    >>> import os

    Read the dataset

    >>> tweets = os.path.join(os.path.dirname(base.__file__), 'tests', 'tweets.json')
    >>> D = [[x['text'], x['klass']] for x in tweet_iterator(tweets)]

    Models

    >>> models = dict()
    >>> models[0] = [download("emo_Es.tm"), "sklearn.svm.LinearSVC"]
    >>> models[1] = ["EvoMSA.model.AggressivenessEs", "sklearn.svm.LinearSVC"]
    >>> models[2] = [download("b4msa_Es.tm"), "sklearn.svm.LinearSVC"]
    >>> X = [x for x, y in D]
    >>> y = [y for x, y in D]
    >>> fwdSel = ForwardSelection(models)
    >>> best = fwdSel.run(X, y)

    :param models: Dictionary of pairs (see :py:attr:`EvoMSA.base.EvoMSA.models`)
    :type models: dict
    :param node: Node use to perform the search
    :type node: :py:class:`text_models.model_selection.Node`
    :param output: Filename to store intermediate models
    :type output: str
    :param verbose: Level to inform the user
    :type verbose: int
    :param metric: Performance metric
    :type metric: function
    :param split_dataset: Iterator to split dataset in training and validation
    :type split_dataset: instance
    :param aggregate: :math:`\\text{aggregate}: \\mathbb R^d \\rightarrow \\mathbb R`
    :type aggregate: function
    :param classifier: Whether the problem is classification (guides the label encoder)
    :type classifier: bool
    :param cache: Store the output of text models
    :type cache: str

    """

    def __init__(self, models, node=Node,
                 output=None, verbose=logging.INFO,
                 metric=macro_f1,
                 split_dataset=KFold(n_splits=3, random_state=1, shuffle=True),
                 aggregate=np.median,
                 classifier=True,
                 cache=os.path.join("cache", "fw"),
                 **kwargs):
        self._models = models
        # One starting node per individual model; the search grows from these.
        self._nodes = [node([k], models=models,
                            metric=metric,
                            split_dataset=split_dataset,
                            aggregate=aggregate,
                            cache=cache,
                            classifier=classifier,
                            **kwargs) for k in models.keys()]
        self._output = output
        self._logger = logging.getLogger("text_models.model_selection")
        self._logger.setLevel(verbose)
        self._le = LabelEncoderWrapper(classifier=classifier)

    def run(self, X, y):
        """Perform the search using X and y to guide it

        Greedy forward selection: start from the best single-model node and
        keep adding the model that most improves the performance, stopping
        when no extension improves on the current node.

        :param X: Dataset set - independent variables
        :type X: list
        :param y: Dataset set - dependent variable
        :type y: list or np.array
        :rtype: :py:class:`text_models.model_selection.Node`
        """

        self._logger.info("Starting the search")
        self._le = self._le.fit(y)
        y = self._le.transform(y)
        r = [(node.performance(X, y), node) for node in self._nodes]
        node = max(r, key=lambda x: x[0])[1]
        while True:
            self._logger.info("Model: %s perf: %0.4f" % (node, node.perf))
            nodes = list(node)
            # No children left: every model is already in the node.
            if len(nodes) == 0:
                if self._output:
                    save_model(node, self._output)
                return node
            r = [(xx.performance(X, y), xx) for xx in nodes]
            perf, comp = max(r, key=lambda x: x[0])
            # Stop when the best extension is strictly worse; ties continue.
            if perf < node.perf:
                break
            node = comp
        if self._output:
            save_model(node, self._output)
        return node
281
class BeamSelection(ForwardSelection):
    """
    Select the models using Beam Search.

    Uses a LIFO stack seeded with the best single-model node; each node is
    expanded into its children and the promising ones (performance not worse
    than the parent) are pushed back, best candidate on top.
    """

    def run(self, X, y, early_stopping=1000):
        """Perform the search using X and y to guide it

        :param X: Dataset - independent variables
        :type X: list
        :param y: Dataset - dependent variable
        :type y: list or np.array
        :param early_stopping: Number of rounds to perform early stopping
        :type early_stopping: int
        :rtype: :py:class:`text_models.model_selection.Node`
        """
        self._le = self._le.fit(y)
        y = self._le.transform(y)
        visited = [(node.performance(X, y), node) for
                   node in self._nodes]
        _ = max(visited, key=lambda x: x[0])[1]
        best = None
        nodes = LifoQueue()
        nodes.put(_)
        # index marks how many nodes had been visited at the last
        # improvement; len(visited) - index counts rounds without progress.
        index = len(visited)
        visited = set([x[1] for x in visited])
        while not nodes.empty() and (len(visited) - index) < early_stopping:
            node = nodes.get()
            if best is None or node > best:
                index = len(visited)
                best = node
                if self._output:
                    save_model(best, self._output)
            self._logger.info("Model: %s perf: %0.4f " % (best, best.perf) +
                              "visited: %s " % len(visited) +
                              "size: %s " % nodes.qsize() +
                              "Rounds: %s" % (len(visited) - index))
            # Expand the current node, skipping configurations already seen.
            nn = [(xx, xx.performance(X, y)) for
                  xx in node if xx not in visited]
            visited.update(xx for xx, _ in nn)
            # Keep only the children that do not degrade the performance.
            nn = [xx for xx, perf in nn if perf >= node.perf]
            if len(nn) == 0:
                continue
            # Ascending sort so the best candidate ends on top of the stack.
            nn.sort()
            for xx in nn:
                nodes.put(xx)
        return best
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2026 Coveralls, Inc