• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

pymc-devs / pymc3 / 8033

pending completion
8033

Pull #3115

travis-ci

web-flow
DOC Update GLM hogg outlier example. Newer sampling syntax, better model specification.
Pull Request #3115: DOC Update GLM hogg outlier example. Newer sampling syntax, better model specification.

17660 of 19801 relevant lines covered (89.19%)

4.71 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

78.53
/pymc3/data.py
1
from copy import copy
11✔
2
import io
11✔
3
import os
11✔
4
import pkgutil
11✔
5
import collections
11✔
6
import numpy as np
11✔
7
import pymc3 as pm
11✔
8
import theano.tensor as tt
11✔
9
import theano
11✔
10

11
# Public API of this module.
__all__ = [
    'get_data',
    'GeneratorAdapter',
    'Minibatch',
    'align_minibatches'
]
17

18

19
def get_data(filename):
    """Returns a BytesIO object for a package data file.

    Parameters
    ----------
    filename : str
        file to load, relative to the ``data`` directory of the
        ``pymc3.examples`` package

    Returns
    -------
    BytesIO of the data
    """
    data_pkg = 'pymc3.examples'
    # pkgutil.get_data takes a '/'-separated relative resource name on
    # every platform; os.path.join would produce '\\' on Windows.
    return io.BytesIO(pkgutil.get_data(data_pkg, 'data/' + filename))
32

33

34
class GenTensorVariable(tt.TensorVariable):
    """TensorVariable that is fed by a generator op.

    All generator / default-value management is delegated to the
    underlying ``op``.
    """

    def __init__(self, op, type, name=None):
        super(GenTensorVariable, self).__init__(type=type, name=name)
        self.op = op

    def set_gen(self, gen):
        # forward the replacement generator to the underlying op
        self.op.set_gen(gen)

    def set_default(self, value):
        # forward the fallback value to the underlying op
        self.op.set_default(value)

    def clone(self):
        # fresh variable over the same op/type/name with a copied tag
        duplicate = self.__class__(self.op, self.type, self.name)
        duplicate.tag = copy(self.tag)
        return duplicate
49

50

51
class GeneratorAdapter(object):
    """
    Helper class that helps to infer data type of generator with looking
    at the first item, preserving the order of the resulting generator
    """

    def __init__(self, generator):
        if not pm.vartypes.isgenerator(generator):
            raise TypeError('Object should be generator like')
        # Peek at the first element to learn dtype and ndim; it is
        # re-yielded first by __next__ so no data is lost.
        self.test_value = pm.smartfloatX(copy(next(generator)))
        # make pickling potentially possible
        self._yielded_test_value = False
        self.gen = generator
        self.tensortype = tt.TensorType(
            self.test_value.dtype,
            ((False, ) * self.test_value.ndim))

    def make_variable(self, gop, name=None):
        # Build a GenTensorVariable of this adapter's inferred type.
        var = GenTensorVariable(gop, self.tensortype, name)
        var.tag.test_value = self.test_value
        return var

    # python3 generator
    def __next__(self):
        # First call replays the cached test value, after that the
        # wrapped generator is consumed directly.
        if self._yielded_test_value:
            return pm.smartfloatX(copy(next(self.gen)))
        self._yielded_test_value = True
        return self.test_value

    # python2 generator
    next = __next__

    def __iter__(self):
        return self

    # Identity-based equality and hash so adapters are usable as keys.
    def __eq__(self, other):
        return self is other

    def __hash__(self):
        return hash(id(self))
92

93

94
class Minibatch(tt.TensorVariable):
    """Multidimensional minibatch that is pure TensorVariable

    Parameters
    ----------
    data : :class:`ndarray`
        initial data
    batch_size : `int` or `List[int|tuple(size, random_seed)]`
        batch size for inference, random seed is needed
        for child random generators
    dtype : `str`
        cast data to specific type
    broadcastable : tuple[bool]
        change broadcastable pattern that defaults to `(False, ) * ndim`
    name : `str`
        name for tensor, defaults to "Minibatch"
    random_seed : `int`
        random seed that is used by default
    update_shared_f : `callable`
        returns :class:`ndarray` that will be carefully
        stored to underlying shared variable
        you can use it to change source of
        minibatches programmatically
    in_memory_size : `int` or `List[int|slice|Ellipsis]`
        data size for storing in theano.shared

    Attributes
    ----------
    shared : shared tensor
        Used for storing data
    minibatch : minibatch tensor
        Used for training

    Examples
    --------
    Consider we have data
    >>> data = np.random.rand(100, 100)

    if we want 1d slice of size 10 we do
    >>> x = Minibatch(data, batch_size=10)

    Note, that your data is cast to `floatX` if it is not integer type
    But you still can add `dtype` kwarg for :class:`Minibatch`

    in case we want 10 sampled rows and columns
    `[(size, seed), (size, seed)]` it is
    >>> x = Minibatch(data, batch_size=[(10, 42), (10, 42)], dtype='int32')
    >>> assert str(x.dtype) == 'int32'

    or simpler with default random seed = 42
    `[size, size]`
    >>> x = Minibatch(data, batch_size=[10, 10])

    x is a regular :class:`TensorVariable` that supports any math
    >>> assert x.eval().shape == (10, 10)

    You can pass it to your desired model
    >>> with pm.Model() as model:
    ...     mu = pm.Flat('mu')
    ...     sd = pm.HalfNormal('sd')
    ...     lik = pm.Normal('lik', mu, sd, observed=x, total_size=(100, 100))

    Then you can perform regular Variational Inference out of the box
    >>> with model:
    ...     approx = pm.fit()

    Notable thing is that :class:`Minibatch` has `shared`, `minibatch`, attributes
    you can call later
    >>> x.set_value(np.random.laplace(size=(100, 100)))

    and minibatches will be then from new storage
    it directly affects `x.shared`.
    the same thing would be but less convenient
    >>> x.shared.set_value(pm.floatX(np.random.laplace(size=(100, 100))))

    programmatic way to change storage is as follows
    I import `partial` for simplicity
    >>> from functools import partial
    >>> datagen = partial(np.random.laplace, size=(100, 100))
    >>> x = Minibatch(datagen(), batch_size=10, update_shared_f=datagen)
    >>> x.update_shared()

    To be more concrete about how we get minibatch, here is a demo
    1) create shared variable
    >>> shared = theano.shared(data)

    2) create random slice of size 10
    >>> ridx = pm.tt_rng().uniform(size=(10,), low=0, high=data.shape[0]-1e-10).astype('int64')

    3) take that slice
    >>> minibatch = shared[ridx]

    That's done. Next you can use this minibatch somewhere else.
    You can see that implementation does not require fixed shape
    for shared variable. Feel free to use that if needed.

    Suppose you need some replacements in the graph, e.g. change minibatch to testdata
    >>> node = x ** 2  # arbitrary expressions on minibatch `x`
    >>> testdata = pm.floatX(np.random.laplace(size=(1000, 10)))

    Then you should create a dict with replacements
    >>> replacements = {x: testdata}
    >>> rnode = theano.clone(node, replacements)
    >>> assert (testdata ** 2 == rnode.eval()).all()

    To replace minibatch with its shared variable you should do
    the same things. Minibatch variable is accessible as an attribute
    as well as shared, associated with minibatch
    >>> replacements = {x.minibatch: x.shared}
    >>> rnode = theano.clone(node, replacements)

    For more complex slices some more code is needed that can seem not so clear
    >>> moredata = np.random.rand(10, 20, 30, 40, 50)

    default `total_size` that can be passed to `PyMC3` random node
    is then `(10, 20, 30, 40, 50)` but can be less verbose in some cases

    1) Advanced indexing, `total_size = (10, Ellipsis, 50)`
    >>> x = Minibatch(moredata, [2, Ellipsis, 10])

    We take slice only for the first and last dimension
    >>> assert x.eval().shape == (2, 20, 30, 40, 10)

    2) Skipping particular dimension, `total_size = (10, None, 30)`
    >>> x = Minibatch(moredata, [2, None, 20])
    >>> assert x.eval().shape == (2, 20, 20, 40, 50)

    3) Mixing that all, `total_size = (10, None, 30, Ellipsis, 50)`
    >>> x = Minibatch(moredata, [2, None, 20, Ellipsis, 10])
    >>> assert x.eval().shape == (2, 20, 20, 40, 10)
    """

    # Registry of the random streams created per instance (keyed by id)
    # so that `align_minibatches` can reseed them later.
    RNG = collections.defaultdict(list)

    @theano.configparser.change_flags(compute_test_value='raise')
    def __init__(self, data, batch_size=128, dtype=None, broadcastable=None, name='Minibatch',
                 random_seed=42, update_shared_f=None, in_memory_size=None):
        if dtype is None:
            # default: cast to floatX unless data is integer typed
            data = pm.smartfloatX(np.asarray(data))
        else:
            data = np.asarray(data, dtype)
        # keep only the requested in-memory window of the data
        in_memory_slc = self.make_static_slices(in_memory_size)
        self.shared = theano.shared(data[in_memory_slc])
        self.update_shared_f = update_shared_f
        self.random_slc = self.make_random_slices(self.shared.shape, batch_size, random_seed)
        minibatch = self.shared[self.random_slc]
        if broadcastable is None:
            broadcastable = (False, ) * minibatch.ndim
        minibatch = tt.patternbroadcast(minibatch, broadcastable)
        self.minibatch = minibatch
        super(Minibatch, self).__init__(
            self.minibatch.type, None, None, name=name)
        # make `self` a view of the minibatch node so it behaves as a
        # regular tensor in downstream graphs
        theano.Apply(
            theano.compile.view_op,
            inputs=[self.minibatch], outputs=[self])
        self.tag.test_value = copy(self.minibatch.tag.test_value)

    def rslice(self, total, size, seed):
        """Return a random integer index vector of length `size`
        drawn uniformly from `[0, total)`, or `slice(None)` if `size`
        is None. The rng is registered for later reseeding."""
        if size is None:
            return slice(None)
        elif isinstance(size, int):
            rng = pm.tt_rng(seed)
            Minibatch.RNG[id(self)].append(rng)
            # `- 1e-16` keeps the float draw strictly below `total`
            # so the int64 cast never yields an out-of-range index
            return (rng
                    .uniform(size=(size, ), low=0.0, high=pm.floatX(total) - 1e-16)
                    .astype('int64'))
        else:
            raise TypeError('Unrecognized size type, %r' % size)

    def __del__(self):
        # `.pop` instead of `del`: the key is absent when no random
        # slice was ever created (e.g. `batch_size=None`), and
        # defaultdict.__delitem__ would raise KeyError in that case.
        Minibatch.RNG.pop(id(self), None)

    @staticmethod
    def make_static_slices(user_size):
        """Translate `in_memory_size` into a list of indexing objects
        usable as `data[...]` to select the stored window."""
        if user_size is None:
            return [Ellipsis]
        elif isinstance(user_size, int):
            return slice(None, user_size)
        elif isinstance(user_size, (list, tuple)):
            slc = list()
            for i in user_size:
                if isinstance(i, int):
                    slc.append(i)
                elif i is None:
                    slc.append(slice(None))
                elif i is Ellipsis:
                    slc.append(Ellipsis)
                elif isinstance(i, slice):
                    slc.append(i)
                else:
                    raise TypeError('Unrecognized size type, %r' % user_size)
            return slc
        else:
            raise TypeError('Unrecognized size type, %r' % user_size)

    def make_random_slices(self, in_memory_shape, batch_size, default_random_seed):
        """Translate `batch_size` into a symbolic advanced-indexing
        expression over `in_memory_shape` (a symbolic shape tensor)."""
        if batch_size is None:
            return [Ellipsis]
        elif isinstance(batch_size, int):
            # simple case: random rows along the first dimension only
            slc = [self.rslice(in_memory_shape[0], batch_size, default_random_seed)]
        elif isinstance(batch_size, (list, tuple)):
            def check(t):
                # valid entries: Ellipsis, None, int, or (size, seed)
                if t is Ellipsis or t is None:
                    return True
                else:
                    if isinstance(t, (tuple, list)):
                        if not len(t) == 2:
                            return False
                        else:
                            return isinstance(t[0], int) and isinstance(t[1], int)
                    elif isinstance(t, int):
                        return True
                    else:
                        return False
            # end check definition
            if not all(check(t) for t in batch_size):
                raise TypeError('Unrecognized `batch_size` type, expected '
                                'int or List[int|tuple(size, random_seed)] where '
                                'size and random seed are both ints, got %r' %
                                batch_size)
            # normalize bare ints to (size, default_seed) pairs
            batch_size = [
                (i, default_random_seed) if isinstance(i, int) else i
                for i in batch_size
            ]
            shape = in_memory_shape
            if Ellipsis in batch_size:
                # split the spec around the Ellipsis; the middle
                # dimensions are taken in full via `tt.arange`
                sep = batch_size.index(Ellipsis)
                begin = batch_size[:sep]
                end = batch_size[sep + 1:]
                if Ellipsis in end:
                    raise ValueError('Double Ellipsis in `batch_size` is restricted, got %r' %
                                     batch_size)
                if len(end) > 0:
                    shp_mid = shape[sep:-len(end)]
                    mid = [tt.arange(s) for s in shp_mid]
                else:
                    mid = []
            else:
                begin = batch_size
                end = []
                mid = []
            # `.eval()` materializes the symbolic shape to count ndim
            if (len(begin) + len(end)) > len(in_memory_shape.eval()):
                raise ValueError('Length of `batch_size` is too big, '
                                 'number of ints is bigger than ndim, got %r'
                                 % batch_size)
            if len(end) > 0:
                shp_end = shape[-len(end):]
            else:
                shp_end = np.asarray([])
            shp_begin = shape[:len(begin)]
            # None entries keep a dimension whole; pairs draw a slice
            slc_begin = [self.rslice(shp_begin[i], t[0], t[1])
                         if t is not None else tt.arange(shp_begin[i])
                         for i, t in enumerate(begin)]
            slc_end = [self.rslice(shp_end[i], t[0], t[1])
                       if t is not None else tt.arange(shp_end[i])
                       for i, t in enumerate(end)]
            slc = slc_begin + mid + slc_end
        else:
            raise TypeError('Unrecognized size type, %r' % batch_size)
        return pm.theanof.ix_(*slc)

    def update_shared(self):
        """Refresh the underlying storage from `update_shared_f`.

        Raises
        ------
        NotImplementedError
            if no `update_shared_f` was supplied to `__init__`
        """
        if self.update_shared_f is None:
            raise NotImplementedError("No `update_shared_f` was provided to `__init__`")
        self.set_value(np.asarray(self.update_shared_f(), self.dtype))

    def set_value(self, value):
        """Store `value` (cast to this tensor's dtype) in `self.shared`."""
        self.shared.set_value(np.asarray(value, self.dtype))

    def clone(self):
        # fresh variable of the same type/name with a copied tag
        ret = self.type()
        ret.name = self.name
        ret.tag = copy(self.tag)
        return ret
369

370

371
def align_minibatches(batches=None):
    """Reseed the random streams behind minibatches so they draw aligned slices.

    Parameters
    ----------
    batches : List[Minibatch] or None
        minibatches whose random generators should be reseeded;
        if None, the generators of every live Minibatch are reseeded

    Raises
    ------
    TypeError
        if an element of `batches` is not a Minibatch
    """
    if batches is None:
        # reseed every rng registered by any live Minibatch
        for rngs in Minibatch.RNG.values():
            for rng in rngs:
                rng.seed()
    else:
        for b in batches:
            if not isinstance(b, Minibatch):
                # bug fix: the old message was the literal text
                # '{b} is not a Minibatch' — `b` was never interpolated
                raise TypeError('%r is not a Minibatch' % b)
            for rng in Minibatch.RNG[id(b)]:
                rng.seed()
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2025 Coveralls, Inc