INGEOTEC / microtc / build 11389680303 (push, github)

17 Oct 2024 05:07PM UTC. Coverage: 89.32% (+0.1%) from 89.175%.

Commit by mgraffg: Normalize emojis

89 of 92 new or added lines in 6 files covered (96.74%).
1 existing line in 1 file is now uncovered.
2074 of 2322 relevant lines covered (89.32%), 2.68 hits per line.

Source file: /microtc/textmodel.py (86.76% covered)
# Copyright 2016-2017 Eric S. Tellez

# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at

#     http://www.apache.org/licenses/LICENSE-2.0

# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import re
import unicodedata
import os
import numpy as np
from microtc.params import OPTION_DELETE, OPTION_GROUP, OPTION_NONE
from microtc.emoticons import EmoticonClassifier, read_emojis, create_data_structure, replace_token
from microtc.utils import get_class, SparseMatrix
from typing import Union


PUNCTUACTION = ";:,.@\\-\"'/"
SYMBOLS = "()[]¿?¡!{}~<>|"
SKIP_SYMBOLS = set(PUNCTUACTION + SYMBOLS)
SKIP_SYMBOLS_AND_SPACES = set(PUNCTUACTION + SYMBOLS + '\t\n\r ')
# SKIP_WORDS = set(["…", "..", "...", "...."])
WEIGHTING = dict(tfidf="microtc.weighting.TFIDF",
                 tf="microtc.weighting.TF",
                 entropy="microtc.weighting.Entropy")
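
# Note: TextModel below resolves its `weighting` argument with
# WEIGHTING.get(weighting, weighting) followed by get_class(...), so besides
# the keys above ('tfidf', 'tf', 'entropy') a fully qualified class path such
# as 'microtc.weighting.TFIDF' should also be accepted.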


def norm_chars(text, del_diac=True, del_dup=True, del_punc=False):
    """
    Transform text by removing diacritics, consecutive duplicated characters,
    and punctuation. It adds ~ at the beginning and the end, and spaces are
    replaced by ~.

    :param text: Text
    :type text: str
    :param del_diac: Delete diacritics
    :type del_diac: bool
    :param del_dup: Delete consecutive duplicated characters
    :type del_dup: bool
    :param del_punc: Delete punctuation symbols
    :type del_punc: bool
    :rtype: str

    Example:

    >>> from microtc.textmodel import norm_chars
    >>> norm_chars("Life is good at Méxicoo.")
    '~Life~is~god~at~Mexico.~'

    """

    cadena = '~'
    prev = '~'
    fin_linea = set(['\n', '\r', ' ', '\t', '\xa0'])
    for u in unicodedata.normalize('NFD', text):
        if del_diac:
            o = ord(u)
            # skip combining diacritical marks (U+0300-U+036F)
            if 0x300 <= o <= 0x036F:
                continue
        if u in fin_linea:
            u = '~'
        elif del_dup and prev == u:
            continue
        elif del_punc and u in SKIP_SYMBOLS:
            prev = u
            continue
        prev = u
        cadena = cadena + u
    cadena = cadena + '~'
    return cadena
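
# Illustrative variations on the defaults above (worked out from the code,
# not part of the original docstring):
#
# >>> norm_chars("Life is good at Méxicoo.", del_punc=True)
# '~Life~is~god~at~Mexico~'
# >>> norm_chars("Life is good at Méxicoo.", del_dup=False)
# '~Life~is~good~at~Mexicoo.~'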


def get_word_list(text):
    """
    Transform a text (beginning and ending with ~) into a list of words.
    It is called after :py:func:`microtc.textmodel.norm_chars`.

    Example

    >>> from microtc.textmodel import get_word_list
    >>> get_word_list("~Someone's house.~")
    ['Someone', 's', 'house']

    :param text: text
    :type text: str

    :rtype: list
    """

    cadena = ''
    prev = ' '
    # skip the leading and trailing ~ added by norm_chars
    for u in text[1:-1]:
        if u in SKIP_SYMBOLS:
            u = ' '
        if prev == ' ' and u == ' ':
            continue
        if prev == ' ' and u == "'":
            continue
        cadena = cadena + u
        prev = u
    return cadena.split()


def expand_qgrams(text, qsize, output):
    """Expands a text into a list of q-grams.

    :param text: Text
    :type text: str
    :param qsize: q-gram size
    :type qsize: int
    :param output: output
    :type output: list

    :returns: output
    :rtype: list

    Example:

    >>> from microtc.textmodel import expand_qgrams
    >>> output = list()
    >>> expand_qgrams("Good morning.", 3, output)
    ['q:Goo', 'q:ood', 'q:od ', 'q:d m', 'q: mo', 'q:mor', 'q:orn', 'q:rni', 'q:nin', 'q:ing', 'q:ng.']
    """
    unir = "".join
    # slide a window of qsize characters over the text
    output.extend(["q:" + unir(a)
                   for a in zip(*[text[i:] for i in range(qsize)])])
    return output
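
# In the full pipeline the input is padded with ~ by text_transformations, so
# character q-grams can encode word boundaries (illustrative example):
#
# >>> expand_qgrams("~ab~", 3, list())
# ['q:~ab', 'q:ab~']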


def expand_qgrams_word_list(wlist, qsize, output, sep='~'):
    """Expands a list of words into a list of q-grams. It uses `sep` to join words.

    :param wlist: List of words computed by :py:func:`microtc.textmodel.get_word_list`.
    :type wlist: list
    :param qsize: q-gram size of words
    :type qsize: int
    :param output: output
    :type output: list
    :param sep: String used to join the words
    :type sep: str

    :returns: output
    :rtype: list

    Example:

    >>> from microtc.textmodel import expand_qgrams_word_list
    >>> wlist = ["Good", "morning", "Mexico"]
    >>> expand_qgrams_word_list(wlist, 2, list())
    ['Good~morning', 'morning~Mexico']
    """

    n = len(wlist)

    for start in range(n - qsize + 1):
        t = sep.join(wlist[start:start+qsize])
        output.append(t)

    return output


def expand_skipgrams_word_list(wlist, qsize, output, sep='~'):
    """Expands a list of words into a list of skipgrams. It uses `sep` to join words.

    :param wlist: List of words computed by :py:func:`microtc.textmodel.get_word_list`.
    :type wlist: list
    :param qsize: (qsize, skip) qsize is the q-gram size and skip is the number of words ahead.
    :type qsize: tuple
    :param output: output
    :type output: list
    :param sep: String used to join the words
    :type sep: str

    :returns: output
    :rtype: list

    Example:

    >>> from microtc.textmodel import expand_skipgrams_word_list
    >>> wlist = ["Good", "morning", "Mexico"]
    >>> expand_skipgrams_word_list(wlist, (2, 1), list())
    ['Good~Mexico']

    """
    n = len(wlist)
    qsize, skip = qsize
    # each skipgram spans qsize words plus skip skipped words between them
    for start in range(n - (qsize + (qsize - 1) * skip) + 1):
        if qsize == 2:
            t = wlist[start] + sep + wlist[start+1+skip]
        else:
            t = sep.join([wlist[start + i * (1+skip)] for i in range(qsize)])

        output.append(t)

    return output
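
# For qsize > 2 the general branch above steps through the word list with a
# stride of skip + 1 (illustrative example):
#
# >>> expand_skipgrams_word_list(['a', 'b', 'c', 'd', 'e'], (3, 1), list())
# ['a~c~e']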


class TextModel(SparseMatrix):
    """

    :param docs: Corpus
    :type docs: list
    :param text: When the corpus elements are dicts, text is the key containing the text
    :type text: str
    :param num_option: Transformations on numbers (none | group | delete)
    :type num_option: str
    :param usr_option: Transformations on users (none | group | delete)
    :type usr_option: str
    :param url_option: Transformations on urls (none | group | delete)
    :type url_option: str
    :param emo_option: Transformations on emojis and emoticons (none | group | delete)
    :type emo_option: str
    :param hashtag_option: Transformations on hashtags (none | group | delete)
    :type hashtag_option: str
    :param ent_option: Transformations on entities (none | group | delete)
    :type ent_option: str

    :param lc: Lower case
    :type lc: bool
    :param del_dup: Remove consecutive duplicated characters, e.g. hooola -> hola
    :type del_dup: bool
    :param del_punc: Remove punctuation symbols
    :type del_punc: bool
    :param del_diac: Remove diacritics
    :type del_diac: bool
    :param token_list: Positive integers are character q-gram sizes, negative integers are word n-gram sizes, and (qsize, skip) tuples are skip-grams
    :type token_list: list
    :param token_min_filter: Keep those tokens that appear more times than the parameter (used in weighting class)
    :type token_min_filter: int or float
    :param token_max_filter: Keep those tokens that appear less times than the parameter (used in weighting class)
    :type token_max_filter: int or float
    :param q_grams_words: Compute q-grams only on words
    :type q_grams_words: bool

    :param select_ent: Keep only entity-like sequences (@user, #hashtag, capitalized words)
    :type select_ent: bool
    :param select_suff: Keep only tokens ending with punctuation or space
    :type select_suff: bool
    :param select_conn: Keep only tokens with an inner ~, i.e., connectors
    :type select_conn: bool

    :param weighting: Weighting scheme (tfidf | tf | entropy)
    :type weighting: class or str

    :param norm_emojis: Normalize emojis
    :type norm_emojis: bool

    Usage:

    >>> from microtc.textmodel import TextModel
    >>> corpus = ['buenos dias', 'catedras conacyt', 'categorizacion de texto ingeotec']

    Using default parameters

    >>> textmodel = TextModel().fit(corpus)

    Represent a text whose words are in the corpus and one that is not

    >>> vector = textmodel['categorizacion ingoetec']
    >>> vector2 = textmodel['cat']

    Using a different token_list

    >>> textmodel = TextModel(token_list=[[2, 1], -1, 3, 4]).fit(corpus)
    >>> vector = textmodel['categorizacion ingoetec']
    >>> vector2 = textmodel['cat']

    Train a classifier

    >>> from sklearn.svm import LinearSVC
    >>> y = [1, 0, 0]
    >>> textmodel = TextModel().fit(corpus)
    >>> m = LinearSVC().fit(textmodel.transform(corpus), y)
    >>> m.predict(textmodel.transform(corpus))
    array([1, 0, 0])
    """

    def __init__(self, docs=None, text: str='text',
                 num_option: str=OPTION_GROUP,
                 usr_option: str=OPTION_GROUP,
                 url_option: str=OPTION_GROUP,
                 emo_option: str=OPTION_GROUP,
                 hashtag_option: str=OPTION_NONE,
                 ent_option: str=OPTION_NONE,
                 lc: bool=True, del_dup: bool=False,
                 del_punc: bool=True, del_diac: bool=True,
                 token_list: list=[-1],
                 token_min_filter: Union[int, float]=0,
                 token_max_filter: Union[int, float]=1,
                 select_ent: bool=False,
                 select_suff: bool=False, select_conn: bool=False,
                 weighting: str='tfidf',
                 q_grams_words: bool=False,
                 max_dimension: bool=False,
                 unit_vector: bool=True,
                 norm_emojis: bool=True):
        # the TEXT environment variable overrides the `text` argument
        self._text = os.getenv('TEXT', default=text)
        self.del_diac = del_diac
        self.num_option = num_option
        self.usr_option = usr_option
        self.url_option = url_option
        self.emo_option = emo_option
        self.ent_option = ent_option
        self.select_ent = select_ent
        self.select_suff = select_suff
        self.select_conn = select_conn
        self.hashtag_option = hashtag_option
        self.lc = lc
        self.del_dup = del_dup
        self.del_punc = del_punc
        self.token_list = token_list
        self.token_min_filter = token_min_filter
        self.token_max_filter = token_max_filter
        # resolve known scheme names through WEIGHTING; other values pass through
        self.weighting = WEIGHTING.get(weighting, weighting)
        self._q_grams_words = q_grams_words
        self._max_dimension = max_dimension
        self.unit_vector = unit_vector
        self.norm_emojis = norm_emojis
        if emo_option == OPTION_NONE:
            self.emo_map = None
        else:
            self.emo_map = EmoticonClassifier()
        if self.norm_emojis:
            self.norm_tokens = read_emojis()
            _ = {x: True for x in self.norm_tokens}
            self.norm_head = create_data_structure(_)
        if docs is not None and len(docs):
            self.fit(docs)

    @property
    def unit_vector(self):
        """Whether vectors are normalized to unit length (forwarded to the weighting model)."""
        try:
            return self._unit_vector
        except AttributeError:
            self._unit_vector = True
        return self._unit_vector

    @unit_vector.setter
    def unit_vector(self, value):
        self._unit_vector = value
        if hasattr(self, 'model'):
            self.model.unit_vector = value

    @property
    def q_grams_words(self):
        """Compute q-grams only on words (see :py:attr:`token_list`)."""
        try:
            return self._q_grams_words
        except AttributeError:
            return False

    @property
    def max_dimension(self):
        try:
            return self._max_dimension
        except AttributeError:
            return False

    # @property
    # def token_list(self):
    #     """Tokenizer parameters"""
    #     return self._token_list

    # @token_list.setter
    # def token_list(self, value):
    #     """
    #     >>> from microtc import TextModel
    #     >>> tm = TextModel()
    #     >>> tm.token_list = [-2, -1]
    #     >>> tm.token_list
    #     [-2, -1]
    #     """
    #     self._token_list = value
    #     for x in ['_q_grams', '_n_grams', '_skip_grams']:
    #         try:
    #             delattr(self, x)
    #         except AttributeError:
    #             continue

    @property
    def q_grams(self):
        """q-grams of characters

        >>> from microtc import TextModel
        >>> tm = TextModel(token_list=[-1, 3, (2, 1)])
        >>> tm.q_grams
        [3]
        """
        try:
            q_grams = self._q_grams
        except AttributeError:
            q_grams = [x for x in self.token_list if isinstance(x, int) and x > 0]
            self._q_grams = q_grams
        return q_grams

    @property
    def n_grams(self):
        """n-grams of words

        >>> from microtc import TextModel
        >>> tm = TextModel(token_list=[-1, 3, (2, 1)])
        >>> tm.n_grams
        [-1]
        """
        try:
            output = self._n_grams
        except AttributeError:
            output = [x for x in self.token_list if isinstance(x, int) and x < 0]
            self._n_grams = output
        return output

    @property
    def skip_grams(self):
        """skip-grams

        >>> from microtc import TextModel
        >>> tm = TextModel(token_list=[-1, 3, (2, 1)])
        >>> tm.skip_grams
        [(2, 1)]
        """
        try:
            output = self._skip_grams
        except AttributeError:
            output = [x for x in self.token_list if not isinstance(x, int)]
            self._skip_grams = output
        return output

    def fit(self, X):
        """
        Train the model

        :param X: Corpus
        :type X: list
        :rtype: instance
        """

        tokens = [self.tokenize(d) for d in X]
        self.model = get_class(self.weighting)(tokens, X=X,
                                               token_min_filter=self.token_min_filter,
                                               token_max_filter=self.token_max_filter,
                                               max_dimension=self.max_dimension,
                                               unit_vector=self.unit_vector)
        return self
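
    # A minimal sketch of fit with a non-default weighting scheme (assuming
    # the 'tf' entry of WEIGHTING resolves as described near the top of the
    # file):
    #
    # >>> from microtc.textmodel import TextModel
    # >>> corpus = ['buenos dias', 'catedras conacyt']
    # >>> tm = TextModel(weighting='tf').fit(corpus)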

    def __getitem__(self, text):
        """Convert text into a vector

        :param text: Text to be transformed
        :type text: str

        :rtype: list
        """
        return self.model[self.tokenize(text)]

    @classmethod
    def params(cls):
        """
        Parameters

        >>> from microtc.textmodel import TextModel
        >>> TextModel.params()
        odict_keys(['docs', 'text', 'num_option', 'usr_option', 'url_option', 'emo_option', 'hashtag_option', 'ent_option', 'lc', 'del_dup', 'del_punc', 'del_diac', 'token_list', 'token_min_filter', 'token_max_filter', 'select_ent', 'select_suff', 'select_conn', 'weighting', 'q_grams_words', 'max_dimension', 'unit_vector', 'norm_emojis'])
        """

        import inspect
        sig = inspect.signature(cls)
        params = sig.parameters.keys()
        return params

    def transform(self, texts):
        """Convert texts into vectors

        :param texts: List of texts to be transformed
        :type texts: list

        :rtype: list

        Example:

        >>> from microtc.textmodel import TextModel
        >>> corpus = ['buenos dias catedras', 'catedras conacyt']
        >>> textmodel = TextModel().fit(corpus)
        >>> X = textmodel.transform(corpus)
        """
        return self.tonp([self.__getitem__(x) for x in texts])

    def vectorize(self, text):
        raise RuntimeError('Not implemented')

    def tokenize(self, text):
        """Transform text to tokens.
        The procedure is:

        - :py:func:`microtc.textmodel.TextModel.text_transformations`.
        - :py:func:`microtc.textmodel.TextModel.compute_tokens`.
        - :py:func:`microtc.textmodel.TextModel.select_tokens`.

        :param text: Text
        :type text: str or list

        :rtype: list

        Example:

        >>> from microtc.textmodel import TextModel
        >>> tm = TextModel()
        >>> tm.tokenize("buenos dias")
        ['buenos', 'dias']
        >>> tm.tokenize(["buenos", "dias", "tenga usted"])
        ['buenos', 'dias', 'tenga', 'usted']
        """

        if isinstance(text, dict):
            text = self.get_text(text)

        if isinstance(text, (list, tuple)):
            tokens = []
            for _text in text:
                tokens.extend(self._tokenize(_text))

            return tokens
        else:
            return self._tokenize(text)

    def get_text(self, text):
        """Return the value stored under the key :py:attr:`self._text` of `text`

        :param text: Text
        :type text: dict
        """

        return text[self._text]

    @property
    def disable_text_transformations(self):
        """When True, :py:func:`text_transformations` returns the text unchanged."""
        try:
            return self._disable_text_transformations
        except AttributeError:
            return False

    @disable_text_transformations.setter
    def disable_text_transformations(self, v):
        self._disable_text_transformations = v

    def text_transformations(self, text):
        """
        Text transformations. It starts by analyzing emojis, hashtags, entities,
        lower case, numbers, URLs, and users. After these transformations are applied
        to the text, it calls :py:func:`microtc.textmodel.norm_chars`.

        :param text:
        :type text: str

        :rtype: str

        Example:

        >>> from microtc.textmodel import TextModel
        >>> tm = TextModel(del_dup=False)
        >>> tm.text_transformations("Life is good at México @mgraffg.")
        '~life~is~good~at~mexico~_usr~'
        """

        if text is None:
            text = ''

        if isinstance(text, dict):
            text = self.get_text(text)

        if self.disable_text_transformations:
            return text

        if self.emo_map:
            text = self.emo_map.replace(text, option=self.emo_option)

        if self.select_ent:
            text = " ".join(re.findall(r"(@\S+|#\S+|[A-Z]\S+)", text))

        if self.hashtag_option == OPTION_DELETE:
            text = re.sub(r"#\S+", "", text)
        elif self.hashtag_option == OPTION_GROUP:
            text = re.sub(r"#\S+", "_htag", text)

        if self.ent_option == OPTION_DELETE:
            text = re.sub(r"[A-Z][a-z]+", "", text)
        elif self.ent_option == OPTION_GROUP:
            text = re.sub(r"[A-Z][a-z]+", "_ent", text)

        if self.lc:
            text = text.lower()

        if self.num_option == OPTION_DELETE:
            text = re.sub(r"\d\d*\.?\d*|\d*\.\d\d*", "", text)
        elif self.num_option == OPTION_GROUP:
            text = re.sub(r"\d\d*\.?\d*|\d*\.\d\d*", "_num", text)

        if self.url_option == OPTION_DELETE:
            text = re.sub(r"https?://\S+", "", text)
        elif self.url_option == OPTION_GROUP:
            text = re.sub(r"https?://\S+", "_url", text)

        if self.usr_option == OPTION_DELETE:
            text = re.sub(r"@\S+", "", text)
        elif self.usr_option == OPTION_GROUP:
            text = re.sub(r"@\S+", "_usr", text)

        text = norm_chars(text, del_diac=self.del_diac,
                          del_dup=self.del_dup,
                          del_punc=self.del_punc)
        if self.norm_emojis:
            return replace_token(self.norm_tokens,
                                 self.norm_head, text)
        return text
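
    # Illustrative behavior of the grouping options above (assuming the
    # emoticon classifier leaves plain words untouched, as in the doctest):
    #
    # >>> tm = TextModel(del_dup=False)
    # >>> tm.text_transformations("great day @mgraffg 10")
    # '~great~day~_usr~_num~'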

    def get_word_list(self, *args, **kwargs):
        """Wrapper around :py:func:`microtc.textmodel.get_word_list`."""
        return get_word_list(*args, **kwargs)

    def compute_n_grams(self, textlist):
        """Word n-grams from a word list; uses the negative entries of :py:attr:`token_list`."""
        output = []
        for q in self.n_grams:
            expand_qgrams_word_list(textlist, abs(q), output)
        return output
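
    # Worked example (mirrors the compute_tokens doctest below):
    #
    # >>> tm = TextModel(token_list=[-2, -1])
    # >>> tm.compute_n_grams(['Good', 'morning'])
    # ['Good~morning', 'Good', 'morning']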

    def compute_skip_grams(self, textlist):
        """Skip-grams from a word list; uses the tuple entries of :py:attr:`token_list`."""
        output = []
        for q in self.skip_grams:
            expand_skipgrams_word_list(textlist, q, output)
        return output
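
    # Worked example (mirrors the compute_tokens doctest below):
    #
    # >>> tm = TextModel(token_list=[(2, 1)])
    # >>> tm.compute_skip_grams(['abc', 'x', 'de'])
    # ['abc~de']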

    def compute_q_grams(self, text):
        """Character q-grams from the text; uses the positive entries of :py:attr:`token_list`."""
        output = []
        for q in self.q_grams:
            expand_qgrams(text, q, output)
        return output
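
    # Worked example (mirrors the compute_tokens doctest below):
    #
    # >>> tm = TextModel(token_list=[3])
    # >>> tm.compute_q_grams('abc def')
    # ['q:abc', 'q:bc ', 'q:c d', 'q: de', 'q:def']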

    def compute_q_grams_words(self, textlist):
        """
        >>> from microtc import TextModel
        >>> tm = TextModel(token_list=[3])
        >>> tm.compute_q_grams_words(['abc', 'def'])
        ['q:~ab', 'q:abc', 'q:bc~', 'q:~de', 'q:def', 'q:ef~']
        """
        output = []
        # pad each word so q-grams capture its boundaries
        textlist = ['~' + x + '~' for x in textlist]
        for qsize in self.q_grams:
            min_len = qsize - 1
            extra = [x for x in textlist if len(x) >= min_len]
            qgrams = [["".join(chars) for chars in zip(*[text[i:] for i in range(qsize)])]
                      for text in extra]
            for tokens in qgrams:
                output.extend("q:" + x for x in tokens)
        return output

    def compute_tokens(self, text):
        """
        Compute tokens from a text using q-grams of characters and words, and skip-grams.

        :param text: Text transformed by :py:func:`microtc.textmodel.TextModel.text_transformations`.
        :type text: str

        :rtype: list

        Example:

        >>> from microtc.textmodel import TextModel
        >>> tm = TextModel(token_list=[-2, -1])
        >>> tm.compute_tokens("~Good morning~")
        [['Good~morning', 'Good', 'morning'], [], []]
        >>> tm = TextModel(token_list=[3])
        >>> tm.compute_tokens('abc def')
        [[], [], ['q:abc', 'q:bc ', 'q:c d', 'q: de', 'q:def']]
        >>> tm = TextModel(token_list=[(2, 1)])
        >>> tm.compute_tokens('~abc x de~')
        [[], ['abc~de'], []]
        >>> tm = TextModel(token_list=[3], q_grams_words=True)
        >>> tm.compute_tokens('~abc def~')
        [[], [], ['q:~ab', 'q:abc', 'q:bc~', 'q:~de', 'q:def', 'q:ef~']]
        """
        L = []
        textlist = self.get_word_list(text)
        L.append(self.compute_n_grams(textlist))
        L.append(self.compute_skip_grams(textlist))
        if self.q_grams_words:
            L.append(self.compute_q_grams_words(textlist))
        else:
            L.append(self.compute_q_grams(text))
        return L
693
    def select_tokens(self, L):
3✔
694
        """
695
        Filter tokens using suffix or connections
696

697
        :param L: list of tokens
698
        :type L: list
699

700
        :rtype: list
701
        """
702

703
        if self.select_suff:
3✔
704
            L = [tok for tok in L if tok[-1] in SKIP_SYMBOLS_AND_SPACES]
×
705
            
706
        if self.select_conn:
3✔
707
            L = [tok for tok in L if '~' in tok and tok[0] != '~' and tok[-1] != '~']
×
708
        return L
3✔
709

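
    # Illustrative example of the connector filter above (derived from the
    # condition on ~):
    #
    # >>> tm = TextModel(select_conn=True)
    # >>> tm.select_tokens(['buenos~dias', '~buenos', 'dias~'])
    # ['buenos~dias']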

    def _tokenize(self, text):
        """Tokenize a single text; returns ['~'] when no token survives the filters."""
        text = self.text_transformations(text)
        L = []
        for _ in self.compute_tokens(text):
            L += _
        L = self.select_tokens(L)
        if len(L) == 0:
            L = ['~']

        return L

    @property
    def num_terms(self):
        """Dimension, i.e., the number of terms in the corpus

        >>> from microtc.textmodel import TextModel
        >>> corpus = ['buenos dias', 'catedras conacyt', 'categorizacion de texto ingeotec']
        >>> textmodel = TextModel().fit(corpus)
        >>> _ = textmodel.transform(corpus)
        >>> textmodel.num_terms
        8

        :rtype: int
        """

        return self.model.num_terms

    @property
    def token_weight(self):
        """
        Weight associated to each token id

        >>> from microtc.textmodel import TextModel
        >>> corpus = ['buenos dias', 'catedras conacyt', 'categorizacion de texto ingeotec']
        >>> textmodel = TextModel().fit(corpus)
        >>> _ = textmodel.transform(corpus)
        >>> textmodel.token_weight[5]
        1.584962500721156
        """
        return self.model.wordWeight

    @property
    def id2token(self):
        """
        Token identifier to token

        >>> from microtc.textmodel import TextModel
        >>> corpus = ['buenos dias', 'catedras de conacyt', 'categorizacion de texto ingeotec']
        >>> textmodel = TextModel().fit(corpus)
        >>> _ = textmodel.transform(corpus)
        >>> textmodel.id2token[0]
        'buenos'
        """
        try:
            return self._id2token
        except AttributeError:
            self._id2token = {v: k for k, v in self.token2id.items()}
            return self._id2token

    @property
    def token2id(self):
        """
        Token to token identifier

        >>> from microtc.textmodel import TextModel
        >>> corpus = ['buenos dias', 'catedras de conacyt', 'categorizacion de texto ingeotec']
        >>> textmodel = TextModel().fit(corpus)
        >>> _ = textmodel.transform(corpus)
        >>> textmodel.token2id['de']
        4
        """
        return self.model.word2id