• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

neuml / paperai / 14577956040

21 Apr 2025 05:24PM UTC coverage: 97.792% (+0.003%) from 97.789%
14577956040

push

github

davidmezzetti
Update build script

753 of 770 relevant lines covered (97.79%)

0.98 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

96.08
/src/python/paperai/vectors.py
1
"""
2
Vectors module
3
"""
4

5
import os
1✔
6
import os.path
1✔
7
import sqlite3
1✔
8
import sys
1✔
9
import tempfile
1✔
10

11
from staticvectors import StaticVectorsTrainer
1✔
12
from txtai.pipeline import Tokenizer
1✔
13

14

15
class RowIterator:
    """
    Iterates over rows in a database query. Allows for multiple iterations.
    """

    def __init__(self, dbfile):
        """
        Initializes RowIterator.

        Args:
            dbfile: path to SQLite file
        """

        # Store database file
        self.dbfile = dbfile

        # Current generator over parsed token lists
        self.rows = self.stream(self.dbfile)

    def __iter__(self):
        """
        Creates a database query generator.

        Returns:
            self, backed by a fresh generator so the data can be iterated multiple times
        """

        # Reset the generator - each pass re-reads the database from the start
        self.rows = self.stream(self.dbfile)
        return self

    def __next__(self):
        """
        Gets the next result in the current generator.

        Returns:
            tokens

        Raises:
            StopIteration: when the underlying generator is exhausted
        """

        # stream() only yields non-empty token lists, so exhaustion is signaled
        # by next() raising StopIteration directly - no sentinel check needed
        return next(self.rows)

    def stream(self, dbfile):
        """
        Connects to SQLite file at dbfile and yields parsed tokens for each row.

        Args:
            dbfile: path to SQLite file containing a "sections" table
        """

        # Connection to database file
        db = sqlite3.connect(dbfile)

        try:
            cur = db.cursor()
            cur.execute("SELECT Text FROM sections")

            count = 0
            for section in cur:
                # Tokenize text
                tokens = Tokenizer.tokenize(section[0])

                count += 1
                if count % 1000 == 0:
                    print(f"Streamed {count} documents", end="\r")

                # Skip documents with no tokens parsed
                if tokens:
                    yield tokens

            print(f"Iterated over {count} total rows")
        finally:
            # Free database resources - finally ensures the connection is also
            # closed when the generator is abandoned early (GeneratorExit)
            db.close()
90

91

92
class Vectors:
    """
    Methods to build a FastText model.
    """

    @staticmethod
    def tokens(dbfile):
        """
        Iterates over each row in dbfile and writes parsed tokens to a temporary file for processing.

        Args:
            dbfile: SQLite file to read

        Returns:
            path to output file
        """

        # Write one space-delimited line of tokens per document to a temp
        # working file, kept on disk (delete=False) for downstream training
        with tempfile.NamedTemporaryFile(mode="w", suffix=".txt", delete=False) as output:
            # Save file path before the handle closes
            path = output.name

            for tokens in RowIterator(dbfile):
                output.write(" ".join(tokens) + "\n")

        return path

    @staticmethod
    def run(path, size, mincount, output):
        """
        Builds a word vector model.

        Args:
            path: model path
            size: dimensions for fastText model
            mincount: minimum number of times a token must appear in input
            output: output file path
        """

        # Derive path to dbfile
        dbfile = os.path.join(path, "articles.sqlite")

        # Stream tokens to temporary file
        tokens = Vectors.tokens(dbfile)

        # Build staticvectors model - use SQLite storage for backwards compatibility
        trainer = StaticVectorsTrainer()
        trainer(tokens, size=size, mincount=mincount, path=output)

        # Remove temporary tokens file
        os.remove(tokens)
145

146

147
if __name__ == "__main__":
    # Validate arguments up front: the original fallback passed None, which
    # surfaced as an opaque TypeError inside os.path.join
    if len(sys.argv) < 3:
        print("Usage: vectors.py <input directory> <output model path>", file=sys.stderr)
        sys.exit(1)

    # Create vector model: 300 dimensions, minimum token count of 4
    Vectors.run(sys.argv[1], 300, 4, sys.argv[2])
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2026 Coveralls, Inc