• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

neuml / paperai / 14577956040

21 Apr 2025 05:24PM UTC coverage: 97.792% (+0.003%) from 97.789%
14577956040

push

github

davidmezzetti
Update build script

753 of 770 relevant lines covered (97.79%)

0.98 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

96.08
/src/python/paperai/vectors.py
1
"""
2
Vectors module
3
"""
4

5
import os
1✔
6
import os.path
1✔
7
import sqlite3
1✔
8
import sys
1✔
9
import tempfile
1✔
10

11
from staticvectors import StaticVectorsTrainer
1✔
12
from txtai.pipeline import Tokenizer
1✔
13

14

15
class RowIterator:
    """
    Iterates over rows in a database query. Allows for multiple iterations.
    """

    def __init__(self, dbfile):
        """
        Initializes RowIterator.

        Args:
            dbfile: path to SQLite file
        """

        # Store database file
        self.dbfile = dbfile

        # Current generator over parsed token lists
        self.rows = self.stream(self.dbfile)

    def __iter__(self):
        """
        Creates a database query generator.

        Returns:
            self, backed by a fresh generator so the data can be iterated multiple times
        """

        # Reset the generator - each pass re-reads the database from the start
        self.rows = self.stream(self.dbfile)
        return self

    def __next__(self):
        """
        Gets the next result in the current generator.

        Returns:
            tokens

        Raises:
            StopIteration: when the underlying generator is exhausted
        """

        # stream() only yields non-empty token lists, so exhaustion is signaled
        # by next() raising StopIteration directly - no sentinel check needed
        return next(self.rows)

    def stream(self, dbfile):
        """
        Connects to SQLite file at dbfile and yields parsed tokens for each row.

        Args:
            dbfile: path to SQLite file containing a "sections" table
        """

        # Connection to database file
        db = sqlite3.connect(dbfile)

        try:
            cur = db.cursor()
            cur.execute("SELECT Text FROM sections")

            count = 0
            for section in cur:
                # Tokenize text
                tokens = Tokenizer.tokenize(section[0])

                count += 1
                if count % 1000 == 0:
                    print(f"Streamed {count} documents", end="\r")

                # Skip documents with no tokens parsed
                if tokens:
                    yield tokens

            print(f"Iterated over {count} total rows")
        finally:
            # Free database resources - finally ensures the connection is also
            # closed when the generator is abandoned early (GeneratorExit)
            db.close()
90

91

92
class Vectors:
    """
    Methods to build a FastText model.
    """

    @staticmethod
    def tokens(dbfile):
        """
        Iterates over each row in dbfile and writes parsed tokens to a temporary file for processing.

        Args:
            dbfile: SQLite file to read

        Returns:
            path to output file
        """

        # Write one space-delimited line of tokens per document to a temp
        # working file, kept on disk (delete=False) for downstream training
        with tempfile.NamedTemporaryFile(mode="w", suffix=".txt", delete=False) as output:
            # Save file path before the handle closes
            path = output.name

            for tokens in RowIterator(dbfile):
                output.write(" ".join(tokens) + "\n")

        return path

    @staticmethod
    def run(path, size, mincount, output):
        """
        Builds a word vector model.

        Args:
            path: model path
            size: dimensions for fastText model
            mincount: minimum number of times a token must appear in input
            output: output file path
        """

        # Derive path to dbfile
        dbfile = os.path.join(path, "articles.sqlite")

        # Stream tokens to temporary file
        tokens = Vectors.tokens(dbfile)

        # Build staticvectors model - use SQLite storage for backwards compatibility
        trainer = StaticVectorsTrainer()
        trainer(tokens, size=size, mincount=mincount, path=output)

        # Remove temporary tokens file
        os.remove(tokens)
145

146

147
if __name__ == "__main__":
    # Validate arguments up front: the original fallback passed None, which
    # surfaced as an opaque TypeError inside os.path.join
    if len(sys.argv) < 3:
        print("Usage: vectors.py <input directory> <output model path>", file=sys.stderr)
        sys.exit(1)

    # Create vector model: 300 dimensions, minimum token count of 4
    Vectors.run(sys.argv[1], 300, 4, sys.argv[2])
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2026 Coveralls, Inc