11062719915

Committed 27 Sep 2024 01:06AM UTC coverage: 99.946%. Remained the same

Build # 11062719915

Build Type

push

github

Committed by

davidmezzetti

Commit Message

Update documentation

Run Details

7406 of 7410 relevant lines covered (99.95%)

1.0 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

100.0

/src/python/txtai/ann/numpy.py

"""
NumPy module
"""

import numpy as np

from ..serialize import SerializeFactory

from .base import ANN


class NumPy(ANN):
    """
    Builds an ANN index backed by a NumPy array.

    Searches are exhaustive: each query is scored against every stored row, using a dot product or,
    when an integer "quantize" setting is configured, a hamming-based score.

    Array operations are bound as instance attributes (self.all, self.cat, ...) and array conversions
    go through the tensor, numpy and totype methods, so array library specifics live in one place.
    """

    def __init__(self, config):
        """
        Creates a new NumPy index.

        Args:
            config: index configuration, the optional "quantize" setting is read here
        """

        super().__init__(config)

        # Array function definitions
        self.all, self.cat, self.dot, self.zeros = np.all, np.concatenate, np.dot, np.zeros
        self.argsort, self.xor = np.argsort, np.bitwise_xor

        # Scalar quantization
        # Only a non-zero integer enables hamming scoring. bool is a subclass of int, so it is excluded explicitly.
        quantize = self.config.get("quantize")
        self.qbits = quantize if quantize and isinstance(quantize, int) and not isinstance(quantize, bool) else None

    def load(self, path):
        """
        Loads a stored array from path and sets it as the backend.

        Args:
            path: path to array file
        """

        # Load array from file
        try:
            self.backend = self.tensor(np.load(path, allow_pickle=False))
        except ValueError:
            # Backwards compatible support for previously pickled data
            # NOTE(review): this fallback goes through the pickle serializer, only load index files from trusted sources
            self.backend = self.tensor(SerializeFactory.create("pickle").load(path))

    def index(self, embeddings):
        """
        Builds a new index, replacing any existing backend array.

        Args:
            embeddings: embeddings array, the number of rows becomes the id offset
        """

        # Create index
        self.backend = self.tensor(embeddings)

        # Add id offset and index build metadata
        self.config["offset"] = embeddings.shape[0]
        self.metadata(self.settings())

    def append(self, embeddings):
        """
        Appends rows to the end of the existing backend array.

        Args:
            embeddings: embeddings array to add
        """

        # Append new data to array
        self.backend = self.cat((self.backend, self.tensor(embeddings)), axis=0)

        # Update id offset and index metadata
        self.config["offset"] += embeddings.shape[0]
        self.metadata()

    def delete(self, ids):
        """
        Deletes rows by setting them to all zeros. The array is not resized.

        Ids that do not address an existing row are ignored.

        Args:
            ids: row ids to clear
        """

        # Filter any index outside of the array. Negative ids are dropped as well, array indexing would
        # otherwise wrap them around and clear an unrelated row (or fail when below -size).
        rows = self.backend.shape[0]
        ids = [x for x in ids if 0 <= x < rows]

        # Clear specified ids
        self.backend[ids] = self.tensor(self.zeros((len(ids), self.backend.shape[1])))

    def search(self, queries, limit):
        """
        Scores each query against all rows and returns the best matches.

        Args:
            queries: queries array, one row per query
            limit: maximum number of results per query

        Returns:
            list with one entry per query, each entry is a list of (id, score) ordered by descending score
        """

        if self.qbits:
            # Calculate hamming score for integer vectors
            scores = self.hammingscore(queries)
        else:
            # Dot product on normalized vectors is equal to cosine similarity
            scores = self.dot(self.tensor(queries), self.backend.T)

        # Get topn ids. Scores are negated so the ascending sort puts the highest scores first.
        ids = self.argsort(-scores)[:, :limit]

        # Map results to [(id, score)]
        results = []
        for x, score in enumerate(scores):
            # Add results
            results.append(list(zip(ids[x].tolist(), score[ids[x]].tolist())))

        return results

    def count(self):
        """
        Counts the rows that have at least one non-zero value. Rows cleared by delete are all zeros
        and therefore not counted.

        Returns:
            number of non-zero rows
        """

        # Get count of non-zero rows (ignores deleted rows)
        return self.backend[~self.all(self.backend == 0, axis=1)].shape[0]

    def save(self, path):
        """
        Saves the backend array to path.

        Args:
            path: output file path
        """

        # Save array to file. Use stream to prevent ".npy" suffix being added.
        with open(path, "wb") as handle:
            np.save(handle, self.numpy(self.backend), allow_pickle=False)

    def tensor(self, array):
        """
        Handles backend-specific code such as loading to a GPU device.

        This implementation returns the input unchanged.

        Args:
            array: data array

        Returns:
            array with backend-specific logic applied
        """

        return array

    def numpy(self, array):
        """
        Handles backend-specific code to convert an array to numpy

        This implementation returns the input unchanged.

        Args:
            array: data array

        Returns:
            numpy array
        """

        return array

    def totype(self, array, dtype):
        """
        Casts array to dtype.

        Only np.int64 is handled, the array is returned unchanged for any other dtype.

        Args:
            array: input array
            dtype: dtype

        Returns:
            array cast as dtype
        """

        return np.int64(array) if dtype == np.int64 else array

    def settings(self):
        """
        Returns settings for this array.

        Returns:
            dict with the NumPy version under the "numpy" key
        """

        return {"numpy": np.__version__}

    def hammingscore(self, queries):
        """
        Calculates a hamming distance score.

        This is defined as:

            score = 1.0 - (hamming distance / total number of bits)

        Args:
            queries: queries array

        Returns:
            scores
        """

        # Build table of number of bits for each distinct uint8 value
        # table[v] is the count of set bits in v, found by masking v with each of the 8 single-bit values
        table = 1 << np.arange(8)
        table = self.tensor(np.array([np.count_nonzero(x & table) for x in np.arange(256)]))

        # Number of different bits
        # The added query axis broadcasts every query against every row of the backend
        # NOTE(review): the 256 entry table assumes both arrays hold byte values (0-255) - confirm against caller
        delta = self.xor(self.tensor(queries[:, None]), self.backend)

        # Cast to long array
        delta = self.totype(delta, np.int64)

        # Calculate score as 1.0 - percentage of different bits
        return 1.0 - (table[delta].sum(axis=2) / (self.config["dimensions"] * 8))

1	"""
2	NumPy module
3	"""
4
5	import numpy as np	1✔
6
7	from ..serialize import SerializeFactory	1✔
8
9	from .base import ANN	1✔
10
11
12	class NumPy(ANN):	1✔
13	"""
14	Builds an ANN index backed by a NumPy array.
15	"""
16
17	def __init__(self, config):	1✔
18	super().__init__(config)	1✔
19
20	# Array function definitions
21	self.all, self.cat, self.dot, self.zeros = np.all, np.concatenate, np.dot, np.zeros	1✔
22	self.argsort, self.xor = np.argsort, np.bitwise_xor	1✔
23
24	# Scalar quantization
25	quantize = self.config.get("quantize")	1✔
26	self.qbits = quantize if quantize and isinstance(quantize, int) and not isinstance(quantize, bool) else None	1✔
27
28	def load(self, path):	1✔
29	# Load array from file
30	try:	1✔
31	self.backend = self.tensor(np.load(path, allow_pickle=False))	1✔
32	except ValueError:	1✔
33	# Backwards compatible support for previously pickled data
34	self.backend = self.tensor(SerializeFactory.create("pickle").load(path))	1✔
35
36	def index(self, embeddings):	1✔
37	# Create index
38	self.backend = self.tensor(embeddings)	1✔
39
40	# Add id offset and index build metadata
41	self.config["offset"] = embeddings.shape[0]	1✔
42	self.metadata(self.settings())	1✔
43
44	def append(self, embeddings):	1✔
45	# Append new data to array
46	self.backend = self.cat((self.backend, self.tensor(embeddings)), axis=0)	1✔
47
48	# Update id offset and index metadata
49	self.config["offset"] += embeddings.shape[0]	1✔
50	self.metadata()	1✔
51
52	def delete(self, ids):	1✔
53	# Filter any index greater than size of array
54	ids = [x for x in ids if x < self.backend.shape[0]]	1✔
55
56	# Clear specified ids
57	self.backend[ids] = self.tensor(self.zeros((len(ids), self.backend.shape[1])))	1✔
58
59	def search(self, queries, limit):	1✔
60	if self.qbits:	1✔
61	# Calculate hamming score for integer vectors
62	scores = self.hammingscore(queries)	1✔
63	else:
64	# Dot product on normalized vectors is equal to cosine similarity
65	scores = self.dot(self.tensor(queries), self.backend.T)	1✔
66
67	# Get topn ids
68	ids = self.argsort(-scores)[:, :limit]	1✔
69
70	# Map results to [(id, score)]
71	results = []	1✔
72	for x, score in enumerate(scores):	1✔
73	# Add results
74	results.append(list(zip(ids[x].tolist(), score[ids[x]].tolist())))	1✔
75
76	return results	1✔
77
78	def count(self):	1✔
79	# Get count of non-zero rows (ignores deleted rows)
80	return self.backend[~self.all(self.backend == 0, axis=1)].shape[0]	1✔
81
82	def save(self, path):	1✔
83	# Save array to file. Use stream to prevent ".npy" suffix being added.
84	with open(path, "wb") as handle:	1✔
85	np.save(handle, self.numpy(self.backend), allow_pickle=False)	1✔
86
87	def tensor(self, array):	1✔
88	"""
89	Handles backend-specific code such as loading to a GPU device.
90
91	Args:
92	array: data array
93
94	Returns:
95	array with backend-specific logic applied
96	"""
97
98	return array	1✔
99
100	def numpy(self, array):	1✔
101	"""
102	Handles backend-specific code to convert an array to numpy
103
104	Args:
105	array: data array
106
107	Returns:
108	numpy array
109	"""
110
111	return array	1✔
112
113	def totype(self, array, dtype):	1✔
114	"""
115	Casts array to dtype.
116
117	Args:
118	array: input array
119	dtype: dtype
120
121	Returns:
122	array cast as dtype
123	"""
124
125	return np.int64(array) if dtype == np.int64 else array	1✔
126
127	def settings(self):	1✔
128	"""
129	Returns settings for this array.
130
131	Returns:
132	dict
133	"""
134
135	return {"numpy": np.__version__}	1✔
136
137	def hammingscore(self, queries):	1✔
138	"""
139	Calculates a hamming distance score.
140
141	This is defined as:
142
143	score = 1.0 - (hamming distance / total number of bits)
144
145	Args:
146	queries: queries array
147
148	Returns:
149	scores
150	"""
151
152	# Build table of number of bits for each distinct uint8 value
153	table = 1 << np.arange(8)	1✔
154	table = self.tensor(np.array([np.count_nonzero(x & table) for x in np.arange(256)]))	1✔
155
156	# Number of different bits
157	delta = self.xor(self.tensor(queries[:, None]), self.backend)	1✔
158
159	# Cast to long array
160	delta = self.totype(delta, np.int64)	1✔
161
162	# Calculate score as 1.0 - percentage of different bits
163	return 1.0 - (table[delta].sum(axis=2) / (self.config["dimensions"] * 8))	1✔

neuml / txtai / 11062719915

Source File Press 'n' to go to next uncovered line, 'b' for previous

Source File
Press 'n' to go to next uncovered line, 'b' for previous