• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

neuml / txtai / 11062719915

27 Sep 2024 01:06AM CUT coverage: 99.946%. Remained the same
11062719915

push

github

davidmezzetti
Update documentation

7406 of 7410 relevant lines covered (99.95%)

1.0 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

100.0
/src/python/txtai/ann/numpy.py
1
"""
2
NumPy module
3
"""
4

5
import numpy as np
1✔
6

7
from ..serialize import SerializeFactory
1✔
8

9
from .base import ANN
1✔
10

11

12
class NumPy(ANN):
    """
    Builds an ANN index backed by a NumPy array.

    All array operations are routed through the function references set in __init__ and through the
    tensor/numpy/totype hooks, so a subclass can swap in a different array backend by overriding them.
    """

    # Number of set bits for each distinct uint8 value (0-255). Built once at class creation
    # instead of on every hamming score calculation. This array is shared and must not be modified.
    _BITCOUNTS = np.array([bin(x).count("1") for x in range(256)])

    def __init__(self, config):
        """
        Creates a new NumPy index.

        Args:
            config: index configuration, the optional "quantize" setting enables hamming scoring when
                    it is a non-zero integer (booleans are ignored)
        """

        super().__init__(config)

        # Array function definitions
        self.all, self.cat, self.dot, self.zeros = np.all, np.concatenate, np.dot, np.zeros
        self.argsort, self.xor = np.argsort, np.bitwise_xor

        # Scalar quantization
        quantize = self.config.get("quantize")
        self.qbits = quantize if quantize and isinstance(quantize, int) and not isinstance(quantize, bool) else None

    def load(self, path):
        """
        Loads a previously saved array from path and sets it as the backend.

        Args:
            path: path to saved array
        """

        # Load array from file
        try:
            self.backend = self.tensor(np.load(path, allow_pickle=False))
        except ValueError:
            # Backwards compatible support for previously pickled data
            # NOTE(review): unpickling can execute arbitrary code, only load index files from trusted sources
            self.backend = self.tensor(SerializeFactory.create("pickle").load(path))

    def index(self, embeddings):
        """
        Builds a new index, replacing any existing backend array.

        Args:
            embeddings: embeddings array, one row per indexed item
        """

        # Create index
        self.backend = self.tensor(embeddings)

        # Add id offset and index build metadata
        self.config["offset"] = embeddings.shape[0]
        self.metadata(self.settings())

    def append(self, embeddings):
        """
        Appends rows to the end of the existing backend array.

        Args:
            embeddings: embeddings array, one row per new item
        """

        # Append new data to array
        self.backend = self.cat((self.backend, self.tensor(embeddings)), axis=0)

        # Update id offset and index metadata
        self.config["offset"] += embeddings.shape[0]
        self.metadata()

    def delete(self, ids):
        """
        Deletes rows by overwriting them with zeros. Rows are not removed, which keeps the positions of
        all other rows stable. Zeroed rows are excluded by count().

        Args:
            ids: row positions to delete, positions outside of the array are ignored
        """

        # Filter any index outside of the array. Negative ids are also dropped, otherwise they would
        # wrap around and clear a row counted from the end of the array.
        size = self.backend.shape[0]
        ids = [x for x in ids if 0 <= x < size]

        # Clear specified ids
        self.backend[ids] = self.tensor(self.zeros((len(ids), self.backend.shape[1])))

    def search(self, queries, limit):
        """
        Scores each query against every row and returns the top matches.

        Args:
            queries: queries array, one row per query
            limit: maximum number of results per query

        Returns:
            list with one entry per query, each entry is a list of (id, score) sorted by descending score
        """

        if self.qbits:
            # Calculate hamming score for integer vectors
            scores = self.hammingscore(queries)
        else:
            # Dot product on normalized vectors is equal to cosine similarity
            scores = self.dot(self.tensor(queries), self.backend.T)

        # Get topn ids
        ids = self.argsort(-scores)[:, :limit]

        # Map results to [(id, score)]
        return [list(zip(ids[x].tolist(), score[ids[x]].tolist())) for x, score in enumerate(scores)]

    def count(self):
        """
        Counts the number of rows that have at least one non-zero value.

        Returns:
            number of non-zero rows
        """

        # Get count of non-zero rows (ignores deleted rows). Counting the mask avoids copying the array.
        deleted = self.all(self.backend == 0, axis=1)
        return int((~deleted).sum())

    def save(self, path):
        """
        Saves the backend array to path.

        Args:
            path: output path
        """

        # Save array to file. Use stream to prevent ".npy" suffix being added.
        with open(path, "wb") as handle:
            np.save(handle, self.numpy(self.backend), allow_pickle=False)

    def tensor(self, array):
        """
        Handles backend-specific code such as loading to a GPU device.

        Args:
            array: data array

        Returns:
            array with backend-specific logic applied
        """

        return array

    def numpy(self, array):
        """
        Handles backend-specific code to convert an array to numpy

        Args:
            array: data array

        Returns:
            numpy array
        """

        return array

    def totype(self, array, dtype):
        """
        Casts array to dtype. Only np.int64 is handled, any other dtype returns array unchanged.

        Args:
            array: input array
            dtype: dtype

        Returns:
            array cast as dtype
        """

        return np.int64(array) if dtype == np.int64 else array

    def settings(self):
        """
        Returns settings for this array.

        Returns:
            dict
        """

        return {"numpy": np.__version__}

    def hammingscore(self, queries):
        """
        Calculates a hamming distance score.

        This is defined as:

            score = 1.0 - (hamming distance / total number of bits)

        Args:
            queries: queries array

        Returns:
            scores
        """

        # Table of number of bits for each distinct uint8 value
        table = self.tensor(self._BITCOUNTS)

        # Number of different bits. The added axis broadcasts each query against every backend row.
        delta = self.xor(self.tensor(queries[:, None]), self.backend)

        # Cast to long array
        delta = self.totype(delta, np.int64)

        # Calculate score as 1.0 - percentage of different bits
        # NOTE(review): assumes config["dimensions"] is the number of uint8 values per row - confirm
        return 1.0 - (table[delta].sum(axis=2) / (self.config["dimensions"] * 8))
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2025 Coveralls, Inc