• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

neuml / txtai / 3790706974

pending completion
3790706974

push

github

davidmezzetti
Only normalize topn scores

7 of 7 new or added lines in 1 file covered. (100.0%)

4475 of 4479 relevant lines covered (99.91%)

1.0 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

100.0
/src/python/txtai/ann/faiss.py
1
"""
2
Faiss module
3
"""
4

5
import math
1✔
6

7
import numpy as np
1✔
8

9
from faiss import index_factory, IO_FLAG_MMAP, METRIC_INNER_PRODUCT, read_index, write_index
1✔
10

11
from .base import ANN
1✔
12

13

14
class Faiss(ANN):
    """
    ANN index implementation backed by the Faiss library.
    """

    def load(self, path):
        """
        Reads a Faiss index from path into this instance's backend.

        Args:
            path: path to a saved Faiss index
        """

        # Memory-map the index only when the mmap setting is exactly True
        flags = IO_FLAG_MMAP if self.setting("mmap") is True else 0
        self.backend = read_index(path, flags)

    def index(self, embeddings):
        """
        Builds a new Faiss index from embeddings.

        Args:
            embeddings: embeddings array, rows are used as the indexed vectors
        """

        rows = embeddings.shape[0]

        # Resolve the index factory string. Inner product is equal to cosine similarity on normalized vectors.
        components = self.configure(rows)
        self.backend = index_factory(embeddings.shape[1], components, METRIC_INNER_PRODUCT)

        # Train the index on the input embeddings
        self.backend.train(embeddings)

        # Each row's position in embeddings becomes its id
        identifiers = np.arange(rows, dtype=np.int64)
        self.backend.add_with_ids(embeddings, identifiers)

        # Store the next available id along with the index build metadata
        self.config["offset"] = rows
        self.metadata({"components": components})

    def append(self, embeddings):
        """
        Adds embeddings to an existing index.

        Args:
            embeddings: embeddings array with the rows to append
        """

        added = embeddings.shape[0]

        # New ids continue from the stored offset: offset, offset + 1, ...
        identifiers = np.arange(self.config["offset"], self.config["offset"] + added, dtype=np.int64)
        self.backend.add_with_ids(embeddings, identifiers)

        # Advance the offset past the appended rows and refresh index metadata
        self.config["offset"] += added
        self.metadata(None)

    def delete(self, ids):
        """
        Removes ids from the index.

        Args:
            ids: ids to remove
        """

        # Faiss is given the ids as an int64 array
        selected = np.array(ids, dtype=np.int64)
        self.backend.remove_ids(selected)

    def search(self, queries, limit):
        """
        Searches the index for each query.

        Args:
            queries: query vectors
            limit: maximum number of results per query

        Returns:
            list with one entry per query, each a list of (id, score) tuples
        """

        # Set the nprobe search parameter, then run the query
        self.backend.nprobe = self.nprobe()
        scores, ids = self.backend.search(queries, limit)

        # Pair up ids and scores row by row as [(id, score)]
        return [list(zip(row.tolist(), score.tolist())) for row, score in zip(ids, scores)]

    def count(self):
        """
        Gets the number of entries reported by the Faiss index.

        Returns:
            backend ntotal value
        """

        return self.backend.ntotal

    def save(self, path):
        """
        Writes the Faiss index to path.

        Args:
            path: output path
        """

        write_index(self.backend, path)

    def configure(self, count):
        """
        Resolves the Faiss components string used to build a new index.

        Args:
            count: initial number of embeddings rows

        Returns:
            user-specified or generated components setting
        """

        # A user-specified components setting always takes precedence
        custom = self.setting("components")
        if custom:
            return custom

        # Quantized storage uses SQ8, otherwise vectors are stored as Flat
        quantize = self.setting("quantize", self.config.get("quantize"))
        storage = "SQ8" if quantize else "Flat"

        # Small index, wrap the storage directly with an IDMap
        if count <= 5000:
            return f"IDMap,{storage}"

        # Larger index, use an IVF index with a cell count derived from the row count
        return f"IVF{self.cells(count)},{storage}"

    def cells(self, count):
        """
        Derives the IVF cell count for an index of the given size.

        Args:
            count: number of embeddings rows

        Returns:
            number of IVF cells
        """

        # Number of IVF cells is x = min(4 * sqrt(embeddings count), embeddings count / 39)
        # Faiss requires at least 39 * x data points
        target = round(4 * math.sqrt(count))
        ceiling = int(count / 39)

        return min(target, ceiling)

    def nprobe(self):
        """
        Reads the nprobe search parameter, deriving a default from the index size.

        Returns:
            nprobe setting
        """

        # Current number of entries in the index
        size = self.count()

        # Fixed default for small indexes, otherwise 1/16 of the derived IVF cell count
        if size <= 5000:
            default = 6
        else:
            default = round(self.cells(size) / 16)

        return self.setting("nprobe", default)
1✔
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2025 Coveralls, Inc