• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

deepset-ai / haystack / 17320654362

29 Aug 2025 09:54AM UTC coverage: 92.098% (-0.01%) from 92.108%
17320654362

Pull #9729

github

web-flow
Merge 193734226 into 95dafdc20
Pull Request #9729: feat(websearch): add exclude_subdomains parameter to SerperDevWebSearch

12925 of 14034 relevant lines covered (92.1%)

0.92 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

75.0
haystack/components/websearch/serper_dev.py
1
# SPDX-FileCopyrightText: 2022-present deepset GmbH <info@deepset.ai>
2
#
3
# SPDX-License-Identifier: Apache-2.0
4

5
import json
1✔
6
from typing import Any, Optional, Union
1✔
7
from urllib.parse import urlparse
1✔
8

9
import requests
1✔
10

11
from haystack import ComponentError, Document, component, default_from_dict, default_to_dict, logging
1✔
12
from haystack.utils import Secret, deserialize_secrets_inplace
1✔
13

14
# Module-level logger named after this module; `logging` here is the one imported from `haystack`.
logger = logging.getLogger(__name__)
1✔
15

16

17
# Endpoint that `SerperDevWebSearch.run` POSTs the search payload to.
SERPERDEV_BASE_URL = "https://google.serper.dev/search"
1✔
18

19

20
class SerperDevError(ComponentError):
    """Raised when a request to the SerperDev API fails for a reason other than a timeout."""
1✔
21

22

23
@component
class SerperDevWebSearch:
    """
    Uses [Serper](https://serper.dev/) to search the web for relevant documents.

    See the [Serper Dev website](https://serper.dev/) for more details.

    Usage example:
    ```python
    from haystack.components.websearch import SerperDevWebSearch
    from haystack.utils import Secret

    websearch = SerperDevWebSearch(top_k=10, api_key=Secret.from_token("test-api-key"))
    results = websearch.run(query="Who is the boyfriend of Olivia Wilde?")

    assert results["documents"]
    assert results["links"]

    # Example with domain filtering - exclude subdomains
    websearch_filtered = SerperDevWebSearch(
        top_k=10,
        allowed_domains=["example.com"],
        exclude_subdomains=True,  # Only results from example.com, not blog.example.com
        api_key=Secret.from_token("test-api-key")
    )
    results_filtered = websearch_filtered.run(query="search query")
    ```
    """

    def __init__(
        self,
        api_key: Secret = Secret.from_env_var("SERPERDEV_API_KEY"),
        top_k: Optional[int] = 10,
        allowed_domains: Optional[list[str]] = None,
        search_params: Optional[dict[str, Any]] = None,
        *,
        exclude_subdomains: bool = False,
    ):
        """
        Initialize the SerperDevWebSearch component.

        :param api_key: API key for the Serper API.
        :param top_k: Maximum number of documents and links to return. `None` disables the limit.
        :param allowed_domains: List of domains to limit the search to.
        :param search_params: Additional parameters passed to the Serper API.
            For example, you can set 'num' to 20 to increase the number of search results.
            See the [Serper website](https://serper.dev/) for more details.
        :param exclude_subdomains: Whether to exclude subdomains when filtering by allowed_domains.
            If True, only results from the exact domains in allowed_domains will be returned.
            If False, results from subdomains will also be included. Defaults to False.
        """
        self.api_key = api_key
        self.top_k = top_k
        self.allowed_domains = allowed_domains
        self.exclude_subdomains = exclude_subdomains
        self.search_params = search_params or {}

        # Ensure that the API key is resolved.
        _ = self.api_key.resolve_value()

    def to_dict(self) -> dict[str, Any]:
        """
        Serializes the component to a dictionary.

        :returns:
                Dictionary with serialized data.
        """
        return default_to_dict(
            self,
            top_k=self.top_k,
            allowed_domains=self.allowed_domains,
            exclude_subdomains=self.exclude_subdomains,
            search_params=self.search_params,
            api_key=self.api_key.to_dict(),
        )

    @classmethod
    def from_dict(cls, data: dict[str, Any]) -> "SerperDevWebSearch":
        """
        Deserializes the component from a dictionary.

        :param data:
                Dictionary to deserialize from. Its `"init_parameters"` entry is modified in place
                when the serialized `api_key` is turned back into a secret.
        :returns:
                Deserialized component.
        """
        deserialize_secrets_inplace(data["init_parameters"], keys=["api_key"])
        return default_from_dict(cls, data)

    def _is_domain_allowed(self, url: str) -> bool:
        """
        Check if a URL's host is allowed based on allowed_domains and exclude_subdomains settings.

        :param url: The URL to check.
        :returns: True if the host is allowed (or if no allowed_domains are configured), False otherwise.
        """
        if not self.allowed_domains:
            return True

        try:
            # `hostname` (unlike `netloc`) has userinfo and port stripped and is already lower-cased,
            # so "https://user@Example.com:443/" is compared as "example.com".
            host = urlparse(url).hostname or ""
        except (ValueError, TypeError, AttributeError):
            # If URL parsing fails, allow the result to be safe
            return True

        for allowed_domain in self.allowed_domains:
            allowed = allowed_domain.lower()
            if host == allowed:
                return True
            # Subdomains only count when they are not explicitly excluded.
            if not self.exclude_subdomains and host.endswith("." + allowed):
                return True

        return False

    def _answer_box_documents(self, json_result: dict[str, Any]) -> list[Document]:
        """
        Build the document for the direct answer ("answerBox") of the search response, if any.

        :param json_result: Parsed JSON body of the search response.
        :returns: A list with at most one document; empty when there is no usable, allowed answer box.
        """
        answer_dict = json_result.get("answerBox")
        if not answer_dict:
            return []

        highlighted_answers = answer_dict.get("snippetHighlighted")
        content = None
        # The highlighted snippet is read either as a non-empty list (first element) or as a plain string.
        if isinstance(highlighted_answers, list) and highlighted_answers:
            content = highlighted_answers[0]
        elif isinstance(highlighted_answers, str):
            content = highlighted_answers

        if not content:
            # Fall back to the first of these keys that is present.
            content = next((answer_dict[key] for key in ("snippet", "answer", "title") if key in answer_dict), None)

        link = answer_dict.get("link", "")
        if not content or not self._is_domain_allowed(link):
            return []
        return [Document(content=content, meta={"title": answer_dict.get("title", ""), "link": link})]

    def _people_also_ask_documents(self, json_result: dict[str, Any]) -> list[Document]:
        """
        Build documents for the related questions ("peopleAlsoAsk") of the search response.

        :param json_result: Parsed JSON body of the search response.
        :returns: One document per related question whose link passes the domain filter.
        """
        documents = []
        for result in json_result.get("peopleAlsoAsk") or []:
            if not self._is_domain_allowed(result.get("link", "")):
                continue
            title = result.get("title", "")
            documents.append(
                Document(
                    # Use the title as content when there is no snippet.
                    content=result.get("snippet") or title,
                    meta={"title": title, "link": result.get("link", None)},
                )
            )
        return documents

    @component.output_types(documents=list[Document], links=list[str])
    def run(self, query: str) -> dict[str, Union[list[Document], list[str]]]:
        """
        Use [Serper](https://serper.dev/) to search the web.

        :param query: Search query.
        :returns: A dictionary with the following keys:
            - "documents": List of documents returned by the search engine.
            - "links": List of links returned by the search engine.
        :raises SerperDevError: If an error occurs while querying the SerperDev API.
        :raises TimeoutError: If the request to the SerperDev API times out.
        """
        # Restrict the search itself with "site:a OR site:b " in front of the user query.
        query_prepend = ""
        if self.allowed_domains:
            query_prepend = " OR ".join(f"site:{domain}" for domain in self.allowed_domains) + " "

        payload = json.dumps(
            {"q": query_prepend + query, "gl": "us", "hl": "en", "autocorrect": True, **self.search_params}
        )
        headers = {"X-API-KEY": self.api_key.resolve_value(), "Content-Type": "application/json"}

        try:
            response = requests.post(SERPERDEV_BASE_URL, headers=headers, data=payload, timeout=30)
            response.raise_for_status()  # Will raise an HTTPError for bad responses
        except requests.Timeout as error:
            raise TimeoutError(f"Request to {self.__class__.__name__} timed out.") from error

        except requests.RequestException as e:
            raise SerperDevError(f"An error occurred while querying {self.__class__.__name__}. Error: {e}") from e

        # If we reached this point, it means the request was successful and we can proceed
        json_result = response.json()

        # Filter the organic results once; both the documents and the links are derived from this list.
        organic_results = [
            result for result in json_result.get("organic") or [] if self._is_domain_allowed(result.get("link", ""))
        ]

        # we get the snippet from the json result and put it in the content field of the document
        organic = [
            Document(meta={k: v for k, v in result.items() if k != "snippet"}, content=result.get("snippet"))
            for result in organic_results
        ]

        # answer box is what search engine shows as a direct answer to the query
        answer_box = self._answer_box_documents(json_result)

        # these are related questions that search engine shows
        people_also_ask = self._people_also_ask_documents(json_result)

        documents = answer_box + organic + people_also_ask

        # Results without a "link" entry cannot contribute a link.
        links = [result["link"] for result in organic_results if "link" in result]

        logger.debug(
            "Serper Dev returned {number_documents} documents for the query '{query}'",
            number_documents=len(documents),
            query=query,
        )
        return {"documents": documents[: self.top_k], "links": links[: self.top_k]}
1✔
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2025 Coveralls, Inc