17320654362

Committed 29 Aug 2025 09:54AM UTC coverage: 92.098% (-0.01%) from 92.108%

Build # 17320654362

Build Type

Pull #9729

github

Committed by

web-flow

Commit Message

Merge 193734226 into 95dafdc20

Pull Request #9729: feat(websearch): add exclude_subdomains parameter to SerperDevWebSearch

Run Details

12925 of 14034 relevant lines covered (92.1%)

0.92 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

75.0

haystack/components/websearch/serper_dev.py

# SPDX-FileCopyrightText: 2022-present deepset GmbH <info@deepset.ai>
#
# SPDX-License-Identifier: Apache-2.0

import json
from typing import Any, Optional, Union
from urllib.parse import urlparse

import requests

from haystack import ComponentError, Document, component, default_from_dict, default_to_dict, logging
from haystack.utils import Secret, deserialize_secrets_inplace

# Module-level logger, named after this module.
logger = logging.getLogger(__name__)


# Endpoint that `SerperDevWebSearch.run` sends its search requests to (HTTP POST).
SERPERDEV_BASE_URL = "https://google.serper.dev/search"


class SerperDevError(ComponentError):
    """Raised when a request to the SerperDev API fails."""


@component
class SerperDevWebSearch:
    """
    Uses [Serper](https://serper.dev/) to search the web for relevant documents.

    See the [Serper Dev website](https://serper.dev/) for more details.

    Usage example:
    ```python
    from haystack.components.websearch import SerperDevWebSearch
    from haystack.utils import Secret

    websearch = SerperDevWebSearch(top_k=10, api_key=Secret.from_token("test-api-key"))
    results = websearch.run(query="Who is the boyfriend of Olivia Wilde?")

    assert results["documents"]
    assert results["links"]

    # Example with domain filtering - exclude subdomains
    websearch_filtered = SerperDevWebSearch(
        top_k=10,
        allowed_domains=["example.com"],
        exclude_subdomains=True,  # Only results from example.com, not blog.example.com
        api_key=Secret.from_token("test-api-key")
    )
    results_filtered = websearch_filtered.run(query="search query")
    ```
    """

    def __init__(
        self,
        api_key: Secret = Secret.from_env_var("SERPERDEV_API_KEY"),
        top_k: Optional[int] = 10,
        allowed_domains: Optional[list[str]] = None,
        search_params: Optional[dict[str, Any]] = None,
        *,
        exclude_subdomains: bool = False,
    ):
        """
        Initialize the SerperDevWebSearch component.

        :param api_key: API key for the Serper API.
        :param top_k: Maximum number of documents (and links) to return. `None` disables the limit.
        :param allowed_domains: List of domains to limit the search to.
        :param search_params: Additional parameters passed to the Serper API.
            For example, you can set 'num' to 20 to increase the number of search results.
            See the [Serper website](https://serper.dev/) for more details.
        :param exclude_subdomains: Whether to exclude subdomains when filtering by allowed_domains.
            If True, only results from the exact domains in allowed_domains will be returned.
            If False, results from subdomains will also be included. Defaults to False.
        """
        self.api_key = api_key
        self.top_k = top_k
        self.allowed_domains = allowed_domains
        self.exclude_subdomains = exclude_subdomains
        self.search_params = search_params or {}

        # Resolve the key eagerly so that a missing/invalid secret surfaces at construction time.
        _ = self.api_key.resolve_value()

    def to_dict(self) -> dict[str, Any]:
        """
        Serializes the component to a dictionary.

        :returns:
                Dictionary with serialized data.
        """
        return default_to_dict(
            self,
            top_k=self.top_k,
            allowed_domains=self.allowed_domains,
            exclude_subdomains=self.exclude_subdomains,
            search_params=self.search_params,
            api_key=self.api_key.to_dict(),
        )

    @classmethod
    def from_dict(cls, data: dict[str, Any]) -> "SerperDevWebSearch":
        """
        Deserializes the component from a dictionary.

        :param data:
                Dictionary to deserialize from. Its "init_parameters" entry is updated in place:
                the serialized "api_key" is turned back into a secret.
        :returns:
                The deserialized component.
        """
        deserialize_secrets_inplace(data["init_parameters"], keys=["api_key"])
        return default_from_dict(cls, data)

    def _is_domain_allowed(self, url: str) -> bool:
        """
        Check if a URL's domain is allowed based on allowed_domains and exclude_subdomains settings.

        The comparison is done on the URL's host name only, so ports and credentials embedded in the
        URL (for example `https://example.com:443/page`) do not prevent a match.

        :param url: The URL to check.
        :returns: True if the domain is allowed, False otherwise. When no allowed_domains are configured
            every URL is allowed. A URL without a host (for example an empty or missing link) is never
            allowed once allowed_domains is set. A URL that cannot be parsed is allowed.
        """
        if not self.allowed_domains:
            return True

        # A missing link (e.g. JSON null) cannot be attributed to any allowed domain.
        if not isinstance(url, str):
            return False

        try:
            # `hostname` is already lower-cased and excludes port and userinfo.
            host = urlparse(url).hostname
        except ValueError:
            # Malformed URL (e.g. unbalanced IPv6 brackets): keep the result rather than drop it silently.
            logger.debug("Could not parse URL '{url}'; skipping the domain check for it.", url=url)
            return True

        if not host:
            return False

        for allowed_domain in self.allowed_domains:
            allowed = allowed_domain.lower()
            if host == allowed:
                return True
            # Subdomains only count when the caller did not ask for exact matches.
            if not self.exclude_subdomains and host.endswith("." + allowed):
                return True

        return False

    @component.output_types(documents=list[Document], links=list[str])
    def run(self, query: str) -> dict[str, Union[list[Document], list[str]]]:
        """
        Use [Serper](https://serper.dev/) to search the web.

        :param query: Search query.
        :returns: A dictionary with the following keys:
            - "documents": List of documents returned by the search engine.
            - "links": List of links returned by the search engine.
        :raises SerperDevError: If an error occurs while querying the SerperDev API.
        :raises TimeoutError: If the request to the SerperDev API times out.
        """
        # Restrict the search itself with `site:` operators; results are filtered again below because
        # the operator alone cannot express the exclude_subdomains setting.
        query_prepend = ""
        if self.allowed_domains:
            query_prepend = " OR ".join(f"site:{domain}" for domain in self.allowed_domains) + " "

        payload = json.dumps(
            {"q": query_prepend + query, "gl": "us", "hl": "en", "autocorrect": True, **self.search_params}
        )
        headers = {"X-API-KEY": self.api_key.resolve_value(), "Content-Type": "application/json"}

        try:
            response = requests.post(SERPERDEV_BASE_URL, headers=headers, data=payload, timeout=30)
            response.raise_for_status()  # Will raise an HTTPError for bad responses
        except requests.Timeout as error:
            raise TimeoutError(f"Request to {self.__class__.__name__} timed out.") from error
        except requests.RequestException as e:
            raise SerperDevError(f"An error occurred while querying {self.__class__.__name__}. Error: {e}") from e

        # If we reached this point, it means the request was successful and we can proceed
        json_result = response.json()

        # Apply the domain filter once and reuse it for both documents and links.
        organic_results = [
            result
            for result in json_result.get("organic") or []
            if self._is_domain_allowed(result.get("link", ""))
        ]

        # we get the snippet from the json result and put it in the content field of the document
        organic = [
            Document(meta={k: v for k, v in result.items() if k != "snippet"}, content=result.get("snippet"))
            for result in organic_results
        ]
        # Entries without a link still become documents but cannot contribute a link.
        links = [result["link"] for result in organic_results if result.get("link")]

        # answer box is what search engine shows as a direct answer to the query
        answer_box: list[Document] = []
        answer_dict = json_result.get("answerBox")
        if answer_dict:
            highlighted_answers = answer_dict.get("snippetHighlighted")
            answer_box_content = None
            if isinstance(highlighted_answers, list) and highlighted_answers:
                answer_box_content = highlighted_answers[0]
            elif isinstance(highlighted_answers, str):
                answer_box_content = highlighted_answers
            if not answer_box_content:
                # Fall back to the first field that actually carries text.
                for key in ("snippet", "answer", "title"):
                    if answer_dict.get(key):
                        answer_box_content = answer_dict[key]
                        break
            if answer_box_content and self._is_domain_allowed(answer_dict.get("link", "")):
                answer_box = [
                    Document(
                        content=answer_box_content,
                        meta={"title": answer_dict.get("title", ""), "link": answer_dict.get("link", "")},
                    )
                ]

        # these are related questions that search engine shows
        people_also_ask: list[Document] = []
        for result in json_result.get("peopleAlsoAsk") or []:
            if not self._is_domain_allowed(result.get("link", "")):
                continue
            title = result.get("title", "")
            people_also_ask.append(
                Document(
                    content=result["snippet"] if result.get("snippet") else title,
                    meta={"title": title, "link": result.get("link", None)},
                )
            )

        documents = answer_box + organic + people_also_ask

        logger.debug(
            "Serper Dev returned {number_documents} documents for the query '{query}'",
            number_documents=len(documents),
            query=query,
        )
        return {"documents": documents[: self.top_k], "links": links[: self.top_k]}

1	# SPDX-FileCopyrightText: 2022-present deepset GmbH <info@deepset.ai>
2	#
3	# SPDX-License-Identifier: Apache-2.0
4
5	import json	1✔
6	from typing import Any, Optional, Union	1✔
7	from urllib.parse import urlparse	1✔
8
9	import requests	1✔
10
11	from haystack import ComponentError, Document, component, default_from_dict, default_to_dict, logging	1✔
12	from haystack.utils import Secret, deserialize_secrets_inplace	1✔
13
14	logger = logging.getLogger(__name__)	1✔
15
16
17	SERPERDEV_BASE_URL = "https://google.serper.dev/search"	1✔
18
19
20	class SerperDevError(ComponentError): ...	1✔
21
22
23	@component	1✔
24	class SerperDevWebSearch:	1✔
25	"""
26	Uses [Serper](https://serper.dev/) to search the web for relevant documents.
27
28	See the [Serper Dev website](https://serper.dev/) for more details.
29
30	Usage example:
31	```python
32	from haystack.components.websearch import SerperDevWebSearch
33	from haystack.utils import Secret
34
35	websearch = SerperDevWebSearch(top_k=10, api_key=Secret.from_token("test-api-key"))
36	results = websearch.run(query="Who is the boyfriend of Olivia Wilde?")
37
38	assert results["documents"]
39	assert results["links"]
40
41	# Example with domain filtering - exclude subdomains
42	websearch_filtered = SerperDevWebSearch(
43	top_k=10,
44	allowed_domains=["example.com"],
45	exclude_subdomains=True, # Only results from example.com, not blog.example.com
46	api_key=Secret.from_token("test-api-key")
47	)
48	results_filtered = websearch_filtered.run(query="search query")
49	```
50	"""
51
52	def __init__(	1✔
53	self,
54	api_key: Secret = Secret.from_env_var("SERPERDEV_API_KEY"),
55	top_k: Optional[int] = 10,
56	allowed_domains: Optional[list[str]] = None,
57	search_params: Optional[dict[str, Any]] = None,
58	*,
59	exclude_subdomains: bool = False,
60	):
61	"""
62	Initialize the SerperDevWebSearch component.
63
64	:param api_key: API key for the Serper API.
65	:param top_k: Number of documents to return.
66	:param allowed_domains: List of domains to limit the search to.
67	:param exclude_subdomains: Whether to exclude subdomains when filtering by allowed_domains.
68	If True, only results from the exact domains in allowed_domains will be returned.
69	If False, results from subdomains will also be included. Defaults to False.
70	:param search_params: Additional parameters passed to the Serper API.
71	For example, you can set 'num' to 20 to increase the number of search results.
72	See the [Serper website](https://serper.dev/) for more details.
73	"""
74	self.api_key = api_key	1✔
75	self.top_k = top_k	1✔
76	self.allowed_domains = allowed_domains	1✔
77	self.exclude_subdomains = exclude_subdomains	1✔
78	self.search_params = search_params or {}	1✔
79
80	# Ensure that the API key is resolved.
81	_ = self.api_key.resolve_value()	1✔
82
83	def to_dict(self) -> dict[str, Any]:	1✔
84	"""
85	Serializes the component to a dictionary.
86
87	:returns:
88	Dictionary with serialized data.
89	"""
90	return default_to_dict(	1✔
91	self,
92	top_k=self.top_k,
93	allowed_domains=self.allowed_domains,
94	exclude_subdomains=self.exclude_subdomains,
95	search_params=self.search_params,
96	api_key=self.api_key.to_dict(),
97	)
98
99	@classmethod	1✔
100	def from_dict(cls, data: dict[str, Any]) -> "SerperDevWebSearch":	1✔
101	"""
102	Serializes the component to a dictionary.
103
104	:returns:
105	Dictionary with serialized data.
106	"""
107	deserialize_secrets_inplace(data["init_parameters"], keys=["api_key"])	1✔
108	return default_from_dict(cls, data)	1✔
109
110	def _is_domain_allowed(self, url: str) -> bool:	1✔
111	"""
112	Check if a URL's domain is allowed based on allowed_domains and exclude_subdomains settings.
113
114	:param url: The URL to check.
115	:returns: True if the domain is allowed, False otherwise.
116	"""
117	if not self.allowed_domains:	1✔
118	return True	1✔
119
120	try:	1✔
121	parsed = urlparse(url)	1✔
122	domain = parsed.netloc.lower()	1✔
123
124	for allowed_domain in self.allowed_domains:	1✔
125	allowed_domain = allowed_domain.lower()	1✔
126
127	if self.exclude_subdomains:	1✔
128	# Exact domain match only
129	if domain == allowed_domain:	1✔
130	return True	1✔
131	# Allow subdomains (current behavior)
132	elif domain == allowed_domain or domain.endswith("." + allowed_domain):	1✔
133	return True	1✔
134
135	return False	1✔
136	except Exception:	×
137	# If URL parsing fails, allow the result to be safe
138	return True	×
139
140	@component.output_types(documents=list[Document], links=list[str])	1✔
141	def run(self, query: str) -> dict[str, Union[list[Document], list[str]]]:	1✔
142	"""
143	Use [Serper](https://serper.dev/) to search the web.
144
145	:param query: Search query.
146	:returns: A dictionary with the following keys:
147	- "documents": List of documents returned by the search engine.
148	- "links": List of links returned by the search engine.
149	:raises SerperDevError: If an error occurs while querying the SerperDev API.
150	:raises TimeoutError: If the request to the SerperDev API times out.
151	"""
152	query_prepend = "OR ".join(f"site:{domain} " for domain in self.allowed_domains) if self.allowed_domains else ""	1✔
153
154	payload = json.dumps(	1✔
155	{"q": query_prepend + query, "gl": "us", "hl": "en", "autocorrect": True, **self.search_params}
156	)
157	headers = {"X-API-KEY": self.api_key.resolve_value(), "Content-Type": "application/json"}	1✔
158
159	try:	1✔
160	response = requests.post(SERPERDEV_BASE_URL, headers=headers, data=payload, timeout=30)	1✔
161	response.raise_for_status() # Will raise an HTTPError for bad responses	1✔
162	except requests.Timeout as error:	1✔
163	raise TimeoutError(f"Request to {self.__class__.__name__} timed out.") from error	1✔
164
165	except requests.RequestException as e:	1✔
166	raise SerperDevError(f"An error occurred while querying {self.__class__.__name__}. Error: {e}") from e	1✔
167
168	# If we reached this point, it means the request was successful and we can proceed
169	json_result = response.json()	1✔
170
171	# we get the snippet from the json result and put it in the content field of the document
172	organic = [	1✔
173	Document(meta={k: v for k, v in d.items() if k != "snippet"}, content=d.get("snippet"))
174	for d in json_result["organic"]
175	if self._is_domain_allowed(d.get("link", ""))
176	]
177
178	# answer box is what search engine shows as a direct answer to the query
179	answer_box = []	1✔
180	if "answerBox" in json_result:	1✔
181	answer_dict = json_result["answerBox"]	×
182	highlighted_answers = answer_dict.get("snippetHighlighted")	×
183	answer_box_content = None	×
184	# Check if highlighted_answers is a list and has at least one element
185	if isinstance(highlighted_answers, list) and len(highlighted_answers) > 0:	×
186	answer_box_content = highlighted_answers[0]	×
187	elif isinstance(highlighted_answers, str):	×
188	answer_box_content = highlighted_answers	×
189	if not answer_box_content:	×
190	for key in ["snippet", "answer", "title"]:	×
191	if key in answer_dict:	×
192	answer_box_content = answer_dict[key]	×
193	break	×
194	if answer_box_content and self._is_domain_allowed(answer_dict.get("link", "")):	×
195	answer_box = [	×
196	Document(
197	content=answer_box_content,
198	meta={"title": answer_dict.get("title", ""), "link": answer_dict.get("link", "")},
199	)
200	]
201
202	# these are related questions that search engine shows
203	people_also_ask = []	1✔
204	if "peopleAlsoAsk" in json_result:	1✔
205	for result in json_result["peopleAlsoAsk"]:	×
206	if self._is_domain_allowed(result.get("link", "")):	×
207	title = result.get("title", "")	×
208	people_also_ask.append(	×
209	Document(
210	content=result["snippet"] if result.get("snippet") else title,
211	meta={"title": title, "link": result.get("link", None)},
212	)
213	)
214
215	documents = answer_box + organic + people_also_ask	1✔
216
217	links = [result["link"] for result in json_result["organic"] if self._is_domain_allowed(result.get("link", ""))]	1✔
218
219	logger.debug(	1✔
220	"Serper Dev returned {number_documents} documents for the query '{query}'",
221	number_documents=len(documents),
222	query=query,
223	)
224	return {"documents": documents[: self.top_k], "links": links[: self.top_k]}	1✔

deepset-ai / haystack / 17320654362

Source File Press 'n' to go to next uncovered line, 'b' for previous

Source File
Press 'n' to go to next uncovered line, 'b' for previous