• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

SwissDataScienceCenter / renku-data-services / 19761684713

28 Nov 2025 10:52AM UTC coverage: 86.512%. First build
19761684713

Pull #1124

github

web-flow
Merge 620c95085 into b4b6751d7
Pull Request #1124: feat: report buildrun failures to sentry

1 of 11 new or added lines in 1 file covered. (9.09%)

23521 of 27188 relevant lines covered (86.51%)

1.52 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

29.3
/components/renku_data_services/session/k8s_client.py
1
"""An abstraction over the kr8s kubernetes client and the k8s-watcher."""
2

3
from collections.abc import AsyncIterable
2✔
4
from typing import TYPE_CHECKING
2✔
5

6
import httpx
2✔
7
import sentry_sdk
2✔
8
from kr8s import NotFoundError, ServerError
2✔
9
from kr8s.asyncio.objects import APIObject, Pod
2✔
10

11
from renku_data_services import errors
2✔
12
from renku_data_services.errors.errors import CannotStartBuildError
2✔
13
from renku_data_services.k8s.constants import ClusterId
2✔
14
from renku_data_services.k8s.models import GVK, K8sObjectFilter, K8sObjectMeta
2✔
15
from renku_data_services.notebooks.api.classes.k8s_client import DEFAULT_K8S_CLUSTER
2✔
16
from renku_data_services.notebooks.util.retries import retry_with_exponential_backoff_async
2✔
17
from renku_data_services.session import crs, models
2✔
18
from renku_data_services.session.constants import (
2✔
19
    BUILD_RUN_GVK,
20
    DUMMY_TASK_RUN_USER_ID,
21
    TASK_RUN_GVK,
22
)
23
from renku_data_services.session.crs import BuildRun, TaskRun
2✔
24

25
if TYPE_CHECKING:
2✔
26
    from renku_data_services.k8s.clients import K8sClusterClientsPool
×
27

28

29
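# NOTE: The kr8s library ships no type stubs (its maintainers claim pyright handles its type hints better).
# TODO(review): this note refers to a `type: ignore` that does not appear on the class below - confirm and drop it.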
30
class ShipwrightBuildRunV1Beta1Kr8s(APIObject):
    """kr8s resource definition of Shipwright BuildRuns, as used by the k8s client."""

    # API identity, both values come from the shared BuildRun GVK constant.
    version: str = BUILD_RUN_GVK.group_version
    kind: str = BUILD_RUN_GVK.kind

    # Names under which the resource is addressed.
    singular: str = "buildrun"
    plural: str = "buildruns"
    endpoint: str = "buildruns"

    # Resource traits.
    namespaced: bool = True
    scalable: bool = False
40

41

42
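# NOTE: The kr8s library ships no type stubs (its maintainers claim pyright handles its type hints better).
# TODO(review): this note refers to a `type: ignore` that does not appear on the class below - confirm and drop it.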
43
class TektonTaskRunV1Kr8s(APIObject):
    """kr8s resource definition of Tekton TaskRuns, as used by the k8s client."""

    # API identity, both values come from the shared TaskRun GVK constant.
    version: str = TASK_RUN_GVK.group_version
    kind: str = TASK_RUN_GVK.kind

    # Names under which the resource is addressed.
    singular: str = "taskrun"
    plural: str = "taskruns"
    endpoint: str = "taskruns"

    # Resource traits.
    namespaced: bool = True
    scalable: bool = False
53

54

55
class ShipwrightClient:
2✔
56
    """The K8s client that combines a base client and a cache.
57

58
    No authentication or authorization is performed - this is the responsibility of the caller.
59
    """
60

61
    def __init__(self, client: "K8sClusterClientsPool", namespace: str) -> None:
        """Keep a reference to the cluster clients pool and to the namespace used for all requests."""
        self.namespace = namespace
        self.client = client
68

69
    @staticmethod
    def cluster_id() -> ClusterId:
        """Return the id of the main (default) cluster."""
        main_cluster: ClusterId = DEFAULT_K8S_CLUSTER
        return main_cluster
73

74
    async def list_build_runs(self, user_id: str) -> AsyncIterable[BuildRun]:
        """Yield the Shipwright BuildRuns of the given user found in the client's namespace."""
        build_run_filter = K8sObjectFilter(namespace=self.namespace, gvk=BUILD_RUN_GVK, user_id=user_id)
        async for k8s_object in self.client.list(build_run_filter):
            # Every listed object is validated into the BuildRun model before being handed out.
            raw_manifest = k8s_object.manifest.to_dict()
            yield BuildRun.model_validate(raw_manifest)
80

81
    async def get_build_run(self, name: str, user_id: str) -> BuildRun | None:
        """Fetch a single Shipwright BuildRun by name, or return None when the client finds nothing."""
        lookup = K8sObjectMeta(
            name=name,
            namespace=self.namespace,
            cluster=self.cluster_id(),
            gvk=BUILD_RUN_GVK,
            user_id=user_id,
        )
        k8s_object = await self.client.get(lookup)
        if k8s_object is not None:
            return BuildRun.model_validate(k8s_object.manifest.to_dict())
        return None
96

97
    async def create_build_run(self, manifest: BuildRun, user_id: str) -> BuildRun:
        """Submit a new Shipwright BuildRun and return it as read back through the client.

        The namespace of the passed manifest is overwritten with the namespace of this client.
        Raises CannotStartBuildError when the BuildRun still cannot be read back after the retried lookup.
        """
        manifest.metadata.namespace = self.namespace
        build_run_name = manifest.metadata.name
        target = K8sObjectMeta(
            name=build_run_name,
            namespace=self.namespace,
            cluster=self.cluster_id(),
            gvk=BUILD_RUN_GVK,
            user_id=user_id,
        )
        await self.client.create(target.with_manifest(manifest=manifest.model_dump(exclude_none=True, mode="json")))

        # The lookup is wrapped in the backoff helper with "result is None" as its predicate.
        # NOTE(review): presumably this means "retry while nothing is found" - confirm against the helper.
        lookup_with_retries = retry_with_exponential_backoff_async(lambda found: found is None)(self.get_build_run)
        created = await lookup_with_retries(build_run_name, user_id)
        if created is not None:
            return created
        raise CannotStartBuildError(message=f"Cannot create the image build {build_run_name}")
116

117
    async def delete_build_run(self, name: str, user_id: str) -> None:
        """Remove the named Shipwright BuildRun of the given user from the client's namespace."""
        target = K8sObjectMeta(
            name=name,
            namespace=self.namespace,
            cluster=self.cluster_id(),
            gvk=BUILD_RUN_GVK,
            user_id=user_id,
        )
        return await self.client.delete(target)
128

129
    async def cancel_build_run(self, name: str, user_id: str) -> BuildRun:
        """Request cancellation of a Shipwright BuildRun and return the patched object."""
        target = K8sObjectMeta(
            name=name,
            namespace=self.namespace,
            cluster=self.cluster_id(),
            gvk=BUILD_RUN_GVK,
            user_id=user_id,
        )
        # Cancellation is requested by patching the desired state in the spec.
        cancel_patch = {"spec": {"state": "BuildRunCanceled"}}
        patched = await self.client.patch(target, patch=cancel_patch)
        return BuildRun.model_validate(patched.manifest.to_dict())
142

143
    async def get_task_run(self, name: str) -> TaskRun | None:
        """Fetch a Tekton TaskRun by name, or return None when the client finds nothing.

        Note: custom labels cannot be stored on tekton task runs, therefore a hard-coded fixed user id
        is used for them in the cache db.
        """
        lookup = K8sObjectMeta(
            name=name,
            namespace=self.namespace,
            cluster=self.cluster_id(),
            gvk=TASK_RUN_GVK,
            user_id=DUMMY_TASK_RUN_USER_ID,
        )
        k8s_object = await self.client.get(lookup)
        if k8s_object is not None:
            return TaskRun.model_validate(k8s_object.manifest.to_dict())
        return None
160

161
    async def create_image_build(self, params: models.ShipwrightBuildRunParams, user_id: str) -> None:
        """Assemble a Shipwright BuildRun manifest from the build parameters and submit it."""
        # Metadata: annotations and labels are only set when they are provided.
        metadata = crs.Metadata(name=params.name)
        if params.annotations:
            metadata.annotations = params.annotations
        if params.labels:
            metadata.labels = params.labels

        # Retention: only present when at least one of the two periods is set, expressed in whole seconds.
        retention: crs.Retention | None = None
        keep_failed = params.retention_after_failed
        keep_succeeded = params.retention_after_succeeded
        if keep_failed or keep_succeeded:
            failed_seconds = int(keep_failed.total_seconds()) if keep_failed else None
            succeeded_seconds = int(keep_succeeded.total_seconds()) if keep_succeeded else None
            retention = crs.Retention(
                ttlAfterFailed=f"{failed_seconds}s" if failed_seconds else None,
                ttlAfterSucceeded=f"{succeeded_seconds}s" if succeeded_seconds else None,
            )

        # Build spec: put together piece by piece to keep the final manifest readable.
        git_source = crs.GitSource(
            git=crs.Git(url=params.git_repository, revision=params.git_repository_revision),
            contextDir=params.context_dir,
        )
        strategy = crs.Strategy(kind="BuildStrategy", name=params.build_strategy_name)
        param_values = [
            crs.ParamValue(name="frontend", value=params.frontend),
            crs.ParamValue(name="run-image", value=params.run_image),
            crs.ParamValue(name="builder-image", value=params.build_image),
        ]
        build_output = crs.BuildOutput(
            image=params.output_image,
            pushSecret=params.push_secret_name,
        )
        timeout = f"{params.build_timeout.total_seconds()}s" if params.build_timeout else None
        build_spec = crs.BuildSpec(
            source=git_source,
            strategy=strategy,
            paramValues=param_values,
            output=build_output,
            timeout=timeout,
            nodeSelector=params.node_selector,
            tolerations=params.tolerations,
        )

        build_run = BuildRun(
            metadata=metadata,
            spec=crs.BuildRunSpec(build=crs.Build(spec=build_spec), retention=retention),
        )
        await self.create_build_run(build_run, user_id)
210

211
    async def update_image_build_status(self, buildrun_name: str, user_id: str) -> models.ShipwrightBuildStatusUpdate:
        """Derive the status update of a build from the corresponding Shipwright BuildRun.

        Args:
            buildrun_name: Name of the BuildRun to inspect.
            user_id: Id of the user the BuildRun belongs to.

        Returns:
            A status update whose ``update`` field is ``None`` while the BuildRun has not reached a final
            state. Otherwise the update carries either the succeeded status together with the build result,
            or the failed status (with the reason of the "Succeeded" condition when there is one).
            A missing BuildRun is treated as failed. Both a missing and a failed BuildRun are reported
            to Sentry before returning.
        """
        k8s_build = await self.get_build_run(name=buildrun_name, user_id=user_id)

        if k8s_build is None:
            # Report this condition to Sentry: we expected to find a buildrun object but found none.
            # The error is raised and caught immediately, presumably so the reported event has a traceback.
            try:
                raise errors.ProgrammingError(message=f"Build run {buildrun_name} not found.")
            except Exception as e:
                sentry_sdk.capture_exception(e)
            return models.ShipwrightBuildStatusUpdate(
                update=models.ShipwrightBuildStatusUpdateContent(status=models.BuildStatus.failed)
            )

        k8s_build_status = k8s_build.status
        completion_time = k8s_build_status.completionTime if k8s_build_status else None

        # Without a status or a completion time there is nothing to update yet.
        if k8s_build_status is None or completion_time is None:
            return models.ShipwrightBuildStatusUpdate(update=None)

        conditions = k8s_build_status.conditions
        # NOTE: You can get a condition like this in some cases during autoscaling or for other reasons
        #   message: Not all Steps in the Task have finished executing
        #   reason: Running
        #   status: Unknown
        #   /type: Succeeded
        # or
        #   message: TaskRun Pod exceeded available resources
        #   reason: ExceededNodeResources
        #   status: Unknown
        #   /type: Succeeded
        # In this case we want to keep waiting - the buildrun is still running.
        # A fully successful completion condition looks like this:
        #   reason: Succeeded
        #   status: True
        #   /type: Succeeded
        # See https://shipwright.io/docs/build/buildrun/#understanding-the-state-of-a-buildrun
        # NOTE: In the examples above I put / before the type field because mypy parses that and fails.
        # So I needed something to keep mypy happy. The real name of the field is "type"
        condition = next(filter(lambda c: c.type == "Succeeded", conditions or []), None)

        if condition is not None and condition.status not in ["True", "False"]:
            # The buildrun is still running or pending
            return models.ShipwrightBuildStatusUpdate(update=None)

        # Collect the result details, falling back to "unknown" for every piece that is missing.
        buildSpec = k8s_build_status.buildSpec
        output = buildSpec.output if buildSpec else None
        result_image = output.image if output else "unknown"

        source = buildSpec.source if buildSpec else None
        git_obj = source.git if source else None
        result_repository_url = git_obj.url if git_obj else "unknown"

        source_2 = k8s_build_status.source
        git_obj_2 = source_2.git if source_2 else None
        result_repository_git_commit_sha = git_obj_2.commitSha if git_obj_2 else None
        result_repository_git_commit_sha = result_repository_git_commit_sha or "unknown"

        if condition is not None and condition.reason == "Succeeded" and condition.status == "True":
            return models.ShipwrightBuildStatusUpdate(
                update=models.ShipwrightBuildStatusUpdateContent(
                    status=models.BuildStatus.succeeded,
                    completed_at=completion_time,
                    result=models.BuildResult(
                        completed_at=completion_time,
                        image=result_image,
                        repository_url=result_repository_url,
                        repository_git_commit_sha=result_repository_git_commit_sha,
                    ),
                )
            )

        # Report the failed buildrun to Sentry
        try:
            raise errors.ProgrammingError(message=f"Build run {buildrun_name} detected as failed.")
        except Exception as e:
            # The buildrun manifest is attached on a forked scope which is discarded when the block exits.
            # Setting the context on the current scope instead would keep it attached to that scope and
            # leak it into later, unrelated Sentry events.
            with sentry_sdk.new_scope() as scope:
                scope.set_context(key="build_run", value=k8s_build.model_dump(mode="json"))
                scope.capture_exception(e)
        return models.ShipwrightBuildStatusUpdate(
            update=models.ShipwrightBuildStatusUpdateContent(
                status=models.BuildStatus.failed,
                completed_at=completion_time,
                error_reason=condition.reason if condition is not None else None,
            )
        )
297

298
    async def get_image_build_logs(
        self, buildrun_name: str, user_id: str, max_log_lines: int | None = None
    ) -> dict[str, str]:
        """Return the container logs of the pod that runs the given Shipwright BuildRun.

        The pod is resolved step by step: BuildRun -> TaskRun -> pod. A MissingResourceError is raised
        as soon as one link of that chain cannot be resolved.
        """
        build_run = await self.get_build_run(name=buildrun_name, user_id=user_id)
        if not build_run:
            raise errors.MissingResourceError(message=f"Cannot find buildrun {buildrun_name} to retrieve logs.")

        # The name of the task run is recorded in the status of the build run.
        task_run_name = None
        if build_run.status:
            task_run_name = build_run.status.taskRunName
        if not task_run_name:
            raise errors.MissingResourceError(
                message=f"The buildrun {buildrun_name} has no taskrun to retrieve logs from."
            )

        task_run = await self.get_task_run(name=task_run_name)
        if not task_run:
            raise errors.MissingResourceError(
                message=f"Cannot find taskrun from buildrun {buildrun_name} to retrieve logs."
            )

        # The name of the pod is recorded in the status of the task run.
        pod_name = None
        if task_run.status:
            pod_name = task_run.status.podName
        if not pod_name:
            raise errors.MissingResourceError(message=f"The buildrun {buildrun_name} has no pod to retrieve logs from.")

        return await self._get_pod_logs(name=pod_name, max_log_lines=max_log_lines)
320

321
    async def _get_pod_logs(self, name: str, max_log_lines: int | None = None) -> dict[str, str]:
        """Collect the logs of every container (regular and init) of the named pod.

        Returns a mapping from container name to its log text. The mapping is empty when the client
        does not find the pod, and containers whose logs are not readable yet are left out.
        Raises MissingResourceError when reading the logs reports that the pod does not exist.
        """
        pod_meta = K8sObjectMeta(
            name=name, namespace=self.namespace, cluster=self.cluster_id(), gvk=GVK(kind="Pod", version="v1")
        )
        found = await self.client.get(pod_meta)
        if found is None:
            return {}

        # Turn the found object into a kr8s Pod bound to the API of the cluster it was found in.
        cluster = await self.client.cluster_by_id(found.cluster)
        api_object = found.to_api_object(cluster.api)
        pod = Pod(resource=api_object, namespace=api_object.namespace, api=cluster.api)

        logs_by_container: dict[str, str] = {}
        all_containers = pod.spec.containers + pod.spec.get("initContainers", [])
        for container_name in [entry.name for entry in all_containers]:
            try:
                # NOTE: calling pod.logs without a container name set crashes the library
                lines: list[str] = [
                    line async for line in pod.logs(container=container_name, tail_lines=max_log_lines)
                ]
            except httpx.ResponseNotRead:
                # NOTE: This occurs when the container is still starting, but we try to read its logs
                continue
            except httpx.HTTPStatusError as err:
                # NOTE: This occurs when the container is waiting to start, but we try to read its logs
                if err.response.status_code == 400:
                    continue
                raise
            except NotFoundError as err:
                raise errors.MissingResourceError(message=f"The pod {name} does not exist.") from err
            except ServerError as err:
                if err.response is not None and err.response.status_code == 404:
                    raise errors.MissingResourceError(message=f"The pod {name} does not exist.") from err
                raise
            # Only reached when the logs were read without an error.
            logs_by_container[container_name] = "\n".join(lines)
        return logs_by_container
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2025 Coveralls, Inc