SwissDataScienceCenter / renku-python, build 6875247711
15 Nov 2023 09:16AM UTC. Coverage: 82.786% (-0.05%) from 82.831%
Pull Request #3300: chore: do not always retry load tests requests (github / web-flow, merge e2d3269e8 into 4726f660e)
25441 of 30731 relevant lines covered (82.79%), 3.12 hits per line

Source file: /renku/domain_model/dataset.py (92.84% covered)

# Copyright Swiss Data Science Center (SDSC). A partnership between
# École Polytechnique Fédérale de Lausanne (EPFL) and
# Eidgenössische Technische Hochschule Zürich (ETHZ).
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Models representing datasets."""

import copy
import os
import posixpath
from datetime import datetime
from pathlib import Path
from typing import TYPE_CHECKING, Dict, List, Optional, Union, cast
from urllib.parse import quote, urlparse
from uuid import uuid4

import deal
import marshmallow

from renku.core import errors
from renku.core.util.datetime8601 import fix_datetime, local_now, parse_date
from renku.core.util.git import get_entity_from_revision
from renku.core.util.metadata import is_linked_file
from renku.core.util.os import get_absolute_path
from renku.core.util.urls import get_slug
from renku.domain_model.constant import NO_VALUE, NON_EXISTING_ENTITY_CHECKSUM
from renku.domain_model.image import ImageObject
from renku.domain_model.project_context import project_context
from renku.infrastructure.immutable import Immutable, Slots
from renku.infrastructure.persistent import Persistent

if TYPE_CHECKING:
    from renku.domain_model.entity import Entity
    from renku.domain_model.provenance.agent import Person
    from renku.domain_model.provenance.annotation import Annotation


def is_dataset_slug_valid(slug: Optional[str]) -> bool:
    """Check if a given slug is valid."""
    # NOTE: Empty string, ``""``, isn't a valid slug.
    return slug is not None and slug != "" and slug == get_slug(slug, lowercase=False)


def generate_default_slug(name: str, version: Optional[str] = None) -> str:
    """Generate a default dataset slug from name and version."""
    max_length = 24
    # For compatibility with older versions, use the name as the slug if it is valid; otherwise, use a slugified name
    if is_dataset_slug_valid(name):
        return name

    slug = get_slug(name)
    slug = slug[:max_length]

    if version:
        max_version_length = 10
        version_slug = get_slug(version)[:max_version_length]
        slug = f"{slug[:-(len(version_slug) + 1)]}_{version_slug}"

    return get_slug(slug)
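
# A sketch of the slug construction above (illustrative, with hypothetical inputs; the
# exact output depends on ``get_slug``'s slugification rules): an invalid name is
# slugified and capped at 24 characters; when a version is given, its slug (at most 10
# characters) replaces the tail as ``f"{head}_{version_slug}"``, keeping the combined
# slug within the 24-character cap.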


class Url:
    """Represents a schema URL reference."""

    def __init__(
        self,
        *,
        id: Optional[str] = None,
        url: Optional[Union[str, Dict[str, str]]] = None,
        url_str: Optional[str] = None,
        url_id: Optional[str] = None,
    ):
        self.id: str
        self.url: Union[str, Dict[str, str]]
        self.url_str: Optional[str] = url_str
        self.url_id: Optional[str] = url_id

        if not url:
            self.url = self._get_default_url()
        else:
            self.url = url
            if isinstance(self.url, dict):
                if "_id" in self.url:
                    self.url["@id"] = self.url.pop("_id")
                self.url_id = self.url["@id"]
                self.url_str = None
            elif isinstance(self.url, str):
                self.url_str = self.url
                self.url_id = None

        if not id or id.startswith("_:"):
            self.id = Url.generate_id(url_str=self.url_str, url_id=self.url_id)
        else:
            self.id = id

    def __repr__(self) -> str:
        return f"<Url {self.value}>"

    @staticmethod
    def generate_id(url_str, url_id):
        """Generate an identifier for Url."""
        url = url_str or url_id
        id = urlparse(url)._replace(scheme="").geturl().strip("/") if url else uuid4().hex
        id = quote(id, safe="/")

        return f"/urls/{id}"

    @property
    def value(self) -> str:
        """Return the url value as a string."""
        return cast(str, self.url_str or self.url_id)

    def _get_default_url(self):
        """Define default value for url field."""
        if self.url_str:
            return self.url_str
        elif self.url_id:
            return {"@id": self.url_id}
        else:
            raise NotImplementedError("Either url_id or url_str has to be set")
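
# A worked example of ``Url.generate_id`` (hypothetical URL): the scheme is dropped,
# slashes are kept, and other special characters are percent-encoded, so
# ``Url.generate_id(url_str="https://example.com/my data", url_id=None)`` returns
# ``"/urls/example.com/my%20data"``; with neither value set, a random UUID hex is used.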


class DatasetTag(Persistent):
    """Represents a tag of a dataset instance."""

    def __init__(
        self,
        *,
        dataset_id: Url,
        date_created: Optional[datetime] = None,
        description: Optional[str] = None,
        id: Optional[str] = None,
        name: str,
    ):
        if not id:
            id = DatasetTag.generate_id(dataset_id=dataset_id.value, name=name)

        self.dataset_id: Url = dataset_id
        self.date_created: datetime = parse_date(date_created) or local_now()
        self.description: Optional[str] = description
        self.id: str = id
        self.name: str = name

    @staticmethod
    def generate_id(dataset_id: str, name: str) -> str:
        """Define default value for id field."""
        identifier = Path(dataset_id).name
        name = quote(f"{name}@{identifier}", safe="")
        return f"/dataset-tags/{name}"
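
# A worked example of ``DatasetTag.generate_id`` (hypothetical values): only the last
# path segment of the dataset id is kept, and ``name@identifier`` is percent-encoded,
# so ``DatasetTag.generate_id(dataset_id="/datasets/abc123", name="v1")`` returns
# ``"/dataset-tags/v1%40abc123"``.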


class Language(Immutable):
    """Represent a language of an object."""

    __slots__ = ("alternate_name", "name")

    def __init__(self, name: str, alternate_name: Optional[str] = None, id: Optional[str] = None):
        id = id or Language.generate_id(name)
        super().__init__(alternate_name=alternate_name, id=id, name=name)

    @staticmethod
    def generate_id(name: str) -> str:
        """Generate @id field."""
        name = quote(name, safe="")
        return f"/languages/{name}"


class RemoteEntity(Slots):
    """Reference to an Entity in a remote repository."""

    __slots__ = ("checksum", "id", "path", "url")

    def __init__(self, *, checksum: Optional[str], id: Optional[str] = None, path: Union[Path, str], url: str):
        super().__init__()
        self.checksum: str = checksum or NON_EXISTING_ENTITY_CHECKSUM
        self.id: str = id or RemoteEntity.generate_id(checksum=self.checksum, path=path, url=url)
        self.path: str = str(path)
        self.url: str = url

    @staticmethod
    def generate_id(checksum: str, path: Union[Path, str], url: str) -> str:
        """Generate an id."""
        parsed_url = urlparse(url)
        prefix = quote(posixpath.join(parsed_url.netloc.strip("/"), parsed_url.path.strip("/")))
        path = quote(str(path).strip("/"))
        return f"/remote-entities/{prefix}/{checksum}/{path}"

    def __eq__(self, other):
        if self is other:
            return True
        if not isinstance(other, RemoteEntity):
            return False
        return self.checksum == other.checksum and self.path == other.path and self.url == other.url

    def __hash__(self):
        return hash((self.checksum, self.path, self.url))
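
# A worked example of ``RemoteEntity.generate_id`` (hypothetical remote): the URL's
# netloc and path form the prefix, so with checksum="abc123", path="data/file.csv",
# and url="https://github.com/user/repo", the result is
# ``"/remote-entities/github.com/user/repo/abc123/data/file.csv"``.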


class DatasetFile(Slots):
    """A file in a dataset."""

    __slots__ = ("based_on", "date_added", "date_removed", "entity", "id", "is_external", "source", "linked", "size")

    @deal.ensure(lambda self, *_, result, **kwargs: self.date_removed is None or self.date_removed >= self.date_added)
    def __init__(
        self,
        *,
        entity: "Entity",
        based_on: Optional[RemoteEntity] = None,
        date_added: Optional[datetime] = None,
        date_removed: Optional[datetime] = None,
        id: Optional[str] = None,
        is_external: Optional[bool] = False,
        linked: Optional[bool] = False,
        source: Optional[Union[Path, str]] = None,
        size: Optional[int] = None,
    ):
        from renku.domain_model.entity import Entity

        assert entity is None or isinstance(entity, Entity), f"Invalid entity type: '{entity}'"

        super().__init__()

        self.based_on: Optional[RemoteEntity] = based_on
        self.date_added: datetime = fix_datetime(date_added) or local_now()
        self.date_removed: Optional[datetime] = fix_datetime(date_removed)
        self.entity: "Entity" = entity
        self.id: str = id or DatasetFile.generate_id()
        self.is_external: bool = is_external or False
        self.linked: bool = linked or False
        # NOTE: Keep ``None`` as-is instead of coercing it to the string "None"
        self.source: Optional[str] = str(source) if source is not None else None
        self.size: Optional[int] = size

    @classmethod
    def from_path(
        cls,
        path: Union[str, Path],
        source=None,
        based_on: Optional[RemoteEntity] = None,
        checksum: Optional[str] = None,
        size: Optional[int] = None,
    ) -> "DatasetFile":
        """Return an instance from a path."""
        from renku.domain_model.entity import Entity

        # NOTE: Data is added from an external storage and isn't pulled yet
        if based_on and not (project_context.path / path).exists():
            checksum = based_on.checksum if based_on.checksum else NON_EXISTING_ENTITY_CHECKSUM
            id = Entity.generate_id(checksum=checksum, path=path)
            entity = Entity(id=id, checksum=checksum, path=path)
        else:
            entity = get_entity_from_revision(
                repository=project_context.repository, path=path, bypass_cache=True, checksum=checksum
            )

        is_external = False
        linked = is_linked_file(path=path, project_path=project_context.path)
        return cls(entity=entity, is_external=is_external, source=source, based_on=based_on, linked=linked, size=size)

    @staticmethod
    def generate_id():
        """Generate an identifier for DatasetFile.

        NOTE: The ID should not rely on Entity properties because the same Entity can be added and removed multiple
        times; each occurrence must be tracked by a different DatasetFile.
        """
        return f"/dataset-files/{uuid4().hex}"

    @classmethod
    def from_dataset_file(cls, other: "DatasetFile") -> "DatasetFile":
        """Return a copy with a different id."""
        self = other.copy()
        self.id = DatasetFile.generate_id()

        return self

    def __repr__(self) -> str:
        return f"<DatasetFile {self.entity.path}>"

    def correct_linked_attribute(self):
        """Replace ``is_external`` attribute with ``linked`` for linked dataset files."""
        if self.is_external and is_linked_file(self.entity.path, project_path=project_context.path):
            self.linked = True
            self.is_external = False
        elif not hasattr(self, "linked"):
            self.linked = False

    def copy(self) -> "DatasetFile":
        """Return a clone of this object."""
        return copy.copy(self)

    def is_equal_to(self, other: "DatasetFile"):
        """Compare content.

        NOTE: id is generated randomly and should not be included in this comparison.
        """
        return (
            self.based_on == other.based_on
            and self.date_added == other.date_added
            and self.date_removed == other.date_removed
            and self.entity == other.entity
            and self.is_external == other.is_external
            and self.linked == other.linked
            and self.source == other.source
        )

    @deal.ensure(lambda self, *_, result, **kwargs: self.date_removed >= self.date_added)
    def remove(self, date: Optional[datetime] = None):
        """Mark the file as removed."""
        date_removed = fix_datetime(date) or local_now()
        self.date_removed = date_removed

    def is_removed(self) -> bool:
        """Return true if the file is removed and should not be accessed."""
        return self.date_removed is not None

    def has_valid_checksum(self) -> bool:
        """Return whether the file has a valid checksum."""
        return (
            bool(self.entity.checksum)
            and self.entity.checksum != NON_EXISTING_ENTITY_CHECKSUM
            and (
                self.based_on is None
                or self.based_on.checksum != NON_EXISTING_ENTITY_CHECKSUM
                or bool(self.based_on.checksum)
            )
        )

    def has_valid_size(self) -> bool:
        """Return whether the file has a valid size."""
        return self.size is not None
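
# NOTE: The ``deal.ensure`` decorators above are runtime postconditions; violating one
# raises a contract error (``deal.PostContractError`` under deal's default settings).
# A minimal sketch with hypothetical values:
#
#     file = DatasetFile.from_path("data/my-dataset/file.csv")
#     file.remove(date=datetime(1970, 1, 1, tzinfo=timezone.utc))
#     # raises: the date precedes ``date_added``, so ``date_removed >= date_added`` fails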


class Dataset(Persistent):
    """Represent a dataset."""

    date_modified: Optional[datetime] = None  # type: ignore
    storage: Optional[str] = None
    datadir: Optional[str] = None

    @deal.ensure(
        lambda self, *_, result, **kwargs: (self.date_created is not None and self.date_published is None)
        or (self.date_created is None and self.date_published is not None)
    )
    @deal.ensure(
        lambda self, *_, result, **kwargs: (self.date_created is not None and self.date_modified >= self.date_created)
        or (self.date_published is not None and self.date_modified >= self.date_published)
    )
    @deal.ensure(
        lambda self, *_, result, **kwargs: self.date_removed is None or self.date_removed >= self.date_modified
    )
    def __init__(
        self,
        *,
        annotations: Optional[List["Annotation"]] = None,
        creators: Optional[List["Person"]] = None,
        datadir: Optional[Path] = None,
        dataset_files: Optional[List[DatasetFile]] = None,
        date_created: Optional[datetime] = None,
        date_modified: Optional[datetime] = None,
        date_published: Optional[datetime] = None,
        date_removed: Optional[datetime] = None,
        derived_from: Optional[Url] = None,
        description: Optional[str] = None,
        id: Optional[str] = None,
        identifier: Optional[str] = None,
        images: Optional[List[ImageObject]] = None,
        in_language: Optional[Language] = None,
        initial_identifier: Optional[str] = None,
        keywords: Optional[List[str]] = None,
        license: Optional[str] = None,
        name: Optional[str] = None,
        project_id: Optional[str] = None,
        same_as: Optional[Url] = None,
        slug: Optional[str] = None,
        storage: Optional[str] = None,
        title: Optional[str] = None,
        version: Optional[str] = None,
    ):
        if not slug:
            if title:  # NOTE: Old metadata which only has name/title
                slug, name, title = name, title, None
            elif not name:
                raise errors.ParameterError("Either 'slug', 'name' or 'title' must be set.", show_prefix=False)

            # NOTE: At this point we have new metadata with slug/name
            slug = slug or generate_default_slug(name, version)
        elif title:
            # NOTE: When both slug and title are set, copy title to name. This happens when transitioning from the
            # old metadata to the new one.
            name, title = title, None

        self._validate_slug(slug)
        self._validate_creator(creators)

        # NOTE: If `date_published` is set, we are probably dealing with an imported dataset, so `date_created` is
        # not needed
        if date_published:
            date_created = None
        else:
            date_created = fix_datetime(date_created) or local_now()
        if initial_identifier is None:
            assert identifier is None, "Initial identifier can be None only when creating a new Dataset."
            initial_identifier = identifier = uuid4().hex

        self.identifier = identifier or uuid4().hex
        self.id = id or Dataset.generate_id(identifier=self.identifier)

        self.name: Optional[str] = name
        self.slug: str = slug
        self.title: Optional[str] = None

        self.creators: List["Person"] = creators or []
        # NOTE: `dataset_files` includes existing files and those that have been removed in the previous version
        self.dataset_files: List[DatasetFile] = dataset_files or []
        self.date_created: Optional[datetime] = date_created
        self.date_modified: datetime = date_modified or local_now()
        self.date_published: Optional[datetime] = fix_datetime(date_published)
        self.date_removed: Optional[datetime] = fix_datetime(date_removed)
        self.derived_from: Optional[Url] = derived_from
        self.description: Optional[str] = description
        self.images: List[ImageObject] = images or []
        self.in_language: Optional[Language] = in_language
        self.initial_identifier: str = initial_identifier
        self.keywords: List[str] = keywords or []
        self.license: Optional[str] = license
        self.project_id: Optional[str] = project_id
        self.same_as: Optional[Url] = same_as
        self.storage: Optional[str] = storage
        self.version: Optional[str] = version
        self.annotations: List["Annotation"] = annotations or []

        if datadir:
            self.datadir: Optional[str] = str(datadir)

        self._correct_linked_files()

    def __setstate__(self, state):
        super().__setstate__(state)
        self._adjust_slug_and_name()
        self._correct_linked_files()

    def _correct_linked_files(self):
        """Fix linked dataset files."""
        for file in self.dataset_files:
            file.correct_linked_attribute()

    def _adjust_slug_and_name(self):
        """Replace name/title with slug/name if needed."""
        slug = getattr(self, "slug", None)
        if not slug:  # NOTE: Dataset doesn't have new metadata since slug isn't set
            self.slug, self.name, self.title = self.name, self.title, None  # type: ignore
        else:
            assert self.title is None, f"Invalid slug: '{slug}', name: '{self.name}', and title: '{self.title}' values"

    @staticmethod
    def generate_id(identifier: str) -> str:
        """Generate an identifier for Dataset."""
        return f"/datasets/{identifier}"

    @staticmethod
    def _validate_slug(slug: Optional[str]):
        if not is_dataset_slug_valid(slug):
            raise errors.ParameterError(f"Invalid dataset slug: '{slug}'")

    @staticmethod
    def _validate_creator(creators):
        from renku.domain_model.provenance.agent import Person, SoftwareAgent

        creators = creators or []
        for creator in creators:
            if not isinstance(creator, (Person, SoftwareAgent)):
                raise ValueError(f"Invalid creator type: {creator}")

    @property
    def files(self) -> List[DatasetFile]:
        """Return list of existing files."""
        return [f for f in self.dataset_files if not f.is_removed()]

    @property
    def creators_csv(self):
        """Comma-separated list of creators associated with dataset."""
        return ", ".join(creator.name for creator in self.creators)

    @property
    def creators_full_csv(self):
        """Comma-separated list of creators with full identity."""
        return ", ".join(creator.full_identity for creator in self.creators)

    @property
    def keywords_csv(self):
        """Comma-separated list of keywords associated with dataset."""
        return ", ".join(self.keywords or [])

    def get_datadir(self) -> Path:
        """Return dataset's data directory relative to project's root."""
        if self.datadir:
            return Path(self.datadir)

        return Path(os.path.join(project_context.datadir, self.slug))

    def __repr__(self) -> str:
        return f"<Dataset {self.identifier} {self.slug}>"

    def is_derivation(self) -> bool:
        """Return whether the dataset has a valid ``derived_from``."""
        return self.derived_from is not None and not self.same_as and self.id != self.derived_from.url_id

    def copy(self) -> "Dataset":
        """Return a clone of this dataset."""
        try:
            self.unfreeze()
            dataset = copy.copy(self)
        finally:
            self.freeze()

        dataset.annotations = [a.copy() for a in self.annotations]
        dataset.creators = self.creators.copy()
        dataset.dataset_files = [f.copy() for f in self.dataset_files]
        dataset.images = list(dataset.images or [])
        dataset.keywords = list(dataset.keywords or [])
        return dataset

    def replace_identifier(self, identifier: Optional[str] = None):
        """Replace dataset's identifier and update relevant fields.

        NOTE: Call this only for newly-created/-imported datasets that don't have a mutability chain because it sets
        `initial_identifier`.
        """
        assert self.derived_from is None, (
            f"Replacing identifier of dataset '{self.slug}:{self.identifier}' "
            f"that is derived from {self.derived_from.url_id}"
        )

        self._assign_new_identifier(identifier)
        # NOTE: Do not unset `same_as` because it can be set for imported datasets

    @deal.ensure(
        lambda self, *_, result, **kwargs: (self.date_created is not None and self.date_published is None)
        or (self.date_created is None and self.date_published is not None)
    )
    @deal.ensure(
        lambda self, *_, result, **kwargs: (self.date_created is not None and self.date_modified >= self.date_created)
        or (self.date_published is not None and self.date_modified >= self.date_published)
    )
    @deal.ensure(lambda self, *_, result, **kwargs: self.derived_from is not None)
    def derive_from(
        self,
        dataset: "Dataset",
        creator: Optional["Person"],
        identifier: Optional[str] = None,
        date_created: Optional[datetime] = None,
    ):
        """Make `self` a derivative of `dataset` and update related fields."""
        assert dataset is not None, "Cannot derive from None"
        assert self is not dataset, f"Cannot derive from the same dataset '{self.slug}:{self.identifier}'"
        assert not identifier or self.id != identifier, f"Cannot derive from the same id '{self.slug}:{identifier}'"

        self._assign_new_identifier(identifier)
        # NOTE: Setting `initial_identifier` is required for migration of broken projects
        self.initial_identifier = dataset.initial_identifier
        self.derived_from = Url(url_id=dataset.id)
        self.same_as = None
        self.date_created = date_created or dataset.date_created
        self.date_modified = local_now()
        self.date_published = dataset.date_published

        if creator and hasattr(creator, "email") and not any(c for c in self.creators if c.email == creator.email):
            self.creators.append(creator)

    def _assign_new_identifier(self, identifier: Optional[str]):
        identifier = identifier or uuid4().hex
        self.initial_identifier = identifier
        self.identifier = identifier
        self.id = Dataset.generate_id(identifier)
        # NOTE: We also need to re-assign the _p_oid since identifier has changed
        self.reassign_oid()

    @deal.ensure(lambda self, *_, result, **kwargs: self.date_removed >= self.date_modified)
    def remove(self, date: Optional[datetime] = None):
        """Mark the dataset as removed."""
        self.date_removed = fix_datetime(date) or local_now()

    def is_removed(self) -> bool:
        """Return true if dataset is removed."""
        return self.date_removed is not None

    def find_file(self, path: Union[Path, str]) -> Optional[DatasetFile]:
        """Find a file in the dataset using its relative path."""
        path = str(path)
        for file in self.dataset_files:
            if str(file.entity.path) == path and not file.is_removed():
                return file

        return None

    def update_files_from(self, current_dataset: "Dataset", date: Optional[datetime] = None):
        """Check `current_files` to reuse existing entries and mark removed files."""
        new_files: Dict[str, DatasetFile] = {f.entity.path: f for f in self.files}
        current_files: Dict[str, DatasetFile] = {f.entity.path: f for f in current_dataset.files}

        files = []

        for path, file in new_files.items():
            # Use existing entries from `current_files` to avoid creating new ids
            current_file = current_files.pop(path, None)
            if current_file and file.is_equal_to(current_file):
                files.append(current_file)
            else:
                files.append(file)

        # NOTE: Whatever remains in `current_files` was removed in the newer version
        for removed_file in current_files.values():
            removed_file = DatasetFile.from_dataset_file(removed_file)
            removed_file.remove(date)
            files.append(removed_file)

        self.dataset_files = files

    def update_metadata_from(self, other: "Dataset", exclude=None):
        """Update metadata from another dataset."""
        updatable_fields = [
            "creators",
            "date_created",
            "date_published",
            "derived_from",
            "description",
            "images",
            "in_language",
            "keywords",
            "license",
            "name",
            "same_as",
            "version",
        ]
        for name in updatable_fields:
            value = getattr(other, name)
            if exclude and name in exclude:
                continue
            setattr(self, name, value)

        if self.date_published is not None:
            self.date_created = None

        # NOTE: Fix image IDs; in some cases the image IDs set by the providers can be malformed and not match the
        # SHACL definition for Renku. This cannot be addressed in the dataset providers because they don't have
        # access to the dataset ID, which is needed for setting the dataset image ID.
        if isinstance(self.images, list):
            for image_ind in range(len(self.images)):
                self.images[image_ind].id = ImageObject.generate_id(self.id, self.images[image_ind].position)

    def update_metadata(self, **kwargs):
        """Update metadata."""
        editable_attributes = ["creators", "description", "keywords", "name"]
        for name, value in kwargs.items():
            if name not in editable_attributes:
                raise errors.ParameterError(f"Cannot edit field: '{name}'")
            if value is not NO_VALUE and value != getattr(self, name):
                setattr(self, name, value)

    def unlink_file(self, path, missing_ok=False) -> Optional[DatasetFile]:
        """Mark a file as removed using its relative path."""
        assert not self.immutable, f"Dataset is immutable {self}"

        file = self.find_file(path)

        if not file:
            if not missing_ok:
                raise errors.InvalidFileOperation(f"File cannot be found: {path}")
            return None

        file.remove()

        return file

    def is_within_datadir(self, path: Union[Path, str]) -> bool:
        """Return True if a given path is inside dataset's data directory."""
        datadir = get_absolute_path(self.get_datadir())
        absolute_path = get_absolute_path(path)
        return os.path.commonpath([absolute_path, datadir]) == datadir

    def add_or_update_files(self, files: Union[DatasetFile, List[DatasetFile]]):
        """Add new files or update existing files."""
        assert not self.immutable, f"Dataset is immutable {self}"

        if isinstance(files, DatasetFile):
            files = [files]

        new_files = []

        for file in cast(List[DatasetFile], files):
            existing_file = self.find_file(file.entity.path)
            if not existing_file:
                new_files.append(file)
            elif file.entity.checksum != existing_file.entity.checksum or file.date_added != existing_file.date_added:
                self.dataset_files.remove(existing_file)
                new_files.append(file)

        if not new_files:
            return

        self.dataset_files += new_files
        self._p_changed = True

    def clear_files(self):
        """Remove all files."""
        self.dataset_files = []
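
# A minimal sketch of the derivation flow (illustrative; ignores persistence details
# such as freezing and storage): mutating a dataset means deriving a new copy from it.
#
#     new_dataset = dataset.copy()
#     new_dataset.derive_from(dataset, creator=None)
#     assert new_dataset.is_derivation()
#     assert new_dataset.derived_from.url_id == dataset.id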


class DatasetCreatorsJson(marshmallow.Schema):
    """Schema for the dataset creators."""

    name = marshmallow.fields.String()
    email = marshmallow.fields.String()
    affiliation = marshmallow.fields.String()


class AnnotationJson(marshmallow.Schema):
    """Schema for Annotations."""

    source = marshmallow.fields.String()
    body = marshmallow.fields.Dict()


class DatasetDetailsJson(marshmallow.Schema):
    """Serialize a dataset to a response object."""

    slug = marshmallow.fields.String(required=True)
    version = marshmallow.fields.String(allow_none=True)
    created_at = marshmallow.fields.String(allow_none=True, attribute="date_created")

    name = marshmallow.fields.String()
    creators = marshmallow.fields.List(marshmallow.fields.Nested(DatasetCreatorsJson))
    description = marshmallow.fields.String()
    keywords = marshmallow.fields.List(marshmallow.fields.String())
    identifier = marshmallow.fields.String()
    storage = marshmallow.fields.String()

    annotations = marshmallow.fields.List(marshmallow.fields.Nested(AnnotationJson))

    data_directory = marshmallow.fields.Method("get_datadir")

    @staticmethod
    def get_datadir(obj):
        """Get data directory."""
        if isinstance(obj, dict):
            return str(obj.get("datadir_path", obj.get("datadir", "")))
        if hasattr(obj, "datadir_path"):
            return obj.datadir_path

        return str(obj.get_datadir())


class DatasetFileDetailsJson(marshmallow.Schema):
    """Serialize dataset files to a response object."""

    path = marshmallow.fields.String()
    created = marshmallow.fields.DateTime()
    added = marshmallow.fields.DateTime()

    size = marshmallow.fields.String()
    is_lfs = marshmallow.fields.Boolean()

    dataset_id = marshmallow.fields.String()
    dataset_slug = marshmallow.fields.String()

    creators = marshmallow.fields.List(marshmallow.fields.Nested(DatasetCreatorsJson))


class ImageObjectJson(marshmallow.Schema):
    """ImageObject json schema."""

    content_url = marshmallow.fields.String()
    position = marshmallow.fields.Integer()


class ImageObjectRequestJson(marshmallow.Schema):
    """ImageObject request json schema."""

    file_id = marshmallow.fields.String()
    content_url = marshmallow.fields.String()
    position = marshmallow.fields.Integer(load_default=0)
    mirror_locally = marshmallow.fields.Bool(load_default=False)
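
# These marshmallow schemas serialize domain objects into plain dicts for API
# responses. A minimal usage sketch (hypothetical ``dataset`` instance):
#
#     details = DatasetDetailsJson().dump(dataset)
#     # details is a dict with keys such as "slug", "creators", and "data_directory"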


def get_file_path_in_dataset(dataset: Dataset, dataset_file: DatasetFile) -> Path:
    """Return path of a file relative to dataset's data dir."""
    try:
        return (project_context.path / dataset_file.entity.path).relative_to(
            project_context.path / dataset.get_datadir()
        )
    except ValueError:  # NOTE: File is not in the dataset's data dir
        return Path(dataset_file.entity.path)