• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

cortex-lab / alyx / 28589232838

02 Jul 2026 12:15PM UTC coverage: 86.733% (+0.2%) from 86.582%
28589232838

Pull #1007

github

web-flow
Merge b86f446ae into 5e2ad3f63
Pull Request #1007: Data Noice model for information about datasets

8839 of 10191 relevant lines covered (86.73%)

0.87 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

93.15
alyx/data/models.py
1
import logging
1✔
2
import markdown as _markdown
1✔
3
from enum import IntEnum
1✔
4
from one.alf.spec import QC
1✔
5

6
from django.core.validators import RegexValidator
1✔
7
from django.db import models
1✔
8
from django.conf import settings
1✔
9
from django.utils import timezone
1✔
10
from django.contrib.contenttypes.fields import GenericForeignKey
1✔
11
from django.contrib.contenttypes.models import ContentType
1✔
12

13
from actions.models import Session
1✔
14
from alyx.base import BaseModel, modify_fields, BaseManager, CharNullField, BaseQuerySet, ALF_SPEC
1✔
15

16
logger = logging.getLogger(__name__)
1✔
17

18

19
def _related_string(field):
1✔
20
    return "%(app_label)s_%(class)s_" + field + "_related"
1✔
21

22

23
def default_timezone():
    """Return the ``TIME_ZONE`` value of the Django settings (used as a field default)."""
    tz_name = settings.TIME_ZONE
    return tz_name
25

26

27
# Data repositories
28
# ------------------------------------------------------------------------------------------------
29

30
class NameManager(models.Manager):
    """Manager whose natural key is the model's ``name`` field."""

    def get_by_natural_key(self, name):
        """Return the single instance whose ``name`` equals *name*."""
        lookup = {'name': name}
        return self.get(**lookup)
33

34

35
class DataRepositoryType(BaseModel):
    """
    The kind of a data repository, for instance a local SAMBA file server, a web archive
    or an LTO tape.
    """
    objects = NameManager()

    # Unique label of the repository type; also the default sort key (see Meta).
    name = models.CharField(unique=True, max_length=255)

    class Meta:
        ordering = ('name',)

    def __str__(self):
        return f"<DataRepositoryType '{self.name}'>"
48

49

50
class DataRepository(BaseModel):
    """
    One concrete storage location: a particular local drive, a specific cloud storage
    location, or a specific tape.

    Stores an absolute path to the repository root as a URI (e.g. for SMB
    file://myserver.mylab.net/Data/ALF/; for web https://www.neurocloud.edu/Data/).
    Additional information about the repository can be stored in JSON in a type-specific
    manner (e.g. which cardboard box to find a tape in).
    """
    objects = NameManager()

    name = models.CharField(unique=True, max_length=255)
    repository_type = models.ForeignKey(
        DataRepositoryType,
        on_delete=models.CASCADE,
        null=True,
        blank=True,
    )
    hostname = models.CharField(
        max_length=200,
        blank=True,
        validators=[
            RegexValidator(
                r'^[a-zA-Z0-9\.\-\_]+$', message='Invalid hostname', code='invalid_hostname'),
        ],
        help_text="Host name of the network drive",
    )
    data_url = models.URLField(
        null=True,
        blank=True,
        help_text="URL of the data repository, if it is accessible via HTTP",
    )
    # Defaults to the TIME_ZONE of the Django settings (see default_timezone).
    timezone = models.CharField(
        max_length=64,
        blank=True,
        default=default_timezone,
        help_text=("Timezone of the server "
                   "(see https://en.wikipedia.org/wiki/List_of_tz_database_time_zones)"),
    )
    globus_path = models.CharField(
        max_length=1000,
        blank=True,
        help_text="absolute path to the repository on the server e.g. /mnt/something/",
    )
    globus_endpoint_id = models.UUIDField(
        null=True,
        blank=True,
        help_text="UUID of the globus endpoint",
    )
    globus_is_personal = models.BooleanField(
        null=True,
        blank=True,
        help_text=("whether the Globus endpoint is personal or not. "
                   "By default, Globus cannot transfer a file between two personal endpoints."),
    )

    class Meta:
        verbose_name_plural = "data repositories"
        ordering = ('name',)

    def __str__(self):
        return f"<DataRepository '{self.name}'>"
94

95

96
# Datasets
97
# ------------------------------------------------------------------------------------------------
98

99
class DataFormat(BaseModel):
    """
    A file format in which a dataset can be stored.

    Each record holds a short name, a human-readable description, the file extension
    (which must start with a dot) and the names of the MATLAB and Python loader functions.
    """

    objects = NameManager()

    name = models.CharField(
        unique=True,
        max_length=255,
        help_text="short identifying name, e.g. 'npy'",
    )

    description = models.CharField(
        blank=True,
        max_length=255,
        help_text=("Human-readable description of the file format e.g. 'npy-formatted square "
                   "numerical array'."),
    )

    # The validator accepts a leading dot followed by one or more non-dot characters.
    file_extension = models.CharField(
        max_length=255,
        validators=[
            RegexValidator(
                r'^\.[^\.]+$',
                message='Invalid file extension, should start with a dot',
                code='invalid_file_extension',
            ),
        ],
        help_text="file extension, starting with a dot.",
    )

    matlab_loader_function = models.CharField(
        blank=True,
        max_length=255,
        help_text="Name of MATLAB loader function'.",
    )

    python_loader_function = models.CharField(
        blank=True,
        max_length=255,
        help_text="Name of Python loader function'.",
    )

    class Meta:
        verbose_name_plural = "data formats"
        ordering = ('name',)

    def __str__(self):
        return f"<DataFormat '{self.name}'>"
139

140

141
class DatasetType(BaseModel):
    """
    Describes the sort of information held by a Dataset or DataCollection, e.g.
    "Neuropixels raw data, formatted as flat binary file" or "eye camera movie as mj2".
    Normally each DatasetType corresponds to a specific 3-part alf name (for individual
    files) or to the first word of the alf names (for DataCollections).
    """

    objects = NameManager()

    name = models.CharField(
        unique=True,
        max_length=255,
        null=False,
        blank=True,
        help_text="Short identifying nickname, e.g. 'spikes.times'",
    )

    created_by = models.ForeignKey(
        settings.AUTH_USER_MODEL,
        on_delete=models.CASCADE,
        null=True,
        blank=True,
        related_name=_related_string('created_by'),
        help_text="The creator of the data.",
    )

    description = models.CharField(
        blank=True,
        max_length=1023,
        help_text=(
            "Human-readable description of data type. Should say what is in the file, and "
            "how to read it. For DataCollections, it should list what Datasets are expected in the "
            "the collection. E.g. 'Files related to spike events, including spikes.times.npy, "
            "spikes.clusters.npy, spikes.amps.npy, spikes.depths.npy"),
    )

    # Stored lower-cased: see save().
    filename_pattern = CharNullField(
        unique=True,
        max_length=255,
        null=True,
        blank=True,
        help_text=(
            "File name pattern (with wildcards) for this file in ALF naming convention. "
            "E.g. 'spikes.times.*' or '*.timestamps.*', or 'spikes.*.*' for a DataCollection, which "
            "would include all files starting with the word 'spikes'. NB: Case-insensitive matching."
            "If null, the name field must match the object.attribute part of the filename."),
    )

    class Meta:
        ordering = ('name',)

    def __str__(self):
        return f"<DatasetType {self.name}>"

    def save(self, *args, **kwargs):
        """Lower-case a non-empty filename_pattern, then save as usual."""
        pattern = self.filename_pattern
        if pattern:
            self.filename_pattern = pattern.lower()
        return super().save(*args, **kwargs)
187

188

189
class BaseExperimentalData(BaseModel):
    """
    Abstract parent of all data acquisition models; it is never instantiated directly.

    Carries a link to a Session, which provides information about who did the experiment
    etc. Sessions can be organized hierarchically, and the link may point to any level of
    that hierarchy.
    """
    session = models.ForeignKey(
        Session,
        on_delete=models.CASCADE,
        null=True,
        blank=True,
        related_name=_related_string('session'),
        help_text="The Session to which this data belongs",
    )

    created_by = models.ForeignKey(
        settings.AUTH_USER_MODEL,
        on_delete=models.CASCADE,
        null=True,
        blank=True,
        related_name=_related_string('created_by'),
        help_text="The creator of the data.",
    )

    # Defaults to the time at which the instance is created.
    created_datetime = models.DateTimeField(
        default=timezone.now,
        null=True,
        blank=True,
        help_text="The creation datetime.",
    )

    generating_software = models.CharField(
        blank=True,
        max_length=255,
        help_text="e.g. 'ChoiceWorld 0.8.3'",
    )

    provenance_directory = models.ForeignKey(
        'data.Dataset',
        on_delete=models.CASCADE,
        null=True,
        blank=True,
        related_name=_related_string('provenance'),
        help_text="link to directory containing intermediate results",
    )

    class Meta:
        abstract = True
224

225

226
def default_dataset_type():
    """Return the primary key of the 'unknown' DatasetType, creating the record if needed."""
    dataset_type, _created = DatasetType.objects.get_or_create(name='unknown')
    return dataset_type.pk
228

229

230
def default_data_format():
    """Return the primary key of the 'unknown' DataFormat, creating the record if needed."""
    data_format, _created = DataFormat.objects.get_or_create(name='unknown')
    return data_format.pk
232

233

234
class Tag(BaseModel):
    """A named label for datasets, carrying ``protected`` and ``public`` flags."""
    objects = NameManager()

    name = models.CharField(unique=True, max_length=255, blank=True, help_text="Long name")
    description = models.CharField(blank=True, max_length=1023)
    # Datasets carrying a protected tag are guarded against deletion
    # (see DatasetQuerySet.delete and Dataset.delete).
    protected = models.BooleanField(default=False)
    public = models.BooleanField(default=False)
    hash = models.CharField(
        max_length=64,
        null=True,
        blank=True,
        help_text=("Hash of the data buffer, SHA-1 is 40 hex chars, while md5"
                   "is 32 hex chars"),
    )

    class Meta:
        ordering = ('name',)

    def __str__(self):
        return f"<Tag {self.name}>"
249

250

251
class Revision(BaseModel):
    """
    Information about a dataset revision. The name is validated against the ALF
    specification's revision pattern every time the record is saved.
    """
    objects = NameManager()

    # The whole name must match the 'revision' pattern of ALF_SPEC.
    name_validator = RegexValidator(
        f"^{ALF_SPEC['revision']}$",
        "Revisions must only contain letters, "
        "numbers, hyphens, underscores and forward slashes.")

    name = models.CharField(
        unique=True,
        max_length=255,
        null=False,
        blank=True,
        validators=[name_validator],
        help_text="Long name",
    )
    description = models.CharField(blank=True, max_length=1023)
    created_datetime = models.DateTimeField(
        default=timezone.now,
        null=True,
        blank=True,
        help_text="created date",
    )

    class Meta:
        ordering = ('name',)

    def __str__(self):
        return f"<Revision {self.name}>"

    def save(self, *args, **kwargs):
        """Run field validation before delegating to the parent save."""
        self.clean_fields()
        return super().save(*args, **kwargs)
274

275

276
class DatasetQuerySet(BaseQuerySet):
    """A Queryset that checks for protected datasets before deletion"""

    def delete(self, force=False):
        """Delete every dataset in the queryset unless some carry a protected tag.

        Parameters
        ----------
        force : bool
            If True, protected datasets are deleted as well and only a warning is logged.

        Returns
        -------
        The value returned by the parent queryset's ``delete()``.

        Raises
        ------
        django.db.models.ProtectedError
            If at least one dataset has a protected tag and `force` is False.
        """
        # The join on tags yields one row per protected tag; distinct() ensures that a
        # dataset carrying several protected tags is listed and counted only once.
        protected = self.filter(tags__protected=True).distinct()
        if protected.exists():
            listing = '\n'.join(map(str, protected.values_list('name', 'session_id')))
            if not force:
                logger.error(
                    'The following protected datasets cannot be deleted without force=True:\n%s',
                    listing)
                raise models.ProtectedError(
                    f'Failed to delete {protected.count()} dataset(s) due to protected tags',
                    protected)
            logger.warning('The following protected datasets will be deleted:\n%s', listing)
        # Hand the parent's result back to the caller instead of discarding it.
        return super().delete()
292

293

294
class DatasetManager(BaseManager):
    """Manager returning DatasetQuerySet instances with the type and format pre-joined."""

    def get_queryset(self):
        """Return a DatasetQuerySet that selects dataset_type and data_format together."""
        base = DatasetQuerySet(self.model, using=self._db)
        return base.select_related('dataset_type', 'data_format')
299

300

301
@modify_fields(name={
    'blank': False,
})
class Dataset(BaseExperimentalData):
    """
    A chunk of data that is stored outside the database, most often a rectangular binary array.
    There can be multiple FileRecords for one Dataset, which will be different physical files,
    all containing identical data, with the same MD5.

    Note that by convention, binary arrays are stored as .npy and text arrays as .tsv
    """
    objects = DatasetManager()

    # Generic foreign key to arbitrary model instances allows polymorphic relationships
    content_type = models.ForeignKey(ContentType, on_delete=models.CASCADE, null=True, blank=True)
    object_id = models.UUIDField(help_text="UUID of an object whose type matches content_type.",
                                 null=True, blank=True)
    content_object = GenericForeignKey()

    file_size = models.BigIntegerField(blank=True, null=True, help_text="Size in bytes")

    md5 = models.UUIDField(blank=True, null=True,
                           help_text="MD5 hash of the data buffer")

    hash = models.CharField(blank=True, null=False, max_length=64,
                            help_text=("Hash of the data buffer, SHA-1 is 40 hex chars, while md5"
                                       "is 32 hex chars"))

    # here we usually refer to version as an algorithm version such as ibllib-1.4.2
    version = models.CharField(blank=True, null=False, max_length=64,
                               help_text="version of the algorithm generating the file")

    # the collection comprises session sub-folders
    collection_validator = RegexValidator(f"^{ALF_SPEC['collection']}$",
                                          "Collections must only contain letters, "
                                          "numbers, hyphens, underscores and forward slashes.")
    collection = models.CharField(blank=True, null=False, max_length=255,
                                  help_text='file subcollection or subfolder',
                                  validators=[collection_validator])

    # Deleting the referenced type/format resets the key to the 'unknown' record
    dataset_type = models.ForeignKey(
        DatasetType, blank=False, null=False, on_delete=models.SET_DEFAULT,
        default=default_dataset_type)

    data_format = models.ForeignKey(
        DataFormat, blank=False, null=False, on_delete=models.SET_DEFAULT,
        default=default_data_format)

    revision = models.ForeignKey(
        Revision, blank=True, null=True, on_delete=models.SET_NULL)

    tags = models.ManyToManyField('data.Tag', blank=True, related_name='datasets')

    auto_datetime = models.DateTimeField(auto_now=True, blank=True, null=True,
                                         verbose_name='last updated')

    default_dataset = models.BooleanField(default=True,
                                          help_text="Whether this dataset is the default "
                                                    "latest revision")

    QC_CHOICES = [(e.value, e.name) for e in QC]
    qc = models.IntegerField(default=QC.NOT_SET, choices=QC_CHOICES,
                             help_text=' / '.join([str(q[0]) + ': ' + q[1] for q in QC_CHOICES]))

    @property
    def is_online(self):
        """bool: True if a file record on a non-personal Globus repository is marked existing."""
        # A single EXISTS query replaces the former count() + values_list() pair
        return self.file_records.filter(
            data_repository__globus_is_personal=False, exists=True).exists()

    @property
    def is_protected(self):
        """bool: True if at least one tag of this dataset is protected."""
        return self.tags.filter(protected=True).exists()

    @property
    def is_public(self):
        """bool: True if at least one tag of this dataset is public."""
        return self.tags.filter(public=True).exists()

    @property
    def data_url(self):
        """URL of an existing file record whose repository has a data URL, else None."""
        records = self.file_records.filter(data_repository__data_url__isnull=False, exists=True)
        # returns preferentially globus non-personal endpoint
        order_keys = ('data_repository__globus_is_personal', '-data_repository__name')
        record = records.order_by(*order_keys).first()
        return None if record is None else record.data_url

    def __str__(self):
        # created_datetime is nullable: fall back to a placeholder instead of raising
        created = self.created_datetime
        date = created.strftime('%d/%m/%Y at %H:%M') if created else 'unknown date'
        return "<Dataset %s %s '%s' by %s on %s>" % (
            str(self.pk)[:8], getattr(self.dataset_type, 'name', ''),
            self.name, self.created_by, date)

    def save(self, *args, **kwargs):
        """Save the dataset and link it to the probe insertions / fields of view it belongs to.

        When a collection is set, the fields are validated first so that an invalid record
        is rejected before anything is written to the database. After saving, the second
        part of the collection (e.g. 'probe00' in 'alf/probe00') is matched against the names
        of the session's ProbeInsertion and FOV records, and the matches are assigned to
        `probe_insertion` and `field_of_view` (presumably reverse many-to-many relations
        declared in experiments.models; confirm there).

        Raises
        ------
        django.core.exceptions.ValidationError
            If a collection is set and a field fails validation.
        """
        if self.collection:
            self.clean_fields()  # Validate collection field
        super().save(*args, **kwargs)
        if not self.collection:
            return
        # Imported here rather than at module level (likely to avoid a circular import)
        from experiments.models import ProbeInsertion, FOV
        parts = self.collection.split('/')
        if len(parts) < 2:
            return
        name = parts[1]
        # Fetch the matching keys once; leave the relation untouched when nothing matches
        insertion_pks = list(
            ProbeInsertion.objects.filter(session=self.session, name=name)
            .values_list('pk', flat=True))
        if insertion_pks:
            self.probe_insertion.set(insertion_pks)
        fov_pks = list(
            FOV.objects.filter(session=self.session, name=name).values_list('pk', flat=True))
        if fov_pks:
            self.field_of_view.set(fov_pks)

    def delete(self, *args, force=False, **kwargs):
        """Delete this dataset unless it carries a protected tag.

        Parameters
        ----------
        force : bool
            If True, the dataset is deleted even when it has protected tags.

        Returns
        -------
        The value returned by the parent model's ``delete()``.

        Raises
        ------
        django.db.models.ProtectedError
            If the dataset has at least one protected tag and `force` is False.
        """
        # NB This is not called when bulk deleting or in cascading deletes
        if not force:
            tags = list(self.tags.filter(protected=True).values_list('name', flat=True))
            if tags:
                tags_str = '"' + '", "'.join(tags) + '"'
                logger.error('Dataset %s is protected by tag(s); use force=True.', self.name)
                raise models.ProtectedError(
                    f'Failed to delete dataset {self.name} due to protected tag(s) {tags_str}',
                    self)
        return super().delete(*args, **kwargs)
419

420

421
# Files
422
# ------------------------------------------------------------------------------------------------
423
class FileRecordManager(models.Manager):
    """Manager whose querysets always join the data repository."""

    def get_queryset(self):
        """Return the default queryset with ``data_repository`` selected alongside."""
        return super().get_queryset().select_related('data_repository')
428

429

430
class FileRecord(BaseModel):
    """
    One physical file on disk or tape, normally identified by its path within a data
    repository. If required, more details can be in the JSON.
    """

    objects = FileRecordManager()

    dataset = models.ForeignKey(Dataset, on_delete=models.CASCADE, related_name='file_records')

    data_repository = models.ForeignKey('DataRepository', on_delete=models.CASCADE)

    relative_path = models.CharField(
        max_length=1000,
        validators=[
            RegexValidator(r'^[a-zA-Z0-9\_][^\\\:]+$', message='Invalid path', code='invalid_path'),
        ],
        help_text="path name within repository",
    )

    exists = models.BooleanField(
        default=False,
        help_text="Whether the file exists in the data repository",
    )

    class Meta:
        unique_together = (('data_repository', 'relative_path'),)

    @property
    def data_url(self):
        """Full URL of the file, or None when the repository has no data URL.

        The URL is the repository's data URL followed by the relative path with the
        dataset's primary key added by ``add_uuid_string``.
        """
        root = self.data_repository.data_url
        if root:
            from one.alf.path import add_uuid_string
            return root + add_uuid_string(self.relative_path, self.dataset.pk).as_posix()
        return None

    def save(self, *args, **kwargs):
        """Save the file record, then its dataset.

        Re-saving the dataset makes sure its auto datetime is updated whenever an
        associated file record is saved.
        """
        super().save(*args, **kwargs)
        self.dataset.save()

    def __str__(self):
        return "<FileRecord '%s' by %s>" % (self.relative_path, self.dataset.created_by)
473

474

475
# Download table
476
# ------------------------------------------------------------------------------------------------
477

478
class Download(BaseModel):
    """Per-user download counter for a dataset, unique for each (user, dataset) pair."""
    user = models.ForeignKey(settings.AUTH_USER_MODEL, on_delete=models.CASCADE)
    dataset = models.ForeignKey(Dataset, on_delete=models.CASCADE)
    # Set once on creation / refreshed on every save, respectively
    first_download = models.DateTimeField(auto_now_add=True)
    last_download = models.DateTimeField(auto_now=True)
    count = models.IntegerField(default=0)
    projects = models.ManyToManyField('subjects.Project', blank=True)

    class Meta:
        unique_together = (('user', 'dataset'),)

    def increment(self):
        """Add one to the download count and save the record."""
        self.count = self.count + 1
        self.save()

    def __str__(self):
        type_name = self.dataset.dataset_type.name
        username = self.user.username
        return '<Download of %s dataset by %s (%d)>' % (type_name, username, self.count)
496

497

498
def new_download(dataset, user, projects=()):
    """Record one download of *dataset* by *user* and return the Download record.

    The record for the (user, dataset) pair is fetched or created, the given projects are
    added to it, and its counter is incremented (which also saves it).
    """
    download = Download.objects.get_or_create(user=user, dataset=dataset)[0]
    download.projects.add(*projects)
    download.increment()
    return download
503

504

505
class DataNotice(BaseModel):
    """A notice describing data quality issues that may affect one or more datasets."""

    class IMPORTANCE(IntEnum):
        """Importance levels; a larger value is more important."""
        CRITICAL = 50
        MAJOR = 40
        MINOR = 30
        INSIGNIFICANT = 20

    # Markdown source; rendered to HTML by description_html()
    description = models.TextField(blank=True)
    importance = models.IntegerField(
        choices=[(level.value, level.name) for level in IMPORTANCE],
        default=IMPORTANCE.INSIGNIFICANT,
        help_text=' / '.join([f'{level.value}: {level.name}' for level in IMPORTANCE]),
    )

    datasets = models.ManyToManyField(Dataset, related_name='data_notices', blank=True)
    created_by = models.ForeignKey(
        settings.AUTH_USER_MODEL,
        on_delete=models.SET_NULL,
        related_name='data_notices',
        null=True,
        blank=True,
    )
    created_datetime = models.DateTimeField(auto_now_add=True)
    version_affected = models.CharField(blank=True, max_length=64)
    affected_date_start = models.DateField(blank=True, null=True)
    affected_date_end = models.DateField(blank=True, null=True)

    def description_html(self):
        """Render the markdown description to HTML; an empty description gives ''.

        NOTE(review): no sanitising is applied in this method, so any raw HTML present in
        the description is presumably passed through by markdown - confirm before
        treating the result as safe.
        """
        source = self.description
        if source:
            return _markdown.markdown(source, extensions=['extra'])
        return ''

    def importance_panel_class(self):
        """Bootstrap panel class matching the importance ('default' if unrecognised)."""
        levels = self.IMPORTANCE
        panel_by_level = {
            levels.CRITICAL: 'danger',
            levels.MAJOR: 'warning',
            levels.MINOR: 'info',
            levels.INSIGNIFICANT: 'default',
        }
        return panel_by_level.get(self.importance, 'default')

    def importance_badge_color(self):
        """Hex colour of the importance badge ('#6c757d' if unrecognised)."""
        levels = self.IMPORTANCE
        color_by_level = {
            levels.CRITICAL: '#c9302c',
            levels.MAJOR: '#ec971f',
            levels.MINOR: '#31b0d5',
            levels.INSIGNIFICANT: '#6c757d',
        }
        return color_by_level.get(self.importance, '#6c757d')

    class Meta:
        # Most important first, then most recent, then by name
        ordering = ('-importance', '-created_datetime', 'name')

    def __str__(self):
        return self.name or str(self.id)
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2026 Coveralls, Inc