• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

mindersec / minder / 25537948165

08 May 2026 05:06AM UTC coverage: 60.464% (+0.02%) from 60.442%
25537948165

Pull #6179

github

web-flow
Merge ca585a055 into 884258a7b
Pull Request #6179: build(deps): bump github.com/google/osv-scalibr from 0.3.4 to 0.4.5

63 of 85 new or added lines in 2 files covered. (74.12%)

1 existing line in 1 file now uncovered.

20444 of 33812 relevant lines covered (60.46%)

39.06 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

50.97
/internal/engine/ingester/diff/diff.go
1
// SPDX-FileCopyrightText: Copyright 2023 The Minder Authors
2
// SPDX-License-Identifier: Apache-2.0
3

4
// Package diff provides the diff rule data ingest engine
5
package diff
6

7
import (
8
        "bufio"
9
        "cmp"
10
        "context"
11
        "fmt"
12
        "math"
13
        "os"
14
        "path/filepath"
15
        "regexp"
16
        "slices"
17
        "strconv"
18
        "strings"
19

20
        "github.com/go-git/go-billy/v5"
21
        "github.com/go-git/go-billy/v5/helper/iofs"
22
        scalibr "github.com/google/osv-scalibr"
23
        scalibr_cfg "github.com/google/osv-scalibr/binary/proto/config_go_proto"
24
        "github.com/google/osv-scalibr/extractor"
25
        scalibr_fs "github.com/google/osv-scalibr/fs"
26
        scalibr_plugin "github.com/google/osv-scalibr/plugin"
27
        "github.com/google/osv-scalibr/plugin/list"
28
        "github.com/google/osv-scalibr/purl"
29
        "github.com/rs/zerolog"
30
        "google.golang.org/protobuf/reflect/protoreflect"
31

32
        pbinternal "github.com/mindersec/minder/internal/proto"
33
        pb "github.com/mindersec/minder/pkg/api/protobuf/go/minder/v1"
34
        "github.com/mindersec/minder/pkg/engine/v1/interfaces"
35
        "github.com/mindersec/minder/pkg/entities/v1/checkpoints"
36
)
37

38
// DiffRuleDataIngestType is the type of the diff rule data ingest engine
const DiffRuleDataIngestType = "diff"

const (
	// prFilesPerPage is the page size passed to ListFiles when paging
	// through the files of a pull request.
	prFilesPerPage = 30
	// wildcard is the literal "*" string.
	wildcard = "*"
)
44

45
// Diff is the diff rule data ingest engine
46
type Diff struct {
47
        cli interfaces.GitHubListAndClone
48
        cfg *pb.DiffType
49
}
50

51
// NewDiffIngester creates a new diff ingester
52
func NewDiffIngester(
53
        cfg *pb.DiffType,
54
        cli interfaces.GitHubListAndClone,
55
) (*Diff, error) {
4✔
56
        if cfg == nil {
4✔
57
                cfg = &pb.DiffType{}
×
58
        }
×
59

60
        if cli == nil {
4✔
61
                return nil, fmt.Errorf("provider is nil")
×
62
        }
×
63

64
        return &Diff{
4✔
65
                cfg: cfg,
4✔
66
                cli: cli,
4✔
67
        }, nil
4✔
68
}
69

70
// GetType returns the type of the diff rule data ingest engine
71
func (*Diff) GetType() string {
3✔
72
        return DiffRuleDataIngestType
3✔
73
}
3✔
74

75
// GetConfig returns the config for the diff rule data ingest engine
76
func (di *Diff) GetConfig() protoreflect.ProtoMessage {
6✔
77
        return di.cfg
6✔
78
}
6✔
79

80
// Ingest ingests a diff from a pull request in accordance with its type
81
func (di *Diff) Ingest(
82
        ctx context.Context,
83
        ent protoreflect.ProtoMessage,
84
        _ map[string]any,
85
) (*interfaces.Ingested, error) {
4✔
86
        pr, ok := ent.(*pbinternal.PullRequest)
4✔
87
        if !ok {
4✔
88
                return nil, fmt.Errorf("entity is not a pull request")
×
89
        }
×
90

91
        // The GitHub Go API takes an int32, but our proto stores an int64; make sure we don't overflow
92
        if pr.Number > math.MaxInt {
4✔
93
                return nil, fmt.Errorf("pr number is too large")
×
94
        }
×
95
        prNumber := int(pr.Number)
4✔
96

4✔
97
        switch di.cfg.GetType() {
4✔
98
        case "", pb.DiffTypeDep:
×
99
                return di.getDepTypeDiff(ctx, prNumber, pr)
×
100

101
        case pb.DiffTypeNewDeps:
4✔
102
                // TODO: once we've tested some, convert DiffTypeDep to use this algorithm.
4✔
103
                return di.getScalibrTypeDiff(ctx, prNumber, pr)
4✔
104

105
        case pb.DiffTypeFull:
×
106
                return di.getFullTypeDiff(ctx, prNumber, pr)
×
107

108
        default:
×
109
                return nil, fmt.Errorf("unknown diff type")
×
110
        }
111
}
112

113
func (di *Diff) getDepTypeDiff(ctx context.Context, prNumber int, pr *pbinternal.PullRequest) (*interfaces.Ingested, error) {
×
114
        deps := pbinternal.PrDependencies{Pr: pr}
×
115
        page := 0
×
116

×
117
        for {
×
118
                prFiles, resp, err := di.cli.ListFiles(ctx, pr.RepoOwner, pr.RepoName, prNumber, prFilesPerPage, page)
×
119
                if err != nil {
×
120
                        return nil, fmt.Errorf("error getting pull request files: %w", err)
×
121
                }
×
122

123
                for _, file := range prFiles {
×
124
                        fileDiffs, err := di.ingestFileForDepDiff(file.GetFilename(), file.GetPatch(), file.GetRawURL(), *zerolog.Ctx(ctx))
×
125
                        if err != nil {
×
126
                                return nil, fmt.Errorf("error ingesting file %s: %w", file.GetFilename(), err)
×
127
                        }
×
128
                        deps.Deps = append(deps.Deps, fileDiffs...)
×
129
                }
130

131
                if resp.NextPage == 0 {
×
132
                        break
×
133
                }
134

135
                page = resp.NextPage
×
136
        }
137

138
        return &interfaces.Ingested{Object: &deps, Checkpoint: checkpoints.NewCheckpointV1Now()}, nil
×
139
}
140

141
func (di *Diff) getFullTypeDiff(ctx context.Context, prNumber int, pr *pbinternal.PullRequest) (*interfaces.Ingested, error) {
×
142
        diff := &pbinternal.PrContents{Pr: pr}
×
143
        page := 0
×
144

×
145
        for {
×
146
                prFiles, resp, err := di.cli.ListFiles(ctx, pr.RepoOwner, pr.RepoName, prNumber, prFilesPerPage, page)
×
147
                if err != nil {
×
148
                        return nil, fmt.Errorf("error getting pull request files: %w", err)
×
149
                }
×
150

151
                for _, file := range prFiles {
×
152
                        fileDiffs, err := ingestFileForFullDiff(file.GetFilename(), file.GetPatch(), file.GetRawURL())
×
153
                        if err != nil {
×
154
                                return nil, fmt.Errorf("error ingesting file %s: %w", file.GetFilename(), err)
×
155
                        }
×
156
                        diff.Files = append(diff.Files, fileDiffs)
×
157
                }
158

159
                if resp.NextPage == 0 {
×
160
                        break
×
161
                }
162

163
                page = resp.NextPage
×
164
        }
165

166
        return &interfaces.Ingested{Object: diff, Checkpoint: checkpoints.NewCheckpointV1Now()}, nil
×
167
}
168

169
func (di *Diff) ingestFileForDepDiff(
170
        filename, patchContents, patchUrl string,
171
        logger zerolog.Logger,
172
) ([]*pbinternal.PrDependencies_ContextualDependency, error) {
×
173
        parser := di.getParserForFile(filename, logger)
×
174
        if parser == nil {
×
175
                return nil, nil
×
176
        }
×
177

178
        depBatch, err := parser(patchContents)
×
179
        if err != nil {
×
180
                return nil, fmt.Errorf("error parsing file %s: %w", filename, err)
×
181
        }
×
182

183
        batchCtxDeps := make([]*pbinternal.PrDependencies_ContextualDependency, 0, len(depBatch))
×
184
        for i := range depBatch {
×
185
                dep := depBatch[i]
×
186
                batchCtxDeps = append(batchCtxDeps, &pbinternal.PrDependencies_ContextualDependency{
×
187
                        Dep: dep,
×
188
                        File: &pbinternal.PrDependencies_ContextualDependency_FilePatch{
×
189
                                Name:     filename,
×
190
                                PatchUrl: patchUrl,
×
191
                        },
×
192
                })
×
193
        }
×
194

195
        return batchCtxDeps, nil
×
196
}
197

198
func (di *Diff) getScalibrTypeDiff(ctx context.Context, _ int, pr *pbinternal.PullRequest) (*interfaces.Ingested, error) {
4✔
199
        deps := pbinternal.PrDependencies{Pr: pr}
4✔
200

4✔
201
        // TODO: we should be able to just fetch the additional commits between base and target.
4✔
202
        // Our current Git abstraction isn't quite powerful enough, so we do two shallow clones.
4✔
203

4✔
204
        baseInventory, err := di.scalibrInventory(ctx, pr.BaseCloneUrl, pr.BaseRef)
4✔
205
        if err != nil {
4✔
206
                return nil, fmt.Errorf("failed to clone base from %s at %q: %w", pr.BaseCloneUrl, pr.BaseRef, err)
×
207
        }
×
208
        newInventory, err := di.scalibrInventory(ctx, pr.TargetCloneUrl, pr.TargetRef)
4✔
209
        if err != nil {
4✔
210
                return nil, fmt.Errorf("failed to clone fork from %s at %q: %w", pr.TargetCloneUrl, pr.TargetRef, err)
×
211
        }
×
212

213
        newDeps := setDifference(baseInventory, newInventory, inventorySorter)
4✔
214

4✔
215
        deps.Deps = make([]*pbinternal.PrDependencies_ContextualDependency, 0, len(newDeps))
4✔
216
        for _, inventory := range newDeps {
10✔
217
                for _, filename := range inventory.Locations {
12✔
218
                        deps.Deps = append(deps.Deps, &pbinternal.PrDependencies_ContextualDependency{
6✔
219
                                Dep: &pbinternal.Dependency{
6✔
220
                                        Ecosystem: inventoryToEcosystem(inventory),
6✔
221
                                        Name:      inventory.Name,
6✔
222
                                        Version:   inventory.Version,
6✔
223
                                },
6✔
224
                                File: &pbinternal.PrDependencies_ContextualDependency_FilePatch{
6✔
225
                                        Name:     filename,
6✔
226
                                        PatchUrl: "", // TODO: do we need this?
6✔
227
                                },
6✔
228
                        })
6✔
229
                }
6✔
230
        }
231

232
        return &interfaces.Ingested{Object: &deps, Checkpoint: checkpoints.NewCheckpointV1Now()}, nil
4✔
233
}
234

235
func inventorySorter(a *extractor.Package, b *extractor.Package) int {
22✔
236
        // If we compare by name and version first, we can avoid serializing Locations to strings
22✔
237
        res := cmp.Or(cmp.Compare(a.Name, b.Name), cmp.Compare(a.Version, b.Version))
22✔
238
        if res != 0 {
42✔
239
                return res
20✔
240
        }
20✔
241
        // TODO: Locations should probably be sorted, but scalibr is going to export a compare function.
242
        aLoc := fmt.Sprintf("%v", a.Locations)
2✔
243
        bLoc := fmt.Sprintf("%v", b.Locations)
2✔
244
        return cmp.Compare(aLoc, bLoc)
2✔
245
}
246

247
func (di *Diff) scalibrInventory(ctx context.Context, repoURL string, ref string) ([]*extractor.Package, error) {
8✔
248
        clone, err := di.cli.Clone(ctx, repoURL, ref)
8✔
249
        if err != nil {
8✔
250
                return nil, err
×
251
        }
×
252

253
        tree, err := clone.Worktree()
8✔
254
        if err != nil {
8✔
255
                return nil, err
×
256
        }
×
257
        return scanFs(ctx, tree.Filesystem, map[string]string{})
8✔
258
}
259

260
func scanFs(ctx context.Context, memFS billy.Filesystem, _ map[string]string) ([]*extractor.Package, error) {
8✔
261
        // have to down-cast here, because scalibr needs multiple io/fs types
8✔
262
        wrapped, ok := iofs.New(memFS).(scalibr_fs.FS)
8✔
263
        if !ok {
8✔
264
                return nil, fmt.Errorf("error converting filesystem to ReadDirFS")
×
265
        }
×
266

267
        desiredCaps := scalibr_plugin.Capabilities{
8✔
268
                OS:            scalibr_plugin.OSLinux,
8✔
269
                Network:       scalibr_plugin.NetworkOffline,
8✔
270
                DirectFS:      false,
8✔
271
                RunningSystem: false,
8✔
272
        }
8✔
273

8✔
274
        // TODO: it's unfortunate that scalibr spills files to disk.  File an upstream bug?
8✔
275
        // NOTE: since we require NetworkOffline, we may not actually download anything...
8✔
276
        tmpDir, err := os.MkdirTemp("", "minder-scalibr-*")
8✔
277
        if err != nil {
8✔
NEW
278
                return nil, fmt.Errorf("failed to create temporary scalibr directory: %w", err)
×
NEW
279
        }
×
280
        defer func() {
16✔
281
                _ = os.RemoveAll(tmpDir)
8✔
282
        }()
8✔
283
        cfg := scalibr_cfg.PluginConfig{
8✔
284
                MaxFileSizeBytes:  1024 * 1024,
8✔
285
                LocalRegistry:     tmpDir,
8✔
286
                DisableGoogleAuth: true,
8✔
287
        }
8✔
288

8✔
289
        scalibrFs := scalibr_fs.ScanRoot{FS: wrapped}
8✔
290
        plugins, err := list.FromCapabilities(&desiredCaps, &cfg)
8✔
291
        if err != nil {
8✔
NEW
292
                return nil, err
×
NEW
293
        }
×
294
        // unknownbinariesextr uses file extension to determine "binary-ness", and triggers on e.g. .py files
295
        skipPlugins := []string{"ffa/unknownbinariesextr"}
8✔
296
        plugins = slices.DeleteFunc(plugins, func(p scalibr_plugin.Plugin) bool {
2,032✔
297
                return slices.Contains(skipPlugins, p.Name())
2,024✔
298
        })
2,024✔
299
        scanConfig := scalibr.ScanConfig{
8✔
300
                ScanRoots:    []*scalibr_fs.ScanRoot{&scalibrFs},
8✔
301
                Plugins:      plugins,
8✔
302
                Capabilities: &desiredCaps,
8✔
303
        }
8✔
304

8✔
305
        scanner := scalibr.New()
8✔
306
        scanResults := scanner.Scan(ctx, &scanConfig)
8✔
307

8✔
308
        if scanResults == nil || scanResults.Status == nil {
8✔
309
                return nil, fmt.Errorf("error scanning files: no results")
×
310
        }
×
311
        if scanResults.Status.Status == scalibr_plugin.ScanStatusSucceeded {
8✔
NEW
312
                return scanResults.Inventory.Packages, nil
×
UNCOV
313
        }
×
314
        // Scalibr runs a lot of plugins and aggregates the result.  Some of these are picky, and
315
        // fail for random reasons.  Accept partial success, but log the failing plugins.
316
        if scanResults.Status.Status == scalibr_plugin.ScanStatusPartiallySucceeded {
16✔
317
                known_bad := []string{
8✔
318
                        "endoflife/linuxdistro", // https://github.com/google/osv-scalibr/pull/2068
8✔
319
                        "rust/cargoauditable",   // https://github.com/go-git/go-billy/pull/208
8✔
320
                }
8✔
321
                for _, ps := range scanResults.PluginStatus {
896✔
322
                        if ps.Status.Status != scalibr_plugin.ScanStatusSucceeded {
897✔
323
                                if !slices.Contains(known_bad, ps.Name) {
9✔
NEW
324
                                        zerolog.Ctx(ctx).Warn().Str("plugin", ps.Name).Str("status", ps.Status.FailureReason).
×
NEW
325
                                                Msg("Scalibr plugin failed")
×
NEW
326
                                }
×
327
                        }
328
                }
329
                return scanResults.Inventory.Packages, nil
8✔
330
        }
NEW
331
        return nil, fmt.Errorf("error scanning files: %s", scanResults.Status)
×
332
}
333

334
func inventoryToEcosystem(inventory *extractor.Package) pbinternal.DepEcosystem {
6✔
335
        if inventory == nil {
6✔
336
                zerolog.Ctx(context.Background()).Warn().Msg("nil ecosystem scanning diffs")
×
337
                return pbinternal.DepEcosystem_DEP_ECOSYSTEM_UNSPECIFIED
×
338
        }
×
339

340
        package_url := inventory.PURL()
6✔
341
        if package_url == nil {
6✔
NEW
342
                package_url = &purl.PackageURL{}
×
NEW
343
        }
×
344

345
        // Sometimes Scalibr uses the string "PyPI" instead of "pypi" when reporting the ecosystem.
346
        switch package_url.Type {
6✔
347
        // N.B. using an enum here abitrarily restricts our ability to add new
348
        // ecosystems without a core minder change.  Switching to strings ala
349
        // purl might be an improvement.
350
        case purl.TypePyPi:
2✔
351
                return pbinternal.DepEcosystem_DEP_ECOSYSTEM_PYPI
2✔
352
        case purl.TypeNPM:
2✔
353
                return pbinternal.DepEcosystem_DEP_ECOSYSTEM_NPM
2✔
354
        case purl.TypeGolang:
2✔
355
                return pbinternal.DepEcosystem_DEP_ECOSYSTEM_GO
2✔
356
        default:
×
357
                return pbinternal.DepEcosystem_DEP_ECOSYSTEM_UNSPECIFIED
×
358
        }
359
}
360

361
// ingestFileForFullDiff processes a given file's patch from a pull request.
362
// It scans through the patch line by line, identifying the changes made.
363
// If it's a hunk header, it extracts the starting line number. If it's an addition, it records the line content and its number.
364
// The function also increments the line number for context lines (lines that provide context but haven't been modified).
365
func ingestFileForFullDiff(filename, patch, patchUrl string) (*pbinternal.PrContents_File, error) {
×
366
        var result []*pbinternal.PrContents_File_Line
×
367

×
368
        scanner := bufio.NewScanner(strings.NewReader(patch))
×
369
        regex := regexp.MustCompile(`@@ -\d+,\d+ \+(\d+),\d+ @@`)
×
370

×
371
        var currentLineNumber int64
×
372
        var err error
×
373
        for scanner.Scan() {
×
374
                line := scanner.Text()
×
375

×
376
                if matches := regex.FindStringSubmatch(line); matches != nil {
×
377
                        currentLineNumber, err = strconv.ParseInt(matches[1], 10, 32)
×
378
                        if err != nil {
×
379
                                return nil, fmt.Errorf("error parsing line number from the hunk header: %w", err)
×
380
                        }
×
381
                } else if strings.HasPrefix(line, "+") {
×
382
                        result = append(result, &pbinternal.PrContents_File_Line{
×
383
                                Content: line[1:],
×
384
                                // see the use of strconv.ParseInt above: this is a safe downcast
×
385
                                // nolint: gosec
×
386
                                LineNumber: int32(currentLineNumber),
×
387
                        })
×
388

×
389
                        currentLineNumber++
×
390
                } else if !strings.HasPrefix(line, "-") {
×
391
                        currentLineNumber++
×
392
                }
×
393
        }
394

395
        if err := scanner.Err(); err != nil {
×
396
                return nil, fmt.Errorf("error reading patch: %w", err)
×
397
        }
×
398

399
        return &pbinternal.PrContents_File{
×
400
                Name:         filename,
×
401
                FilePatchUrl: patchUrl,
×
402
                PatchLines:   result,
×
403
        }, nil
×
404
}
405

406
func (di *Diff) getEcosystemForFile(filename string) DependencyEcosystem {
5✔
407
        lastComponent := filepath.Base(filename)
5✔
408

5✔
409
        for _, ecoMapping := range di.cfg.Ecosystems {
10✔
410
                if match, _ := filepath.Match(ecoMapping.Depfile, lastComponent); match {
8✔
411
                        return DependencyEcosystem(ecoMapping.Name)
3✔
412
                }
3✔
413
        }
414
        return DepEcosystemNone
2✔
415
}
416

417
func (di *Diff) getParserForFile(filename string, logger zerolog.Logger) ecosystemParser {
×
418
        eco := di.getEcosystemForFile(filename)
×
419
        if eco == DepEcosystemNone {
×
420
                logger.Debug().
×
421
                        Str("filename", filename).
×
422
                        Msg("No ecosystem found, skipping")
×
423
                return nil
×
424
        }
×
425

426
        logger.Debug().
×
427
                Str("filename", filename).
×
428
                Str("package-ecosystem", string(eco)).
×
429
                Msg("matched ecosystem")
×
430

×
431
        return newEcosystemParser(eco)
×
432
}
433

434
// setDifference computes the set of elements in updated which are not in base.
// Elements are matched using sorter, which must return a negative, zero or
// positive value in the manner of cmp.Compare.  The result is always non-nil
// and is in sorted order.
//
// Note: this function may permute (sort) the order of elements in base and updated.
func setDifference[Slice ~[]E, E any](base Slice, updated Slice, sorter func(a, b E) int) Slice {
	slices.SortFunc(base, sorter)
	slices.SortFunc(updated, sorter)

	added := make(Slice, 0)

	// Walk both sorted slices in lockstep.
	i, j := 0, 0
	for i < len(base) && j < len(updated) {
		switch order := sorter(base[i], updated[j]); {
		case order < 0:
			// base[i] is absent from updated; it is not part of the result.
			i++
		case order > 0:
			// updated[j] sorts before everything left in base, so it is new.
			added = append(added, updated[j])
			j++
		default:
			// Present on both sides.
			i++
			j++
		}
	}

	// Whatever is left in updated has no counterpart in base.
	if j < len(updated) {
		added = append(added, updated[j:]...)
	}

	// TODO: add metric for number of deps scanned vs total deps

	return added
}
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2026 Coveralls, Inc