• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

mindersec / minder / 24718498200

21 Apr 2026 10:54AM UTC coverage: 60.096%. First build
24718498200

Pull #6372

github

web-flow
Merge f94afb8fc into fef41377a
Pull Request #6372: feat(engine): add commit iteration support for pull request evaluation

9 of 70 new or added lines in 2 files covered. (12.86%)

20233 of 33668 relevant lines covered (60.1%)

35.92 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

44.65
/internal/engine/ingester/diff/diff.go
1
// SPDX-FileCopyrightText: Copyright 2023 The Minder Authors
2
// SPDX-License-Identifier: Apache-2.0
3

4
// Package diff provides the diff rule data ingest engine
5
package diff
6

7
import (
8
        "bufio"
9
        "cmp"
10
        "context"
11
        "fmt"
12
        "math"
13
        "path/filepath"
14
        "regexp"
15
        "slices"
16
        "strconv"
17
        "strings"
18

19
        "github.com/go-git/go-billy/v5"
20
        "github.com/go-git/go-billy/v5/helper/iofs"
21
        "github.com/google/go-github/v63/github"
22
        scalibr "github.com/google/osv-scalibr"
23
        "github.com/google/osv-scalibr/extractor"
24
        scalibr_fs "github.com/google/osv-scalibr/fs"
25
        scalibr_plugin "github.com/google/osv-scalibr/plugin"
26
        "github.com/google/osv-scalibr/plugin/list"
27
        "github.com/google/osv-scalibr/purl"
28
        "github.com/rs/zerolog"
29
        "google.golang.org/protobuf/reflect/protoreflect"
30

31
        pbinternal "github.com/mindersec/minder/internal/proto"
32
        pb "github.com/mindersec/minder/pkg/api/protobuf/go/minder/v1"
33
        "github.com/mindersec/minder/pkg/engine/v1/interfaces"
34
        "github.com/mindersec/minder/pkg/entities/v1/checkpoints"
35
)
36

37
const (
38
        // DiffRuleDataIngestType is the type of the diff rule data ingest engine
39
        DiffRuleDataIngestType = "diff"
40
        prFilesPerPage         = 30
41
        wildcard               = "*"
42
)
43

44
// Diff is the diff rule data ingest engine
45
type Diff struct {
46
        cli interfaces.GitHubListAndClone
47
        cfg *pb.DiffType
48
}
49

50
// NewDiffIngester creates a new diff ingester
51
func NewDiffIngester(
52
        cfg *pb.DiffType,
53
        cli interfaces.GitHubListAndClone,
54
) (*Diff, error) {
4✔
55
        if cfg == nil {
4✔
56
                cfg = &pb.DiffType{}
×
57
        }
×
58

59
        if cli == nil {
4✔
60
                return nil, fmt.Errorf("provider is nil")
×
61
        }
×
62

63
        return &Diff{
4✔
64
                cfg: cfg,
4✔
65
                cli: cli,
4✔
66
        }, nil
4✔
67
}
68

69
// GetType returns the type of the diff rule data ingest engine
70
func (*Diff) GetType() string {
3✔
71
        return DiffRuleDataIngestType
3✔
72
}
3✔
73

74
// GetConfig returns the config for the diff rule data ingest engine
75
func (di *Diff) GetConfig() protoreflect.ProtoMessage {
6✔
76
        return di.cfg
6✔
77
}
6✔
78

79
// Ingest ingests a diff from a pull request in accordance with its type
80
func (di *Diff) Ingest(
81
        ctx context.Context,
82
        ent protoreflect.ProtoMessage,
83
        _ map[string]any,
84
) (*interfaces.Ingested, error) {
4✔
85
        pr, ok := ent.(*pbinternal.PullRequest)
4✔
86
        if !ok {
4✔
87
                return nil, fmt.Errorf("entity is not a pull request")
×
88
        }
×
89

90
        // The GitHub Go API takes an int32, but our proto stores an int64; make sure we don't overflow
91
        if pr.Number > math.MaxInt {
4✔
92
                return nil, fmt.Errorf("pr number is too large")
×
93
        }
×
94
        prNumber := int(pr.Number)
4✔
95

4✔
96
        // Best-effort experiment: if the underlying provider supports listing PR commits,
4✔
97
        // iterate over them here so we can observe commit-level data during evaluation. This
4✔
98
        // is intentionally non-blocking for providers that do not implement the method.
4✔
99
        if err := di.logPullRequestCommits(ctx, pr, prNumber); err != nil {
4✔
NEW
100
                zerolog.Ctx(ctx).Debug().Err(err).Msg("failed to list pull request commits")
×
NEW
101
        }
×
102

103
        switch di.cfg.GetType() {
4✔
104
        case "", pb.DiffTypeDep:
×
105
                return di.getDepTypeDiff(ctx, prNumber, pr)
×
106

107
        case pb.DiffTypeNewDeps:
4✔
108
                // TODO: once we've tested some, convert DiffTypeDep to use this algorithm.
4✔
109
                return di.getScalibrTypeDiff(ctx, prNumber, pr)
4✔
110

111
        case pb.DiffTypeFull:
×
112
                return di.getFullTypeDiff(ctx, prNumber, pr)
×
113

114
        default:
×
115
                return nil, fmt.Errorf("unknown diff type")
×
116
        }
117
}
118

119
// commitLister is an internal, optional trait that providers can implement to
120
// expose pull request commits for a given PR.
121
type commitLister interface {
122
        ListPullRequestCommits(
123
                ctx context.Context,
124
                owner string,
125
                repo string,
126
                prNumber int,
127
                perPage int,
128
                pageNumber int,
129
        ) ([]*github.RepositoryCommit, *github.Response, error)
130
}
131

132
// logPullRequestCommits iterates over all commits in the given pull request,
133
// logging their SHAs and messages when the underlying provider implements the
134
// commitLister trait. This is a no-op for providers that do not support it.
135
func (di *Diff) logPullRequestCommits(
136
        ctx context.Context,
137
        pr *pbinternal.PullRequest,
138
        prNumber int,
139
) error {
4✔
140
        cli, ok := di.cli.(commitLister)
4✔
141
        if !ok {
8✔
142
                return nil
4✔
143
        }
4✔
144

NEW
145
        logger := zerolog.Ctx(ctx)
×
NEW
146
        page := 0
×
NEW
147
        const prCommitsPerPage = 50
×
NEW
148

×
NEW
149
        for {
×
NEW
150
                commits, resp, err := cli.ListPullRequestCommits(
×
NEW
151
                        ctx,
×
NEW
152
                        pr.RepoOwner,
×
NEW
153
                        pr.RepoName,
×
NEW
154
                        prNumber,
×
NEW
155
                        prCommitsPerPage,
×
NEW
156
                        page,
×
NEW
157
                )
×
NEW
158
                if err != nil {
×
NEW
159
                        return fmt.Errorf("error listing pull request commits: %w", err)
×
NEW
160
                }
×
161

NEW
162
                for _, c := range commits {
×
NEW
163
                        sha := c.GetSHA()
×
NEW
164
                        msg := c.GetCommit().GetMessage()
×
NEW
165
                        firstLine := strings.Split(msg, "\n")[0]
×
NEW
166

×
NEW
167
                        logger.Debug().
×
NEW
168
                                Str("pr_repo", fmt.Sprintf("%s/%s", pr.RepoOwner, pr.RepoName)).
×
NEW
169
                                Int64("pr_number", pr.Number).
×
NEW
170
                                Str("commit_sha", sha).
×
NEW
171
                                Str("commit_msg", firstLine).
×
NEW
172
                                Msg("pull request commit")
×
NEW
173
                }
×
174

NEW
175
                if resp == nil || resp.NextPage == 0 {
×
NEW
176
                        break
×
177
                }
178

NEW
179
                page = resp.NextPage
×
180
        }
181

NEW
182
        return nil
×
183
}
184

185
func (di *Diff) getDepTypeDiff(ctx context.Context, prNumber int, pr *pbinternal.PullRequest) (*interfaces.Ingested, error) {
×
186
        deps := pbinternal.PrDependencies{Pr: pr}
×
187
        page := 0
×
188

×
189
        for {
×
190
                prFiles, resp, err := di.cli.ListFiles(ctx, pr.RepoOwner, pr.RepoName, prNumber, prFilesPerPage, page)
×
191
                if err != nil {
×
192
                        return nil, fmt.Errorf("error getting pull request files: %w", err)
×
193
                }
×
194

195
                for _, file := range prFiles {
×
196
                        fileDiffs, err := di.ingestFileForDepDiff(file.GetFilename(), file.GetPatch(), file.GetRawURL(), *zerolog.Ctx(ctx))
×
197
                        if err != nil {
×
198
                                return nil, fmt.Errorf("error ingesting file %s: %w", file.GetFilename(), err)
×
199
                        }
×
200
                        deps.Deps = append(deps.Deps, fileDiffs...)
×
201
                }
202

203
                if resp.NextPage == 0 {
×
204
                        break
×
205
                }
206

207
                page = resp.NextPage
×
208
        }
209

210
        return &interfaces.Ingested{Object: &deps, Checkpoint: checkpoints.NewCheckpointV1Now()}, nil
×
211
}
212

213
func (di *Diff) getFullTypeDiff(ctx context.Context, prNumber int, pr *pbinternal.PullRequest) (*interfaces.Ingested, error) {
×
214
        diff := &pbinternal.PrContents{Pr: pr}
×
215
        page := 0
×
216

×
217
        for {
×
218
                prFiles, resp, err := di.cli.ListFiles(ctx, pr.RepoOwner, pr.RepoName, prNumber, prFilesPerPage, page)
×
219
                if err != nil {
×
220
                        return nil, fmt.Errorf("error getting pull request files: %w", err)
×
221
                }
×
222

223
                for _, file := range prFiles {
×
224
                        fileDiffs, err := ingestFileForFullDiff(file.GetFilename(), file.GetPatch(), file.GetRawURL())
×
225
                        if err != nil {
×
226
                                return nil, fmt.Errorf("error ingesting file %s: %w", file.GetFilename(), err)
×
227
                        }
×
228
                        diff.Files = append(diff.Files, fileDiffs)
×
229
                }
230

231
                if resp.NextPage == 0 {
×
232
                        break
×
233
                }
234

235
                page = resp.NextPage
×
236
        }
237

238
        return &interfaces.Ingested{Object: diff, Checkpoint: checkpoints.NewCheckpointV1Now()}, nil
×
239
}
240

241
func (di *Diff) ingestFileForDepDiff(
242
        filename, patchContents, patchUrl string,
243
        logger zerolog.Logger,
244
) ([]*pbinternal.PrDependencies_ContextualDependency, error) {
×
245
        parser := di.getParserForFile(filename, logger)
×
246
        if parser == nil {
×
247
                return nil, nil
×
248
        }
×
249

250
        depBatch, err := parser(patchContents)
×
251
        if err != nil {
×
252
                return nil, fmt.Errorf("error parsing file %s: %w", filename, err)
×
253
        }
×
254

255
        batchCtxDeps := make([]*pbinternal.PrDependencies_ContextualDependency, 0, len(depBatch))
×
256
        for i := range depBatch {
×
257
                dep := depBatch[i]
×
258
                batchCtxDeps = append(batchCtxDeps, &pbinternal.PrDependencies_ContextualDependency{
×
259
                        Dep: dep,
×
260
                        File: &pbinternal.PrDependencies_ContextualDependency_FilePatch{
×
261
                                Name:     filename,
×
262
                                PatchUrl: patchUrl,
×
263
                        },
×
264
                })
×
265
        }
×
266

267
        return batchCtxDeps, nil
×
268
}
269

270
func (di *Diff) getScalibrTypeDiff(ctx context.Context, _ int, pr *pbinternal.PullRequest) (*interfaces.Ingested, error) {
4✔
271
        deps := pbinternal.PrDependencies{Pr: pr}
4✔
272

4✔
273
        // TODO: we should be able to just fetch the additional commits between base and target.
4✔
274
        // Our current Git abstraction isn't quite powerful enough, so we do two shallow clones.
4✔
275

4✔
276
        baseInventory, err := di.scalibrInventory(ctx, pr.BaseCloneUrl, pr.BaseRef)
4✔
277
        if err != nil {
4✔
278
                return nil, fmt.Errorf("failed to clone base from %s at %q: %w", pr.BaseCloneUrl, pr.BaseRef, err)
×
279
        }
×
280
        newInventory, err := di.scalibrInventory(ctx, pr.TargetCloneUrl, pr.TargetRef)
4✔
281
        if err != nil {
4✔
282
                return nil, fmt.Errorf("failed to clone fork from %s at %q: %w", pr.TargetCloneUrl, pr.TargetRef, err)
×
283
        }
×
284

285
        newDeps := setDifference(baseInventory, newInventory, inventorySorter)
4✔
286

4✔
287
        deps.Deps = make([]*pbinternal.PrDependencies_ContextualDependency, 0, len(newDeps))
4✔
288
        for _, inventory := range newDeps {
10✔
289
                for _, filename := range inventory.Locations {
12✔
290
                        deps.Deps = append(deps.Deps, &pbinternal.PrDependencies_ContextualDependency{
6✔
291
                                Dep: &pbinternal.Dependency{
6✔
292
                                        Ecosystem: inventoryToEcosystem(inventory),
6✔
293
                                        Name:      inventory.Name,
6✔
294
                                        Version:   inventory.Version,
6✔
295
                                },
6✔
296
                                File: &pbinternal.PrDependencies_ContextualDependency_FilePatch{
6✔
297
                                        Name:     filename,
6✔
298
                                        PatchUrl: "", // TODO: do we need this?
6✔
299
                                },
6✔
300
                        })
6✔
301
                }
6✔
302
        }
303

304
        return &interfaces.Ingested{Object: &deps, Checkpoint: checkpoints.NewCheckpointV1Now()}, nil
4✔
305
}
306

307
func inventorySorter(a *extractor.Package, b *extractor.Package) int {
22✔
308
        // If we compare by name and version first, we can avoid serializing Locations to strings
22✔
309
        res := cmp.Or(cmp.Compare(a.Name, b.Name), cmp.Compare(a.Version, b.Version))
22✔
310
        if res != 0 {
42✔
311
                return res
20✔
312
        }
20✔
313
        // TODO: Locations should probably be sorted, but scalibr is going to export a compare function.
314
        aLoc := fmt.Sprintf("%v", a.Locations)
2✔
315
        bLoc := fmt.Sprintf("%v", b.Locations)
2✔
316
        return cmp.Compare(aLoc, bLoc)
2✔
317
}
318

319
func (di *Diff) scalibrInventory(ctx context.Context, repoURL string, ref string) ([]*extractor.Package, error) {
8✔
320
        clone, err := di.cli.Clone(ctx, repoURL, ref)
8✔
321
        if err != nil {
8✔
322
                return nil, err
×
323
        }
×
324

325
        tree, err := clone.Worktree()
8✔
326
        if err != nil {
8✔
327
                return nil, err
×
328
        }
×
329
        return scanFs(ctx, tree.Filesystem, map[string]string{})
8✔
330
}
331

332
func scanFs(ctx context.Context, memFS billy.Filesystem, _ map[string]string) ([]*extractor.Package, error) {
8✔
333
        // have to down-cast here, because scalibr needs multiple io/fs types
8✔
334
        wrapped, ok := iofs.New(memFS).(scalibr_fs.FS)
8✔
335
        if !ok {
8✔
336
                return nil, fmt.Errorf("error converting filesystem to ReadDirFS")
×
337
        }
×
338

339
        desiredCaps := scalibr_plugin.Capabilities{
8✔
340
                OS:            scalibr_plugin.OSLinux,
8✔
341
                Network:       scalibr_plugin.NetworkOffline,
8✔
342
                DirectFS:      false,
8✔
343
                RunningSystem: false,
8✔
344
        }
8✔
345

8✔
346
        scalibrFs := scalibr_fs.ScanRoot{FS: wrapped}
8✔
347
        scanConfig := scalibr.ScanConfig{
8✔
348
                ScanRoots: []*scalibr_fs.ScanRoot{&scalibrFs},
8✔
349
                // All includes Ruby, Dotnet which we're not ready to test yet, so use the more limited Default set.
8✔
350
                Plugins:      list.FromCapabilities(&desiredCaps),
8✔
351
                Capabilities: &desiredCaps,
8✔
352
        }
8✔
353

8✔
354
        scanner := scalibr.New()
8✔
355
        scanResults := scanner.Scan(ctx, &scanConfig)
8✔
356

8✔
357
        if scanResults == nil || scanResults.Status == nil {
8✔
358
                return nil, fmt.Errorf("error scanning files: no results")
×
359
        }
×
360
        if scanResults.Status.Status != scalibr_plugin.ScanStatusSucceeded {
8✔
361
                return nil, fmt.Errorf("error scanning files: %s", scanResults.Status)
×
362
        }
×
363

364
        return scanResults.Inventory.Packages, nil
8✔
365
}
366

367
func inventoryToEcosystem(inventory *extractor.Package) pbinternal.DepEcosystem {
6✔
368
        if inventory == nil {
6✔
369
                zerolog.Ctx(context.Background()).Warn().Msg("nil ecosystem scanning diffs")
×
370
                return pbinternal.DepEcosystem_DEP_ECOSYSTEM_UNSPECIFIED
×
371
        }
×
372

373
        package_url := inventory.PURL()
6✔
374

6✔
375
        // Sometimes Scalibr uses the string "PyPI" instead of "pypi" when reporting the ecosystem.
6✔
376
        switch package_url.Type {
6✔
377
        // N.B. using an enum here abitrarily restricts our ability to add new
378
        // ecosystems without a core minder change.  Switching to strings ala
379
        // purl might be an improvement.
380
        case purl.TypePyPi:
2✔
381
                return pbinternal.DepEcosystem_DEP_ECOSYSTEM_PYPI
2✔
382
        case purl.TypeNPM:
2✔
383
                return pbinternal.DepEcosystem_DEP_ECOSYSTEM_NPM
2✔
384
        case purl.TypeGolang:
2✔
385
                return pbinternal.DepEcosystem_DEP_ECOSYSTEM_GO
2✔
386
        default:
×
387
                return pbinternal.DepEcosystem_DEP_ECOSYSTEM_UNSPECIFIED
×
388
        }
389
}
390

391
// ingestFileForFullDiff processes a given file's patch from a pull request.
392
// It scans through the patch line by line, identifying the changes made.
393
// If it's a hunk header, it extracts the starting line number. If it's an addition, it records the line content and its number.
394
// The function also increments the line number for context lines (lines that provide context but haven't been modified).
395
func ingestFileForFullDiff(filename, patch, patchUrl string) (*pbinternal.PrContents_File, error) {
×
396
        var result []*pbinternal.PrContents_File_Line
×
397

×
398
        scanner := bufio.NewScanner(strings.NewReader(patch))
×
399
        regex := regexp.MustCompile(`@@ -\d+,\d+ \+(\d+),\d+ @@`)
×
400

×
401
        var currentLineNumber int64
×
402
        var err error
×
403
        for scanner.Scan() {
×
404
                line := scanner.Text()
×
405

×
406
                if matches := regex.FindStringSubmatch(line); matches != nil {
×
407
                        currentLineNumber, err = strconv.ParseInt(matches[1], 10, 32)
×
408
                        if err != nil {
×
409
                                return nil, fmt.Errorf("error parsing line number from the hunk header: %w", err)
×
410
                        }
×
411
                } else if strings.HasPrefix(line, "+") {
×
412
                        result = append(result, &pbinternal.PrContents_File_Line{
×
413
                                Content: line[1:],
×
414
                                // see the use of strconv.ParseInt above: this is a safe downcast
×
415
                                // nolint: gosec
×
416
                                LineNumber: int32(currentLineNumber),
×
417
                        })
×
418

×
419
                        currentLineNumber++
×
420
                } else if !strings.HasPrefix(line, "-") {
×
421
                        currentLineNumber++
×
422
                }
×
423
        }
424

425
        if err := scanner.Err(); err != nil {
×
426
                return nil, fmt.Errorf("error reading patch: %w", err)
×
427
        }
×
428

429
        return &pbinternal.PrContents_File{
×
430
                Name:         filename,
×
431
                FilePatchUrl: patchUrl,
×
432
                PatchLines:   result,
×
433
        }, nil
×
434
}
435

436
func (di *Diff) getEcosystemForFile(filename string) DependencyEcosystem {
5✔
437
        lastComponent := filepath.Base(filename)
5✔
438

5✔
439
        for _, ecoMapping := range di.cfg.Ecosystems {
10✔
440
                if match, _ := filepath.Match(ecoMapping.Depfile, lastComponent); match {
8✔
441
                        return DependencyEcosystem(ecoMapping.Name)
3✔
442
                }
3✔
443
        }
444
        return DepEcosystemNone
2✔
445
}
446

447
func (di *Diff) getParserForFile(filename string, logger zerolog.Logger) ecosystemParser {
×
448
        eco := di.getEcosystemForFile(filename)
×
449
        if eco == DepEcosystemNone {
×
450
                logger.Debug().
×
451
                        Str("filename", filename).
×
452
                        Msg("No ecosystem found, skipping")
×
453
                return nil
×
454
        }
×
455

456
        logger.Debug().
×
457
                Str("filename", filename).
×
458
                Str("package-ecosystem", string(eco)).
×
459
                Msg("matched ecosystem")
×
460

×
461
        return newEcosystemParser(eco)
×
462
}
463

464
// Computes the set of elements in updated which are not in base.
465
// Note: this function may permute (sort) the order of elements in base and updated.
466
func setDifference[Slice ~[]E, E any](base Slice, updated Slice, sorter func(a, b E) int) Slice {
8✔
467

8✔
468
        slices.SortFunc(base, sorter)
8✔
469
        slices.SortFunc(updated, sorter)
8✔
470

8✔
471
        baseIdx, newIdx := 0, 0
8✔
472
        ret := make(Slice, 0)
8✔
473
        for baseIdx < len(base) && newIdx < len(updated) {
25✔
474
                cmpResult := sorter(base[baseIdx], updated[newIdx])
17✔
475
                if cmpResult < 0 {
23✔
476
                        baseIdx++
6✔
477
                } else if cmpResult > 0 {
22✔
478
                        ret = append(ret, updated[newIdx])
5✔
479
                        newIdx++
5✔
480
                } else {
11✔
481
                        baseIdx++
6✔
482
                        newIdx++
6✔
483
                }
6✔
484
        }
485
        if newIdx < len(updated) {
11✔
486
                ret = append(ret, updated[newIdx:]...)
3✔
487
        }
3✔
488

489
        // TODO: add metric for number of deps scanned vs total deps
490

491
        return ret
8✔
492
}
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2026 Coveralls, Inc