• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

mindersec / minder / 24718479667

21 Apr 2026 10:54AM UTC coverage: 60.126%. First build
24718479667

Pull #6382

github

web-flow
Merge 87fe9b564 into fef41377a
Pull Request #6382: test(engine): add unit tests for commit metadata extraction

26 of 81 new or added lines in 3 files covered. (32.1%)

20250 of 33679 relevant lines covered (60.13%)

35.96 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

44.98
/internal/engine/ingester/diff/diff.go
1
// SPDX-FileCopyrightText: Copyright 2023 The Minder Authors
2
// SPDX-License-Identifier: Apache-2.0
3

4
// Package diff provides the diff rule data ingest engine
5
package diff
6

7
import (
8
        "bufio"
9
        "cmp"
10
        "context"
11
        "fmt"
12
        "math"
13
        "path/filepath"
14
        "regexp"
15
        "slices"
16
        "strconv"
17
        "strings"
18

19
        "github.com/go-git/go-billy/v5"
20
        "github.com/go-git/go-billy/v5/helper/iofs"
21
        "github.com/google/go-github/v63/github"
22
        scalibr "github.com/google/osv-scalibr"
23
        "github.com/google/osv-scalibr/extractor"
24
        scalibr_fs "github.com/google/osv-scalibr/fs"
25
        scalibr_plugin "github.com/google/osv-scalibr/plugin"
26
        "github.com/google/osv-scalibr/plugin/list"
27
        "github.com/google/osv-scalibr/purl"
28
        "github.com/rs/zerolog"
29
        "google.golang.org/protobuf/reflect/protoreflect"
30

31
        "github.com/mindersec/minder/internal/engine/commitinfo"
32
        pbinternal "github.com/mindersec/minder/internal/proto"
33
        pb "github.com/mindersec/minder/pkg/api/protobuf/go/minder/v1"
34
        "github.com/mindersec/minder/pkg/engine/v1/interfaces"
35
        "github.com/mindersec/minder/pkg/entities/v1/checkpoints"
36
)
37

38
const (
39
        // DiffRuleDataIngestType is the type of the diff rule data ingest engine
40
        DiffRuleDataIngestType = "diff"
41
        prFilesPerPage         = 30
42
        prCommitsPerPage       = 30
43
        wildcard               = "*"
44
)
45

46
type commitLister interface {
47
        ListPullRequestCommits(
48
                ctx context.Context,
49
                owner string,
50
                repo string,
51
                prNumber int,
52
                perPage int,
53
                pageNumber int,
54
        ) ([]*github.RepositoryCommit, *github.Response, error)
55
}
56

57
// Diff is the diff rule data ingest engine
58
type Diff struct {
59
        cli interfaces.GitHubListAndClone
60
        cfg *pb.DiffType
61
}
62

63
// NewDiffIngester creates a new diff ingester
64
func NewDiffIngester(
65
        cfg *pb.DiffType,
66
        cli interfaces.GitHubListAndClone,
67
) (*Diff, error) {
4✔
68
        if cfg == nil {
4✔
69
                cfg = &pb.DiffType{}
×
70
        }
×
71

72
        if cli == nil {
4✔
73
                return nil, fmt.Errorf("provider is nil")
×
74
        }
×
75

76
        return &Diff{
4✔
77
                cfg: cfg,
4✔
78
                cli: cli,
4✔
79
        }, nil
4✔
80
}
81

82
// GetType returns the type of the diff rule data ingest engine
83
func (*Diff) GetType() string {
3✔
84
        return DiffRuleDataIngestType
3✔
85
}
3✔
86

87
// GetConfig returns the config for the diff rule data ingest engine
88
func (di *Diff) GetConfig() protoreflect.ProtoMessage {
6✔
89
        return di.cfg
6✔
90
}
6✔
91

92
// Ingest ingests a diff from a pull request in accordance with its type
93
func (di *Diff) Ingest(
94
        ctx context.Context,
95
        ent protoreflect.ProtoMessage,
96
        _ map[string]any,
97
) (*interfaces.Ingested, error) {
4✔
98
        pr, ok := ent.(*pbinternal.PullRequest)
4✔
99
        if !ok {
4✔
100
                return nil, fmt.Errorf("entity is not a pull request")
×
101
        }
×
102

103
        // The GitHub Go API takes an int32, but our proto stores an int64; make sure we don't overflow
104
        if pr.Number > math.MaxInt {
4✔
105
                return nil, fmt.Errorf("pr number is too large")
×
106
        }
×
107
        prNumber := int(pr.Number)
4✔
108

4✔
109
        if err := di.logPullRequestCommits(ctx, pr, prNumber); err != nil {
4✔
NEW
110
                return nil, fmt.Errorf("error listing pull request commits: %w", err)
×
NEW
111
        }
×
112

113
        switch di.cfg.GetType() {
4✔
114
        case "", pb.DiffTypeDep:
×
115
                return di.getDepTypeDiff(ctx, prNumber, pr)
×
116

117
        case pb.DiffTypeNewDeps:
4✔
118
                // TODO: once we've tested some, convert DiffTypeDep to use this algorithm.
4✔
119
                return di.getScalibrTypeDiff(ctx, prNumber, pr)
4✔
120

121
        case pb.DiffTypeFull:
×
122
                return di.getFullTypeDiff(ctx, prNumber, pr)
×
123

124
        default:
×
125
                return nil, fmt.Errorf("unknown diff type")
×
126
        }
127
}
128

129
func (di *Diff) logPullRequestCommits(ctx context.Context, pr *pbinternal.PullRequest, prNumber int) error {
4✔
130
        lister, ok := di.cli.(commitLister)
4✔
131
        if !ok {
8✔
132
                return nil
4✔
133
        }
4✔
134

NEW
135
        logger := zerolog.Ctx(ctx)
×
NEW
136
        page := 0
×
NEW
137
        for {
×
NEW
138
                commits, resp, err := lister.ListPullRequestCommits(
×
NEW
139
                        ctx,
×
NEW
140
                        pr.RepoOwner,
×
NEW
141
                        pr.RepoName,
×
NEW
142
                        prNumber,
×
NEW
143
                        prCommitsPerPage,
×
NEW
144
                        page,
×
NEW
145
                )
×
NEW
146
                if err != nil {
×
NEW
147
                        return fmt.Errorf("failed to list pull request commits: %w", err)
×
NEW
148
                }
×
149

NEW
150
                for _, c := range commits {
×
NEW
151
                        info := commitinfo.Extract(c)
×
NEW
152
                        logger.Debug().
×
NEW
153
                                Str("commit_sha", info.SHA).
×
NEW
154
                                Str("commit_msg", info.Message).
×
NEW
155
                                Str("commit_author", info.Author).
×
NEW
156
                                Msg("pull request commit")
×
NEW
157
                }
×
158

NEW
159
                if resp == nil || resp.NextPage == 0 {
×
NEW
160
                        break
×
161
                }
162

NEW
163
                page = resp.NextPage
×
164
        }
165

NEW
166
        return nil
×
167
}
168

169
func (di *Diff) getDepTypeDiff(ctx context.Context, prNumber int, pr *pbinternal.PullRequest) (*interfaces.Ingested, error) {
×
170
        deps := pbinternal.PrDependencies{Pr: pr}
×
171
        page := 0
×
172

×
173
        for {
×
174
                prFiles, resp, err := di.cli.ListFiles(ctx, pr.RepoOwner, pr.RepoName, prNumber, prFilesPerPage, page)
×
175
                if err != nil {
×
176
                        return nil, fmt.Errorf("error getting pull request files: %w", err)
×
177
                }
×
178

179
                for _, file := range prFiles {
×
180
                        fileDiffs, err := di.ingestFileForDepDiff(file.GetFilename(), file.GetPatch(), file.GetRawURL(), *zerolog.Ctx(ctx))
×
181
                        if err != nil {
×
182
                                return nil, fmt.Errorf("error ingesting file %s: %w", file.GetFilename(), err)
×
183
                        }
×
184
                        deps.Deps = append(deps.Deps, fileDiffs...)
×
185
                }
186

187
                if resp.NextPage == 0 {
×
188
                        break
×
189
                }
190

191
                page = resp.NextPage
×
192
        }
193

194
        return &interfaces.Ingested{Object: &deps, Checkpoint: checkpoints.NewCheckpointV1Now()}, nil
×
195
}
196

197
func (di *Diff) getFullTypeDiff(ctx context.Context, prNumber int, pr *pbinternal.PullRequest) (*interfaces.Ingested, error) {
×
198
        diff := &pbinternal.PrContents{Pr: pr}
×
199
        page := 0
×
200

×
201
        for {
×
202
                prFiles, resp, err := di.cli.ListFiles(ctx, pr.RepoOwner, pr.RepoName, prNumber, prFilesPerPage, page)
×
203
                if err != nil {
×
204
                        return nil, fmt.Errorf("error getting pull request files: %w", err)
×
205
                }
×
206

207
                for _, file := range prFiles {
×
208
                        fileDiffs, err := ingestFileForFullDiff(file.GetFilename(), file.GetPatch(), file.GetRawURL())
×
209
                        if err != nil {
×
210
                                return nil, fmt.Errorf("error ingesting file %s: %w", file.GetFilename(), err)
×
211
                        }
×
212
                        diff.Files = append(diff.Files, fileDiffs)
×
213
                }
214

215
                if resp.NextPage == 0 {
×
216
                        break
×
217
                }
218

219
                page = resp.NextPage
×
220
        }
221

222
        return &interfaces.Ingested{Object: diff, Checkpoint: checkpoints.NewCheckpointV1Now()}, nil
×
223
}
224

225
func (di *Diff) ingestFileForDepDiff(
226
        filename, patchContents, patchUrl string,
227
        logger zerolog.Logger,
228
) ([]*pbinternal.PrDependencies_ContextualDependency, error) {
×
229
        parser := di.getParserForFile(filename, logger)
×
230
        if parser == nil {
×
231
                return nil, nil
×
232
        }
×
233

234
        depBatch, err := parser(patchContents)
×
235
        if err != nil {
×
236
                return nil, fmt.Errorf("error parsing file %s: %w", filename, err)
×
237
        }
×
238

239
        batchCtxDeps := make([]*pbinternal.PrDependencies_ContextualDependency, 0, len(depBatch))
×
240
        for i := range depBatch {
×
241
                dep := depBatch[i]
×
242
                batchCtxDeps = append(batchCtxDeps, &pbinternal.PrDependencies_ContextualDependency{
×
243
                        Dep: dep,
×
244
                        File: &pbinternal.PrDependencies_ContextualDependency_FilePatch{
×
245
                                Name:     filename,
×
246
                                PatchUrl: patchUrl,
×
247
                        },
×
248
                })
×
249
        }
×
250

251
        return batchCtxDeps, nil
×
252
}
253

254
func (di *Diff) getScalibrTypeDiff(ctx context.Context, _ int, pr *pbinternal.PullRequest) (*interfaces.Ingested, error) {
4✔
255
        deps := pbinternal.PrDependencies{Pr: pr}
4✔
256

4✔
257
        // TODO: we should be able to just fetch the additional commits between base and target.
4✔
258
        // Our current Git abstraction isn't quite powerful enough, so we do two shallow clones.
4✔
259

4✔
260
        baseInventory, err := di.scalibrInventory(ctx, pr.BaseCloneUrl, pr.BaseRef)
4✔
261
        if err != nil {
4✔
262
                return nil, fmt.Errorf("failed to clone base from %s at %q: %w", pr.BaseCloneUrl, pr.BaseRef, err)
×
263
        }
×
264
        newInventory, err := di.scalibrInventory(ctx, pr.TargetCloneUrl, pr.TargetRef)
4✔
265
        if err != nil {
4✔
266
                return nil, fmt.Errorf("failed to clone fork from %s at %q: %w", pr.TargetCloneUrl, pr.TargetRef, err)
×
267
        }
×
268

269
        newDeps := setDifference(baseInventory, newInventory, inventorySorter)
4✔
270

4✔
271
        deps.Deps = make([]*pbinternal.PrDependencies_ContextualDependency, 0, len(newDeps))
4✔
272
        for _, inventory := range newDeps {
10✔
273
                for _, filename := range inventory.Locations {
12✔
274
                        deps.Deps = append(deps.Deps, &pbinternal.PrDependencies_ContextualDependency{
6✔
275
                                Dep: &pbinternal.Dependency{
6✔
276
                                        Ecosystem: inventoryToEcosystem(inventory),
6✔
277
                                        Name:      inventory.Name,
6✔
278
                                        Version:   inventory.Version,
6✔
279
                                },
6✔
280
                                File: &pbinternal.PrDependencies_ContextualDependency_FilePatch{
6✔
281
                                        Name:     filename,
6✔
282
                                        PatchUrl: "", // TODO: do we need this?
6✔
283
                                },
6✔
284
                        })
6✔
285
                }
6✔
286
        }
287

288
        return &interfaces.Ingested{Object: &deps, Checkpoint: checkpoints.NewCheckpointV1Now()}, nil
4✔
289
}
290

291
func inventorySorter(a *extractor.Package, b *extractor.Package) int {
22✔
292
        // If we compare by name and version first, we can avoid serializing Locations to strings
22✔
293
        res := cmp.Or(cmp.Compare(a.Name, b.Name), cmp.Compare(a.Version, b.Version))
22✔
294
        if res != 0 {
42✔
295
                return res
20✔
296
        }
20✔
297
        // TODO: Locations should probably be sorted, but scalibr is going to export a compare function.
298
        aLoc := fmt.Sprintf("%v", a.Locations)
2✔
299
        bLoc := fmt.Sprintf("%v", b.Locations)
2✔
300
        return cmp.Compare(aLoc, bLoc)
2✔
301
}
302

303
func (di *Diff) scalibrInventory(ctx context.Context, repoURL string, ref string) ([]*extractor.Package, error) {
8✔
304
        clone, err := di.cli.Clone(ctx, repoURL, ref)
8✔
305
        if err != nil {
8✔
306
                return nil, err
×
307
        }
×
308

309
        tree, err := clone.Worktree()
8✔
310
        if err != nil {
8✔
311
                return nil, err
×
312
        }
×
313
        return scanFs(ctx, tree.Filesystem, map[string]string{})
8✔
314
}
315

316
func scanFs(ctx context.Context, memFS billy.Filesystem, _ map[string]string) ([]*extractor.Package, error) {
8✔
317
        // have to down-cast here, because scalibr needs multiple io/fs types
8✔
318
        wrapped, ok := iofs.New(memFS).(scalibr_fs.FS)
8✔
319
        if !ok {
8✔
320
                return nil, fmt.Errorf("error converting filesystem to ReadDirFS")
×
321
        }
×
322

323
        desiredCaps := scalibr_plugin.Capabilities{
8✔
324
                OS:            scalibr_plugin.OSLinux,
8✔
325
                Network:       scalibr_plugin.NetworkOffline,
8✔
326
                DirectFS:      false,
8✔
327
                RunningSystem: false,
8✔
328
        }
8✔
329

8✔
330
        scalibrFs := scalibr_fs.ScanRoot{FS: wrapped}
8✔
331
        scanConfig := scalibr.ScanConfig{
8✔
332
                ScanRoots: []*scalibr_fs.ScanRoot{&scalibrFs},
8✔
333
                // All includes Ruby, Dotnet which we're not ready to test yet, so use the more limited Default set.
8✔
334
                Plugins:      list.FromCapabilities(&desiredCaps),
8✔
335
                Capabilities: &desiredCaps,
8✔
336
        }
8✔
337

8✔
338
        scanner := scalibr.New()
8✔
339
        scanResults := scanner.Scan(ctx, &scanConfig)
8✔
340

8✔
341
        if scanResults == nil || scanResults.Status == nil {
8✔
342
                return nil, fmt.Errorf("error scanning files: no results")
×
343
        }
×
344
        if scanResults.Status.Status != scalibr_plugin.ScanStatusSucceeded {
8✔
345
                return nil, fmt.Errorf("error scanning files: %s", scanResults.Status)
×
346
        }
×
347

348
        return scanResults.Inventory.Packages, nil
8✔
349
}
350

351
func inventoryToEcosystem(inventory *extractor.Package) pbinternal.DepEcosystem {
6✔
352
        if inventory == nil {
6✔
353
                zerolog.Ctx(context.Background()).Warn().Msg("nil ecosystem scanning diffs")
×
354
                return pbinternal.DepEcosystem_DEP_ECOSYSTEM_UNSPECIFIED
×
355
        }
×
356

357
        package_url := inventory.PURL()
6✔
358

6✔
359
        // Sometimes Scalibr uses the string "PyPI" instead of "pypi" when reporting the ecosystem.
6✔
360
        switch package_url.Type {
6✔
361
        // N.B. using an enum here abitrarily restricts our ability to add new
362
        // ecosystems without a core minder change.  Switching to strings ala
363
        // purl might be an improvement.
364
        case purl.TypePyPi:
2✔
365
                return pbinternal.DepEcosystem_DEP_ECOSYSTEM_PYPI
2✔
366
        case purl.TypeNPM:
2✔
367
                return pbinternal.DepEcosystem_DEP_ECOSYSTEM_NPM
2✔
368
        case purl.TypeGolang:
2✔
369
                return pbinternal.DepEcosystem_DEP_ECOSYSTEM_GO
2✔
370
        default:
×
371
                return pbinternal.DepEcosystem_DEP_ECOSYSTEM_UNSPECIFIED
×
372
        }
373
}
374

375
// ingestFileForFullDiff processes a given file's patch from a pull request.
376
// It scans through the patch line by line, identifying the changes made.
377
// If it's a hunk header, it extracts the starting line number. If it's an addition, it records the line content and its number.
378
// The function also increments the line number for context lines (lines that provide context but haven't been modified).
379
func ingestFileForFullDiff(filename, patch, patchUrl string) (*pbinternal.PrContents_File, error) {
×
380
        var result []*pbinternal.PrContents_File_Line
×
381

×
382
        scanner := bufio.NewScanner(strings.NewReader(patch))
×
383
        regex := regexp.MustCompile(`@@ -\d+,\d+ \+(\d+),\d+ @@`)
×
384

×
385
        var currentLineNumber int64
×
386
        var err error
×
387
        for scanner.Scan() {
×
388
                line := scanner.Text()
×
389

×
390
                if matches := regex.FindStringSubmatch(line); matches != nil {
×
391
                        currentLineNumber, err = strconv.ParseInt(matches[1], 10, 32)
×
392
                        if err != nil {
×
393
                                return nil, fmt.Errorf("error parsing line number from the hunk header: %w", err)
×
394
                        }
×
395
                } else if strings.HasPrefix(line, "+") {
×
396
                        result = append(result, &pbinternal.PrContents_File_Line{
×
397
                                Content: line[1:],
×
398
                                // see the use of strconv.ParseInt above: this is a safe downcast
×
399
                                // nolint: gosec
×
400
                                LineNumber: int32(currentLineNumber),
×
401
                        })
×
402

×
403
                        currentLineNumber++
×
404
                } else if !strings.HasPrefix(line, "-") {
×
405
                        currentLineNumber++
×
406
                }
×
407
        }
408

409
        if err := scanner.Err(); err != nil {
×
410
                return nil, fmt.Errorf("error reading patch: %w", err)
×
411
        }
×
412

413
        return &pbinternal.PrContents_File{
×
414
                Name:         filename,
×
415
                FilePatchUrl: patchUrl,
×
416
                PatchLines:   result,
×
417
        }, nil
×
418
}
419

420
func (di *Diff) getEcosystemForFile(filename string) DependencyEcosystem {
5✔
421
        lastComponent := filepath.Base(filename)
5✔
422

5✔
423
        for _, ecoMapping := range di.cfg.Ecosystems {
10✔
424
                if match, _ := filepath.Match(ecoMapping.Depfile, lastComponent); match {
8✔
425
                        return DependencyEcosystem(ecoMapping.Name)
3✔
426
                }
3✔
427
        }
428
        return DepEcosystemNone
2✔
429
}
430

431
func (di *Diff) getParserForFile(filename string, logger zerolog.Logger) ecosystemParser {
×
432
        eco := di.getEcosystemForFile(filename)
×
433
        if eco == DepEcosystemNone {
×
434
                logger.Debug().
×
435
                        Str("filename", filename).
×
436
                        Msg("No ecosystem found, skipping")
×
437
                return nil
×
438
        }
×
439

440
        logger.Debug().
×
441
                Str("filename", filename).
×
442
                Str("package-ecosystem", string(eco)).
×
443
                Msg("matched ecosystem")
×
444

×
445
        return newEcosystemParser(eco)
×
446
}
447

448
// Computes the set of elements in updated which are not in base.
449
// Note: this function may permute (sort) the order of elements in base and updated.
450
func setDifference[Slice ~[]E, E any](base Slice, updated Slice, sorter func(a, b E) int) Slice {
8✔
451

8✔
452
        slices.SortFunc(base, sorter)
8✔
453
        slices.SortFunc(updated, sorter)
8✔
454

8✔
455
        baseIdx, newIdx := 0, 0
8✔
456
        ret := make(Slice, 0)
8✔
457
        for baseIdx < len(base) && newIdx < len(updated) {
25✔
458
                cmpResult := sorter(base[baseIdx], updated[newIdx])
17✔
459
                if cmpResult < 0 {
23✔
460
                        baseIdx++
6✔
461
                } else if cmpResult > 0 {
22✔
462
                        ret = append(ret, updated[newIdx])
5✔
463
                        newIdx++
5✔
464
                } else {
11✔
465
                        baseIdx++
6✔
466
                        newIdx++
6✔
467
                }
6✔
468
        }
469
        if newIdx < len(updated) {
11✔
470
                ret = append(ret, updated[newIdx:]...)
3✔
471
        }
3✔
472

473
        // TODO: add metric for number of deps scanned vs total deps
474

475
        return ret
8✔
476
}
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2026 Coveralls, Inc