• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

mindersec / minder / 15981216547

30 Jun 2025 06:52PM UTC coverage: 57.393% (+0.02%) from 57.377%
15981216547

Pull #5702

github

web-flow
Merge b1a7618c3 into 9b4f171a0
Pull Request #5702: Add templates for REST and YQ eval

57 of 78 new or added lines in 7 files covered. (73.08%)

21 existing lines in 3 files now uncovered.

18593 of 32396 relevant lines covered (57.39%)

37.23 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

48.36
/internal/engine/ingester/diff/diff.go
1
// SPDX-FileCopyrightText: Copyright 2023 The Minder Authors
2
// SPDX-License-Identifier: Apache-2.0
3

4
// Package diff provides the diff rule data ingest engine
5
package diff
6

7
import (
8
        "bufio"
9
        "cmp"
10
        "context"
11
        "fmt"
12
        "math"
13
        "path/filepath"
14
        "regexp"
15
        "slices"
16
        "strconv"
17
        "strings"
18

19
        "github.com/go-git/go-billy/v5"
20
        "github.com/go-git/go-billy/v5/helper/iofs"
21
        scalibr "github.com/google/osv-scalibr"
22
        "github.com/google/osv-scalibr/extractor"
23
        "github.com/google/osv-scalibr/extractor/filesystem/list"
24
        scalibr_fs "github.com/google/osv-scalibr/fs"
25
        scalibr_plugin "github.com/google/osv-scalibr/plugin"
26
        "github.com/google/osv-scalibr/purl"
27
        "github.com/rs/zerolog"
28
        "google.golang.org/protobuf/reflect/protoreflect"
29

30
        pbinternal "github.com/mindersec/minder/internal/proto"
31
        pb "github.com/mindersec/minder/pkg/api/protobuf/go/minder/v1"
32
        "github.com/mindersec/minder/pkg/engine/v1/interfaces"
33
        "github.com/mindersec/minder/pkg/entities/v1/checkpoints"
34
)
35

36
const (
	// DiffRuleDataIngestType is the type of the diff rule data ingest engine
	DiffRuleDataIngestType = "diff"
	// prFilesPerPage is the page size used when listing a PR's files from the provider.
	prFilesPerPage         = 30
	// wildcard is the catch-all pattern; not referenced in this file —
	// presumably used by sibling files in this package (e.g. depfile matching).
	wildcard               = "*"
)
42

43
// Diff is the diff rule data ingest engine
type Diff struct {
	cli interfaces.GitHubListAndClone // provider client used to list PR files and clone base/target refs
	cfg *pb.DiffType                  // ingest configuration: diff type and ecosystem depfile mappings
}
48

49
// NewDiffIngester creates a new diff ingester
50
func NewDiffIngester(
51
        cfg *pb.DiffType,
52
        cli interfaces.GitHubListAndClone,
53
) (*Diff, error) {
4✔
54
        if cfg == nil {
4✔
55
                cfg = &pb.DiffType{}
×
56
        }
×
57

58
        if cli == nil {
4✔
59
                return nil, fmt.Errorf("provider is nil")
×
60
        }
×
61

62
        return &Diff{
4✔
63
                cfg: cfg,
4✔
64
                cli: cli,
4✔
65
        }, nil
4✔
66
}
67

68
// GetType returns the type of the diff rule data ingest engine
// (the constant "diff").
func (*Diff) GetType() string {
	return DiffRuleDataIngestType
}
3✔
72

73
// GetConfig returns the config for the diff rule data ingest engine.
// The returned value is the same *pb.DiffType passed to (or defaulted by)
// NewDiffIngester; callers should treat it as read-only.
func (di *Diff) GetConfig() protoreflect.ProtoMessage {
	return di.cfg
}
6✔
77

78
// Ingest ingests a diff from a pull request in accordance with its type
79
func (di *Diff) Ingest(
80
        ctx context.Context,
81
        ent protoreflect.ProtoMessage,
82
        _ map[string]any,
83
) (*interfaces.Ingested, error) {
4✔
84
        pr, ok := ent.(*pbinternal.PullRequest)
4✔
85
        if !ok {
4✔
86
                return nil, fmt.Errorf("entity is not a pull request")
×
87
        }
×
88

89
        // The GitHub Go API takes an int32, but our proto stores an int64; make sure we don't overflow
90
        if pr.Number > math.MaxInt {
4✔
91
                return nil, fmt.Errorf("pr number is too large")
×
92
        }
×
93
        prNumber := int(pr.Number)
4✔
94

4✔
95
        switch di.cfg.GetType() {
4✔
96
        case "", pb.DiffTypeDep:
×
97
                return di.getDepTypeDiff(ctx, prNumber, pr)
×
98

99
        case pb.DiffTypeNewDeps:
4✔
100
                // TODO: once we've tested some, convert DiffTypeDep to use this algorithm.
4✔
101
                return di.getScalibrTypeDiff(ctx, prNumber, pr)
4✔
102

103
        case pb.DiffTypeFull:
×
104
                return di.getFullTypeDiff(ctx, prNumber, pr)
×
105

106
        default:
×
107
                return nil, fmt.Errorf("unknown diff type")
×
108
        }
109
}
110

111
func (di *Diff) getDepTypeDiff(ctx context.Context, prNumber int, pr *pbinternal.PullRequest) (*interfaces.Ingested, error) {
×
112
        deps := pbinternal.PrDependencies{Pr: pr}
×
113
        page := 0
×
114

×
115
        for {
×
116
                prFiles, resp, err := di.cli.ListFiles(ctx, pr.RepoOwner, pr.RepoName, prNumber, prFilesPerPage, page)
×
117
                if err != nil {
×
118
                        return nil, fmt.Errorf("error getting pull request files: %w", err)
×
119
                }
×
120

121
                for _, file := range prFiles {
×
122
                        fileDiffs, err := di.ingestFileForDepDiff(file.GetFilename(), file.GetPatch(), file.GetRawURL(), *zerolog.Ctx(ctx))
×
123
                        if err != nil {
×
124
                                return nil, fmt.Errorf("error ingesting file %s: %w", file.GetFilename(), err)
×
125
                        }
×
126
                        deps.Deps = append(deps.Deps, fileDiffs...)
×
127
                }
128

129
                if resp.NextPage == 0 {
×
130
                        break
×
131
                }
132

133
                page = resp.NextPage
×
134
        }
135

136
        return &interfaces.Ingested{Object: &deps, Checkpoint: checkpoints.NewCheckpointV1Now()}, nil
×
137
}
138

139
func (di *Diff) getFullTypeDiff(ctx context.Context, prNumber int, pr *pbinternal.PullRequest) (*interfaces.Ingested, error) {
×
140
        diff := &pbinternal.PrContents{Pr: pr}
×
141
        page := 0
×
142

×
143
        for {
×
144
                prFiles, resp, err := di.cli.ListFiles(ctx, pr.RepoOwner, pr.RepoName, prNumber, prFilesPerPage, page)
×
145
                if err != nil {
×
146
                        return nil, fmt.Errorf("error getting pull request files: %w", err)
×
147
                }
×
148

149
                for _, file := range prFiles {
×
150
                        fileDiffs, err := ingestFileForFullDiff(file.GetFilename(), file.GetPatch(), file.GetRawURL())
×
151
                        if err != nil {
×
152
                                return nil, fmt.Errorf("error ingesting file %s: %w", file.GetFilename(), err)
×
153
                        }
×
154
                        diff.Files = append(diff.Files, fileDiffs)
×
155
                }
156

157
                if resp.NextPage == 0 {
×
158
                        break
×
159
                }
160

161
                page = resp.NextPage
×
162
        }
163

164
        return &interfaces.Ingested{Object: diff, Checkpoint: checkpoints.NewCheckpointV1Now()}, nil
×
165
}
166

167
func (di *Diff) ingestFileForDepDiff(
168
        filename, patchContents, patchUrl string,
169
        logger zerolog.Logger,
170
) ([]*pbinternal.PrDependencies_ContextualDependency, error) {
×
171
        parser := di.getParserForFile(filename, logger)
×
172
        if parser == nil {
×
173
                return nil, nil
×
174
        }
×
175

176
        depBatch, err := parser(patchContents)
×
177
        if err != nil {
×
178
                return nil, fmt.Errorf("error parsing file %s: %w", filename, err)
×
179
        }
×
180

181
        batchCtxDeps := make([]*pbinternal.PrDependencies_ContextualDependency, 0, len(depBatch))
×
182
        for i := range depBatch {
×
183
                dep := depBatch[i]
×
184
                batchCtxDeps = append(batchCtxDeps, &pbinternal.PrDependencies_ContextualDependency{
×
185
                        Dep: dep,
×
186
                        File: &pbinternal.PrDependencies_ContextualDependency_FilePatch{
×
187
                                Name:     filename,
×
188
                                PatchUrl: patchUrl,
×
189
                        },
×
190
                })
×
191
        }
×
192

193
        return batchCtxDeps, nil
×
194
}
195

196
func (di *Diff) getScalibrTypeDiff(ctx context.Context, _ int, pr *pbinternal.PullRequest) (*interfaces.Ingested, error) {
4✔
197
        deps := pbinternal.PrDependencies{Pr: pr}
4✔
198

4✔
199
        // TODO: we should be able to just fetch the additional commits between base and target.
4✔
200
        // Our current Git abstraction isn't quite powerful enough, so we do two shallow clones.
4✔
201

4✔
202
        baseInventory, err := di.scalibrInventory(ctx, pr.BaseCloneUrl, pr.BaseRef)
4✔
203
        if err != nil {
4✔
204
                return nil, fmt.Errorf("failed to clone base from %s at %q: %w", pr.BaseCloneUrl, pr.BaseRef, err)
×
205
        }
×
206
        newInventory, err := di.scalibrInventory(ctx, pr.TargetCloneUrl, pr.TargetRef)
4✔
207
        if err != nil {
4✔
208
                return nil, fmt.Errorf("failed to clone fork from %s at %q: %w", pr.TargetCloneUrl, pr.TargetRef, err)
×
209
        }
×
210

211
        newDeps := setDifference(baseInventory, newInventory, inventorySorter)
4✔
212

4✔
213
        deps.Deps = make([]*pbinternal.PrDependencies_ContextualDependency, 0, len(newDeps))
4✔
214
        for _, inventory := range newDeps {
10✔
215
                for _, filename := range inventory.Locations {
12✔
216
                        deps.Deps = append(deps.Deps, &pbinternal.PrDependencies_ContextualDependency{
6✔
217
                                Dep: &pbinternal.Dependency{
6✔
218
                                        Ecosystem: inventoryToEcosystem(inventory),
6✔
219
                                        Name:      inventory.Name,
6✔
220
                                        Version:   inventory.Version,
6✔
221
                                },
6✔
222
                                File: &pbinternal.PrDependencies_ContextualDependency_FilePatch{
6✔
223
                                        Name:     filename,
6✔
224
                                        PatchUrl: "", // TODO: do we need this?
6✔
225
                                },
6✔
226
                        })
6✔
227
                }
6✔
228
        }
229

230
        return &interfaces.Ingested{Object: &deps, Checkpoint: checkpoints.NewCheckpointV1Now()}, nil
4✔
231
}
232

233
func inventorySorter(a *extractor.Package, b *extractor.Package) int {
22✔
234
        // If we compare by name and version first, we can avoid serializing Locations to strings
22✔
235
        res := cmp.Or(cmp.Compare(a.Name, b.Name), cmp.Compare(a.Version, b.Version))
22✔
236
        if res != 0 {
42✔
237
                return res
20✔
238
        }
20✔
239
        // TODO: Locations should probably be sorted, but scalibr is going to export a compare function.
240
        aLoc := fmt.Sprintf("%v", a.Locations)
2✔
241
        bLoc := fmt.Sprintf("%v", b.Locations)
2✔
242
        return cmp.Compare(aLoc, bLoc)
2✔
243
}
244

245
func (di *Diff) scalibrInventory(ctx context.Context, repoURL string, ref string) ([]*extractor.Package, error) {
8✔
246
        clone, err := di.cli.Clone(ctx, repoURL, ref)
8✔
247
        if err != nil {
8✔
248
                return nil, err
×
249
        }
×
250

251
        tree, err := clone.Worktree()
8✔
252
        if err != nil {
8✔
253
                return nil, err
×
254
        }
×
255
        return scanFs(ctx, tree.Filesystem, map[string]string{})
8✔
256
}
257

258
func scanFs(ctx context.Context, memFS billy.Filesystem, _ map[string]string) ([]*extractor.Package, error) {
8✔
259
        // have to down-cast here, because scalibr needs multiple io/fs types
8✔
260
        wrapped, ok := iofs.New(memFS).(scalibr_fs.FS)
8✔
261
        if !ok {
8✔
262
                return nil, fmt.Errorf("error converting filesystem to ReadDirFS")
×
263
        }
×
264

265
        desiredCaps := scalibr_plugin.Capabilities{
8✔
266
                OS:            scalibr_plugin.OSLinux,
8✔
267
                Network:       scalibr_plugin.NetworkOffline,
8✔
268
                DirectFS:      false,
8✔
269
                RunningSystem: false,
8✔
270
        }
8✔
271

8✔
272
        scalibrFs := scalibr_fs.ScanRoot{FS: wrapped}
8✔
273
        scanConfig := scalibr.ScanConfig{
8✔
274
                ScanRoots: []*scalibr_fs.ScanRoot{&scalibrFs},
8✔
275
                // All includes Ruby, Dotnet which we're not ready to test yet, so use the more limited Default set.
8✔
276
                FilesystemExtractors: list.FromCapabilities(&desiredCaps),
8✔
277
                Capabilities:         &desiredCaps,
8✔
278
        }
8✔
279

8✔
280
        scanner := scalibr.New()
8✔
281
        scanResults := scanner.Scan(ctx, &scanConfig)
8✔
282

8✔
283
        if scanResults == nil || scanResults.Status == nil {
8✔
284
                return nil, fmt.Errorf("error scanning files: no results")
×
285
        }
×
286
        if scanResults.Status.Status != scalibr_plugin.ScanStatusSucceeded {
8✔
287
                return nil, fmt.Errorf("error scanning files: %s", scanResults.Status)
×
288
        }
×
289

290
        return scanResults.Inventory.Packages, nil
8✔
291
}
292

293
func inventoryToEcosystem(inventory *extractor.Package) pbinternal.DepEcosystem {
6✔
294
        if inventory == nil {
6✔
295
                zerolog.Ctx(context.Background()).Warn().Msg("nil ecosystem scanning diffs")
×
296
                return pbinternal.DepEcosystem_DEP_ECOSYSTEM_UNSPECIFIED
×
297
        }
×
298

299
        package_url := inventory.PURL()
6✔
300

6✔
301
        // Sometimes Scalibr uses the string "PyPI" instead of "pypi" when reporting the ecosystem.
6✔
302
        switch package_url.Type {
6✔
303
        // N.B. using an enum here abitrarily restricts our ability to add new
304
        // ecosystems without a core minder change.  Switching to strings ala
305
        // purl might be an improvement.
306
        case purl.TypePyPi:
2✔
307
                return pbinternal.DepEcosystem_DEP_ECOSYSTEM_PYPI
2✔
308
        case purl.TypeNPM:
2✔
309
                return pbinternal.DepEcosystem_DEP_ECOSYSTEM_NPM
2✔
310
        case purl.TypeGolang:
2✔
311
                return pbinternal.DepEcosystem_DEP_ECOSYSTEM_GO
2✔
UNCOV
312
        default:
×
313
                return pbinternal.DepEcosystem_DEP_ECOSYSTEM_UNSPECIFIED
×
314
        }
315
}
316

317
// ingestFileForFullDiff processes a given file's patch from a pull request.
318
// It scans through the patch line by line, identifying the changes made.
319
// If it's a hunk header, it extracts the starting line number. If it's an addition, it records the line content and its number.
320
// The function also increments the line number for context lines (lines that provide context but haven't been modified).
UNCOV
321
func ingestFileForFullDiff(filename, patch, patchUrl string) (*pbinternal.PrContents_File, error) {
×
322
        var result []*pbinternal.PrContents_File_Line
×
323

×
324
        scanner := bufio.NewScanner(strings.NewReader(patch))
×
325
        regex := regexp.MustCompile(`@@ -\d+,\d+ \+(\d+),\d+ @@`)
×
326

×
327
        var currentLineNumber int64
×
328
        var err error
×
329
        for scanner.Scan() {
×
330
                line := scanner.Text()
×
331

×
332
                if matches := regex.FindStringSubmatch(line); matches != nil {
×
333
                        currentLineNumber, err = strconv.ParseInt(matches[1], 10, 32)
×
334
                        if err != nil {
×
335
                                return nil, fmt.Errorf("error parsing line number from the hunk header: %w", err)
×
336
                        }
×
337
                } else if strings.HasPrefix(line, "+") {
×
338
                        result = append(result, &pbinternal.PrContents_File_Line{
×
339
                                Content: line[1:],
×
340
                                // see the use of strconv.ParseInt above: this is a safe downcast
×
341
                                // nolint: gosec
×
342
                                LineNumber: int32(currentLineNumber),
×
343
                        })
×
344

×
345
                        currentLineNumber++
×
346
                } else if !strings.HasPrefix(line, "-") {
×
347
                        currentLineNumber++
×
348
                }
×
349
        }
350

UNCOV
351
        if err := scanner.Err(); err != nil {
×
352
                return nil, fmt.Errorf("error reading patch: %w", err)
×
353
        }
×
354

UNCOV
355
        return &pbinternal.PrContents_File{
×
356
                Name:         filename,
×
357
                FilePatchUrl: patchUrl,
×
358
                PatchLines:   result,
×
359
        }, nil
×
360
}
361

362
func (di *Diff) getEcosystemForFile(filename string) DependencyEcosystem {
5✔
363
        lastComponent := filepath.Base(filename)
5✔
364

5✔
365
        for _, ecoMapping := range di.cfg.Ecosystems {
10✔
366
                if match, _ := filepath.Match(ecoMapping.Depfile, lastComponent); match {
8✔
367
                        return DependencyEcosystem(ecoMapping.Name)
3✔
368
                }
3✔
369
        }
370
        return DepEcosystemNone
2✔
371
}
372

UNCOV
373
func (di *Diff) getParserForFile(filename string, logger zerolog.Logger) ecosystemParser {
×
374
        eco := di.getEcosystemForFile(filename)
×
375
        if eco == DepEcosystemNone {
×
376
                logger.Debug().
×
377
                        Str("filename", filename).
×
378
                        Msg("No ecosystem found, skipping")
×
379
                return nil
×
380
        }
×
381

UNCOV
382
        logger.Debug().
×
383
                Str("filename", filename).
×
384
                Str("package-ecosystem", string(eco)).
×
385
                Msg("matched ecosystem")
×
386

×
387
        return newEcosystemParser(eco)
×
388
}
389

390
// setDifference computes the set of elements in updated which are not in base,
// using sorter as the total order (0 means equal). Each base element cancels
// at most one equal updated element, so duplicates are handled positionally.
// Note: this function may permute (sort) the order of elements in base and updated.
func setDifference[Slice ~[]E, E any](base Slice, updated Slice, sorter func(a, b E) int) Slice {
	slices.SortFunc(base, sorter)
	slices.SortFunc(updated, sorter)

	ret := make(Slice, 0)
	i, j := 0, 0
	for i < len(base) && j < len(updated) {
		switch c := sorter(base[i], updated[j]); {
		case c < 0:
			// base element with no counterpart in updated: skip it.
			i++
		case c > 0:
			// updated element smaller than current base element: it's new.
			ret = append(ret, updated[j])
			j++
		default:
			// Equal: present in both, not part of the difference.
			i++
			j++
		}
	}
	// Anything left in updated has no counterpart in base.
	ret = append(ret, updated[j:]...)

	// TODO: add metric for number of deps scanned vs total deps

	return ret
}
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2025 Coveralls, Inc