mindersec / minder, build 11363687430

16 Oct 2024 10:20AM UTC. Coverage: 54.39% (-0.09%) from 54.475%

Pull Request #4762: Implement Minder TestKit
Merge ca419002c into e4f47e8e3 (via GitHub web-flow)

50 of 60 new or added lines in 19 files covered (83.33%)
2 existing lines in 1 file now uncovered
14838 of 27281 relevant lines covered (54.39%)
41.28 hits per line

Source File

/internal/engine/ingester/diff/diff.go: 47.65% of lines covered

// Copyright 2023 Stacklok, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//        http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Package diff provides the diff rule data ingest engine
package diff

import (
        "bufio"
        "cmp"
        "context"
        "fmt"
        "math"
        "path/filepath"
        "regexp"
        "slices"
        "strconv"
        "strings"

        "github.com/go-git/go-billy/v5"
        "github.com/go-git/go-billy/v5/helper/iofs"
        scalibr "github.com/google/osv-scalibr"
        "github.com/google/osv-scalibr/extractor"
        "github.com/google/osv-scalibr/extractor/filesystem/list"
        scalibr_fs "github.com/google/osv-scalibr/fs"
        scalibr_plugin "github.com/google/osv-scalibr/plugin"
        "github.com/google/osv-scalibr/purl"
        "github.com/rs/zerolog"
        "google.golang.org/protobuf/reflect/protoreflect"

        pbinternal "github.com/mindersec/minder/internal/proto"
        pb "github.com/mindersec/minder/pkg/api/protobuf/go/minder/v1"
        "github.com/mindersec/minder/pkg/engine/v1/interfaces"
        "github.com/mindersec/minder/pkg/entities/v1/checkpoints"
        provifv1 "github.com/mindersec/minder/pkg/providers/v1"
)

const (
        // DiffRuleDataIngestType is the type of the diff rule data ingest engine
        DiffRuleDataIngestType = "diff"
        prFilesPerPage         = 30
        wildcard               = "*"
)

// Diff is the diff rule data ingest engine
type Diff struct {
        cli provifv1.GitHub
        cfg *pb.DiffType
}

// NewDiffIngester creates a new diff ingester
func NewDiffIngester(
        cfg *pb.DiffType,
        cli provifv1.GitHub,
) (*Diff, error) {
        if cfg == nil {
                cfg = &pb.DiffType{}
        }

        if cli == nil {
                return nil, fmt.Errorf("provider is nil")
        }

        return &Diff{
                cfg: cfg,
                cli: cli,
        }, nil
}

// GetType returns the type of the diff rule data ingest engine
func (*Diff) GetType() string {
        return DiffRuleDataIngestType
}

// GetConfig returns the config for the diff rule data ingest engine
func (di *Diff) GetConfig() protoreflect.ProtoMessage {
        return di.cfg
}

// Ingest ingests a diff from a pull request in accordance with its type
//
//nolint:gocyclo
func (di *Diff) Ingest(
        ctx context.Context,
        ent protoreflect.ProtoMessage,
        _ map[string]any,
) (*interfaces.Result, error) {
        pr, ok := ent.(*pb.PullRequest)
        if !ok {
                return nil, fmt.Errorf("entity is not a pull request")
        }

        // The GitHub Go API takes an int32, but our proto stores an int64; make sure we don't overflow
        if pr.Number > math.MaxInt {
                return nil, fmt.Errorf("pr number is too large")
        }
        prNumber := int(pr.Number)

        switch di.cfg.GetType() {
        case "", pb.DiffTypeDep:
                return di.getDepTypeDiff(ctx, prNumber, pr)

        case pb.DiffTypeNewDeps:
                // TODO: once we've tested some, convert DiffTypeDep to use this algorithm.
                return di.getScalibrTypeDiff(ctx, prNumber, pr)

        case pb.DiffTypeFull:
                return di.getFullTypeDiff(ctx, prNumber, pr)

        default:
                return nil, fmt.Errorf("unknown diff type")
        }
}

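// getDepTypeDiff pages through the files changed in the pull request and
// parses each recognized dependency file's patch into contextual dependencies.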
func (di *Diff) getDepTypeDiff(ctx context.Context, prNumber int, pr *pb.PullRequest) (*interfaces.Result, error) {
        deps := pbinternal.PrDependencies{Pr: pr}
        page := 0

        for {
                prFiles, resp, err := di.cli.ListFiles(ctx, pr.RepoOwner, pr.RepoName, prNumber, prFilesPerPage, page)
                if err != nil {
                        return nil, fmt.Errorf("error getting pull request files: %w", err)
                }

                for _, file := range prFiles {
                        fileDiffs, err := di.ingestFileForDepDiff(file.GetFilename(), file.GetPatch(), file.GetRawURL(), *zerolog.Ctx(ctx))
                        if err != nil {
                                return nil, fmt.Errorf("error ingesting file %s: %w", file.GetFilename(), err)
                        }
                        deps.Deps = append(deps.Deps, fileDiffs...)
                }

                if resp.NextPage == 0 {
                        break
                }

                page = resp.NextPage
        }

        return &interfaces.Result{Object: &deps, Checkpoint: checkpoints.NewCheckpointV1Now()}, nil
}

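// getFullTypeDiff pages through the files changed in the pull request and
// records the added lines of each file's patch.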
func (di *Diff) getFullTypeDiff(ctx context.Context, prNumber int, pr *pb.PullRequest) (*interfaces.Result, error) {
        diff := &pbinternal.PrContents{Pr: pr}
        page := 0

        for {
                prFiles, resp, err := di.cli.ListFiles(ctx, pr.RepoOwner, pr.RepoName, prNumber, prFilesPerPage, page)
                if err != nil {
                        return nil, fmt.Errorf("error getting pull request files: %w", err)
                }

                for _, file := range prFiles {
                        fileDiffs, err := ingestFileForFullDiff(file.GetFilename(), file.GetPatch(), file.GetRawURL())
                        if err != nil {
                                return nil, fmt.Errorf("error ingesting file %s: %w", file.GetFilename(), err)
                        }
                        diff.Files = append(diff.Files, fileDiffs)
                }

                if resp.NextPage == 0 {
                        break
                }

                page = resp.NextPage
        }

        return &interfaces.Result{Object: diff, Checkpoint: checkpoints.NewCheckpointV1Now()}, nil
}

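// ingestFileForDepDiff parses a single file's patch with the parser registered
// for its ecosystem and tags each parsed dependency with the file it came from.
// Files that don't map to a known ecosystem are skipped.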
func (di *Diff) ingestFileForDepDiff(
        filename, patchContents, patchUrl string,
        logger zerolog.Logger,
) ([]*pbinternal.PrDependencies_ContextualDependency, error) {
        parser := di.getParserForFile(filename, logger)
        if parser == nil {
                return nil, nil
        }

        depBatch, err := parser(patchContents)
        if err != nil {
                return nil, fmt.Errorf("error parsing file %s: %w", filename, err)
        }

        batchCtxDeps := make([]*pbinternal.PrDependencies_ContextualDependency, 0, len(depBatch))
        for i := range depBatch {
                dep := depBatch[i]
                batchCtxDeps = append(batchCtxDeps, &pbinternal.PrDependencies_ContextualDependency{
                        Dep: dep,
                        File: &pbinternal.PrDependencies_ContextualDependency_FilePatch{
                                Name:     filename,
                                PatchUrl: patchUrl,
                        },
                })
        }

        return batchCtxDeps, nil
}

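// getScalibrTypeDiff computes the dependencies added by the pull request by
// scanning shallow clones of the base and target refs with osv-scalibr and
// taking the difference of the two inventories.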
func (di *Diff) getScalibrTypeDiff(ctx context.Context, _ int, pr *pb.PullRequest) (*interfaces.Result, error) {
        deps := pbinternal.PrDependencies{Pr: pr}

        // TODO: we should be able to just fetch the additional commits between base and target.
        // Our current Git abstraction isn't quite powerful enough, so we do two shallow clones.

        baseInventory, err := di.scalibrInventory(ctx, pr.BaseCloneUrl, pr.BaseRef)
        if err != nil {
                return nil, fmt.Errorf("Failed to clone base from %s at %q: %w", pr.BaseCloneUrl, pr.BaseRef, err)
        }
        newInventory, err := di.scalibrInventory(ctx, pr.TargetCloneUrl, pr.TargetRef)
        if err != nil {
                return nil, fmt.Errorf("Failed to clone fork from %s at %q: %w", pr.TargetCloneUrl, pr.TargetRef, err)
        }

        newDeps := setDifference(baseInventory, newInventory, inventorySorter)

        deps.Deps = make([]*pbinternal.PrDependencies_ContextualDependency, 0, len(newDeps))
        for _, inventory := range newDeps {
                for _, filename := range inventory.Locations {
                        deps.Deps = append(deps.Deps, &pbinternal.PrDependencies_ContextualDependency{
                                Dep: &pbinternal.Dependency{
                                        Ecosystem: inventoryToEcosystem(inventory),
                                        Name:      inventory.Name,
                                        Version:   inventory.Version,
                                },
                                File: &pbinternal.PrDependencies_ContextualDependency_FilePatch{
                                        Name:     filename,
                                        PatchUrl: "", // TODO: do we need this?
                                },
                        })
                }
        }

        return &interfaces.Result{Object: &deps, Checkpoint: checkpoints.NewCheckpointV1Now()}, nil
}

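// inventorySorter orders inventory entries by name, then version, then
// (stringified) locations, so that two scans can be compared entry by entry.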
func inventorySorter(a *extractor.Inventory, b *extractor.Inventory) int {
        // If we compare by name and version first, we can avoid serializing Locations to strings
        res := cmp.Or(cmp.Compare(a.Name, b.Name), cmp.Compare(a.Version, b.Version))
        if res != 0 {
                return res
        }
        // TODO: Locations should probably be sorted, but scalibr is going to export a compare function.
        aLoc := fmt.Sprintf("%v", a.Locations)
        bLoc := fmt.Sprintf("%v", b.Locations)
        return cmp.Compare(aLoc, bLoc)
}

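// scalibrInventory clones the repository at the given ref and scans the
// resulting worktree for its package inventory.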
func (di *Diff) scalibrInventory(ctx context.Context, repoURL string, ref string) ([]*extractor.Inventory, error) {
        clone, err := di.cli.Clone(ctx, repoURL, ref)
        if err != nil {
                return nil, err
        }

        tree, err := clone.Worktree()
        if err != nil {
                return nil, err
        }
        return scanFs(ctx, tree.Filesystem, map[string]string{})
}

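// scanFs runs an osv-scalibr filesystem scan over the given billy.Filesystem
// and returns the discovered inventory entries.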
func scanFs(ctx context.Context, memFS billy.Filesystem, _ map[string]string) ([]*extractor.Inventory, error) {
        // have to down-cast here, because scalibr needs multiple io/fs types
        wrapped, ok := iofs.New(memFS).(scalibr_fs.FS)
        if !ok {
                return nil, fmt.Errorf("error converting filesystem to ReadDirFS")
        }

        desiredCaps := scalibr_plugin.Capabilities{
                OS:            scalibr_plugin.OSLinux,
                Network:       true,
                DirectFS:      false,
                RunningSystem: false,
        }

        scalibrFs := scalibr_fs.ScanRoot{FS: wrapped}
        scanConfig := scalibr.ScanConfig{
                ScanRoots: []*scalibr_fs.ScanRoot{&scalibrFs},
                // All includes Ruby, Dotnet which we're not ready to test yet, so use the more limited Default set.
                FilesystemExtractors: list.FilterByCapabilities(list.Default, &desiredCaps),
                Capabilities:         &desiredCaps,
        }

        scanner := scalibr.New()
        scanResults := scanner.Scan(ctx, &scanConfig)

        if scanResults == nil || scanResults.Status == nil {
                return nil, fmt.Errorf("error scanning files: no results")
        }
        if scanResults.Status.Status != scalibr_plugin.ScanStatusSucceeded {
                return nil, fmt.Errorf("error scanning files: %s", scanResults.Status)
        }

        return scanResults.Inventories, nil
}

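// inventoryToEcosystem maps an inventory entry to a Minder dependency
// ecosystem based on the type of its package URL (purl).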
func inventoryToEcosystem(inventory *extractor.Inventory) pbinternal.DepEcosystem {
        if inventory == nil {
                zerolog.Ctx(context.Background()).Warn().Msg("nil ecosystem scanning diffs")
                return pbinternal.DepEcosystem_DEP_ECOSYSTEM_UNSPECIFIED
        }

        // This should be inventory.PURL()... but there isn't a convenience wrapper yet
        package_url, err := inventory.Extractor.ToPURL(inventory)
        if err != nil {
                zerolog.Ctx(context.Background()).Warn().Err(err).Msg("error getting ecosystem from inventory")
                return pbinternal.DepEcosystem_DEP_ECOSYSTEM_UNSPECIFIED
        }

        // Sometimes Scalibr uses the string "PyPI" instead of "pypi" when reporting the ecosystem.
        switch package_url.Type {
        // N.B. using an enum here arbitrarily restricts our ability to add new
        // ecosystems without a core minder change.  Switching to strings a la
        // purl might be an improvement.
        case purl.TypePyPi:
                return pbinternal.DepEcosystem_DEP_ECOSYSTEM_PYPI
        case purl.TypeNPM:
                return pbinternal.DepEcosystem_DEP_ECOSYSTEM_NPM
        case purl.TypeGolang:
                return pbinternal.DepEcosystem_DEP_ECOSYSTEM_GO
        default:
                return pbinternal.DepEcosystem_DEP_ECOSYSTEM_UNSPECIFIED
        }
}

// ingestFileForFullDiff processes a given file's patch from a pull request.
// It scans through the patch line by line, identifying the changes made.
// If it's a hunk header, it extracts the starting line number. If it's an addition, it records the line content and its number.
// The function also increments the line number for context lines (lines that provide context but haven't been modified).
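// For example, a hunk header of "@@ -10,7 +12,9 @@" resets the current line
// number to 12; each subsequent added ("+") or context line then advances it
// by one, while removed ("-") lines do not.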
func ingestFileForFullDiff(filename, patch, patchUrl string) (*pbinternal.PrContents_File, error) {
        var result []*pbinternal.PrContents_File_Line

        scanner := bufio.NewScanner(strings.NewReader(patch))
        regex := regexp.MustCompile(`@@ -\d+,\d+ \+(\d+),\d+ @@`)

        var currentLineNumber int64
        var err error
        for scanner.Scan() {
                line := scanner.Text()

                if matches := regex.FindStringSubmatch(line); matches != nil {
                        currentLineNumber, err = strconv.ParseInt(matches[1], 10, 32)
                        if err != nil {
                                return nil, fmt.Errorf("error parsing line number from the hunk header: %w", err)
                        }
                } else if strings.HasPrefix(line, "+") {
                        result = append(result, &pbinternal.PrContents_File_Line{
                                Content: line[1:],
                                // see the use of strconv.ParseInt above: this is a safe downcast
                                // nolint: gosec
                                LineNumber: int32(currentLineNumber),
                        })

                        currentLineNumber++
                } else if !strings.HasPrefix(line, "-") {
                        currentLineNumber++
                }
        }

        if err := scanner.Err(); err != nil {
                return nil, fmt.Errorf("error reading patch: %w", err)
        }

        return &pbinternal.PrContents_File{
                Name:         filename,
                FilePatchUrl: patchUrl,
                PatchLines:   result,
        }, nil
}

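// getEcosystemForFile matches the file's base name against each configured
// Depfile glob and returns the corresponding ecosystem name, or DepEcosystemNone
// if nothing matches. For example, a mapping with Depfile "package-lock.json"
// and Name "npm" would match lockfiles in any directory.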
func (di *Diff) getEcosystemForFile(filename string) DependencyEcosystem {
        lastComponent := filepath.Base(filename)

        for _, ecoMapping := range di.cfg.Ecosystems {
                if match, _ := filepath.Match(ecoMapping.Depfile, lastComponent); match {
                        return DependencyEcosystem(ecoMapping.Name)
                }
        }
        return DepEcosystemNone
}

func (di *Diff) getParserForFile(filename string, logger zerolog.Logger) ecosystemParser {
        eco := di.getEcosystemForFile(filename)
        if eco == DepEcosystemNone {
                logger.Debug().
                        Str("filename", filename).
                        Msg("No ecosystem found, skipping")
                return nil
        }

        logger.Debug().
                Str("filename", filename).
                Str("package-ecosystem", string(eco)).
                Msg("matched ecosystem")

        return newEcosystemParser(eco)
}

// Computes the set of elements in updated which are not in base.
// Note: this function may permute (sort) the order of elements in base and updated.
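// For example, setDifference([]int{1, 2, 3}, []int{2, 3, 4}, cmp.Compare)
// returns []int{4}.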
func setDifference[Slice ~[]E, E any](base Slice, updated Slice, sorter func(a, b E) int) Slice {
        slices.SortFunc(base, sorter)
        slices.SortFunc(updated, sorter)

        baseIdx, newIdx := 0, 0
        ret := make(Slice, 0)
        for baseIdx < len(base) && newIdx < len(updated) {
                cmpResult := sorter(base[baseIdx], updated[newIdx])
                if cmpResult < 0 {
                        baseIdx++
                } else if cmpResult > 0 {
                        ret = append(ret, updated[newIdx])
                        newIdx++
                } else {
                        baseIdx++
                        newIdx++
                }
        }
        if newIdx < len(updated) {
                ret = append(ret, updated[newIdx:]...)
        }

        // TODO: add metric for number of deps scanned vs total deps

        return ret
}