• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

systemsdk / phpcpd / #4

30 Dec 2024 04:42PM UTC coverage: 75.789%. Remained the same
#4

push

DKravtsov
phpcpd 8.0.0 release. Made codebase refactoring. Updated requirements php 8.3, updated composer dependencies, updated tests to the PHPUnit 11. Updated dev environment to the php 8.4, Phing 3.0, added code quality tools: ecs, phpstan.

271 of 379 new or added lines in 20 files covered. (71.5%)

65 existing lines in 13 files now uncovered.

648 of 855 relevant lines covered (75.79%)

2.18 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

94.58
/src/Detector/Strategy/SuffixTree/ApproximateCloneDetectingSuffixTree.php
1
<?php
2

3
declare(strict_types=1);
4

5
namespace Systemsdk\PhpCPD\Detector\Strategy\SuffixTree;
6

7
/**
8
 * An extension of the suffix tree adding an algorithm for finding approximate
9
 * clones, i.e. substrings which are similar.
10
 *
11
 * @author $Author: hummelb $
12
 *
13
 * @version $Revision: 43151 $
14
 *
15
 * @ConQAT.Rating GREEN Hash: BB94CD690760BC239F04D32D5BCAC33E
16
 */
17
class ApproximateCloneDetectingSuffixTree extends SuffixTree
18
{
19
    /**
20
     * The minimal length of clones to return.
21
     *
22
     * @var int
23
     */
24
    protected $minLength = 70;
25

26
    /**
27
     * The number of leaves reachable from the given node (1 for leaves).
28
     *
29
     * @var int[]
30
     * */
31
    private $leafCount = [];
32

33
    /**
34
     * This is the distance between two entries in the {@link #cloneInfos} map.
35
     *
36
     * @var int
37
     */
38
    private $INDEX_SPREAD = 10;
39

40
    /**
41
     * This map stores for each position the relevant clone infos.
42
     *
43
     * @var array<CloneInfo[]>
44
     */
45
    private $cloneInfos = [];
46

47
    /**
48
     * The maximal length of a clone. This influences the size of the
49
     * (quadratic) {@link #edBuffer}.
50
     *
51
     * @var int
52
     */
53
    private $MAX_LENGTH = 1024;
54

55
    /**
56
     * Buffer used for calculating edit distance.
57
     *
58
     * @var array<int[]>
59
     */
60
    private $edBuffer = [];
61

62
    /**
63
     * Number of units that must be equal at the start of a clone.
64
     *
65
     * @var int
66
     */
67
    private $headEquality = 10;
68

69
    /**
70
     * Create a new suffix tree from a given word. The word given as parameter
71
     * is used internally and should not be modified anymore, so copy it before
72
     * if required.
73
     * <p>
74
     * This only word correctly if the given word is closed using a sentinel
75
     * character.
76
     *
77
     * @param AbstractToken[] $word List of tokens to analyze
78
     */
79
    public function __construct(array $word)
80
    {
81
        parent::__construct($word);
1✔
82

83
        $arr = array_fill(0, $this->MAX_LENGTH, 0);
1✔
84
        $this->edBuffer = array_fill(0, $this->MAX_LENGTH, $arr);
1✔
85
        $this->ensureChildLists();
1✔
86
        $this->leafCount = array_fill(0, $this->numNodes, 0);
1✔
87
        $this->initLeafCount(0);
1✔
88
    }
89

90
    /**
91
     * @todo Add options:
92
     *   --min-tokens
93
     *   --min-lines
94
     *   --edit-distance
95
     * @todo Possibly add consumer from original code.
96
     */
97

98
    /**
99
     * Finds all clones in the string (List) used in the constructor.
100
     *
101
     * @param int $minLength    the minimal length of a clone in tokens (not lines)
102
     * @param int $maxErrors    the maximal number of errors/gaps allowed
103
     * @param int $headEquality the number of elements which have to be the same at the beginning of a clone
104
     *
105
     * @return CloneInfo[]
106
     */
107
    public function findClones(int $minLength, int $maxErrors, int $headEquality): array
108
    {
109
        $this->minLength = $minLength;
1✔
110
        $this->headEquality = $headEquality;
1✔
111
        $this->cloneInfos = [];
1✔
112

113
        for ($i = 0; $i < count($this->word); $i++) {
1✔
114
            // Do quick start, as first character has to match anyway.
115
            $node = $this->nextNode->get(0, $this->word[$i]);
1✔
116

117
            if ($node < 0 || $this->leafCount[$node] <= 1) {
1✔
118
                continue;
1✔
119
            }
120

121
            // we know that we have an exact match of at least 'length'
122
            // characters, as the word itself is part of the suffix tree.
123
            $length = $this->nodeWordEnd[$node] - $this->nodeWordBegin[$node];
1✔
124
            $numReported = 0;
1✔
125

126
            for ($e = $this->nodeChildFirst[$node]; $e >= 0; $e = $this->nodeChildNext[$e]) {
1✔
127
                if ($this->matchWord($i, $i + $length, $this->nodeChildNode[$e], $length, $maxErrors)) {
1✔
128
                    $numReported++;
1✔
129
                }
130
            }
131

132
            if ($length >= $this->minLength && $numReported != 1) {
1✔
UNCOV
133
                $this->reportClone($i, $i + $length, $node, $length, $length);
×
134
            }
135
        }
136

137
        $map = [];
1✔
138

139
        for ($index = 0; $index <= count($this->word); $index++) {
1✔
140
            /** @var CloneInfo[] */
141
            $existingClones = $this->cloneInfos[$index] ?? null;
1✔
142

143
            if (!empty($existingClones)) {
1✔
144
                foreach ($existingClones as $ci) {
1✔
145
                    // length = number of tokens
146
                    // TODO: min token length
147
                    if ($ci->length > $minLength) {
1✔
148
                        $previousCi = $map[$ci->token->line] ?? null;
1✔
149

150
                        if ($previousCi === null) {
1✔
151
                            $map[$ci->token->line] = $ci;
1✔
152
                        } elseif ($ci->length > $previousCi->length) {
1✔
153
                            $map[$ci->token->line] = $ci;
1✔
154
                        }
155
                    }
156
                }
157
            }
158
        }
159

160
        /** @var CloneInfo[] */
161
        $values = array_values($map);
1✔
162
        usort($values, static function (CloneInfo $a, CloneInfo $b): int {
1✔
UNCOV
163
            return $b->length - $a->length;
×
164
        });
1✔
165

166
        return $values;
1✔
167
    }
168

169
    /**
170
     * This should return true, if the provided character is not allowed to
171
     * match with anything else (e.g. is a sentinel).
172
     */
173
    protected function mayNotMatch(AbstractToken $token): bool
174
    {
175
        return $token instanceof Sentinel;
1✔
176
    }
177

178
    /**
179
     * This method is called whenever the {@link #MAX_LENGTH} is to small and
180
     * hence the {@link #edBuffer} was not large enough. This may cause that a
181
     * really large clone is reported in multiple chunks of size
182
     * {@link #MAX_LENGTH} and potentially minor parts of such a clone might be
183
     * lost.
184
     */
185
    protected function reportBufferShortage(int $leafStart, int $leafLength): void
186
    {
UNCOV
187
        print 'Encountered buffer shortage: ' . $leafStart . ' ' . $leafLength . "\n";
×
188
    }
189

190
    /**
191
     * Initializes the {@link #leafCount} array which given for each node the
192
     * number of leaves reachable from it (where leaves obtain a value of 1).
193
     */
194
    private function initLeafCount(int $node): void
195
    {
196
        $this->leafCount[$node] = 0;
1✔
197

198
        for ($e = $this->nodeChildFirst[$node]; $e >= 0; $e = $this->nodeChildNext[$e]) {
1✔
199
            $this->initLeafCount($this->nodeChildNode[$e]);
1✔
200
            $this->leafCount[$node] += $this->leafCount[$this->nodeChildNode[$e]];
1✔
201
        }
202

203
        if ($this->leafCount[$node] == 0) {
1✔
204
            $this->leafCount[$node] = 1;
1✔
205
        }
206
    }
207

208
    /**
209
     * Performs the approximative matching between the input word and the tree.
210
     *
211
     * @param int $wordStart      the start position of the currently matched word (position in
212
     *                            the input word)
213
     * @param int $wordPosition   the current position along the input word
214
     * @param int $node           the node we are currently at (i.e. the edge leading to this
215
     *                            node is relevant to us).
216
     * @param int $nodeWordLength the length of the word found along the nodes (this may be
217
     *                            different from the length along the input word due to gaps)
218
     * @param int $maxErrors      the number of errors still allowed
219
     *
220
     * @return bool whether some clone was reported
221
     */
222
    private function matchWord(int $wordStart, int $wordPosition, int $node, int $nodeWordLength, int $maxErrors): bool
223
    {
224
        // We are aware that this method is longer than desirable for code
225
        // reading. However, we currently do not see a refactoring that has a
226
        // sensible cost-benefit ratio. Suggestions are welcome!
227

228
        // self match?
229
        if ($this->leafCount[$node] == 1 && $this->nodeWordBegin[$node] == $wordPosition) {
1✔
230
            return false;
1✔
231
        }
232

233
        $currentNodeWordLength = min($this->nodeWordEnd[$node] - $this->nodeWordBegin[$node], $this->MAX_LENGTH - 1);
1✔
234

235
        // Do min edit distance
236
        $currentLength = $this->calculateMaxLength(
1✔
237
            $wordStart,
1✔
238
            $wordPosition,
1✔
239
            $node,
1✔
240
            $maxErrors,
1✔
241
            $currentNodeWordLength
1✔
242
        );
1✔
243

244
        if ($currentLength == 0) {
1✔
245
            return false;
1✔
246
        }
247

248
        if ($currentLength >= $this->MAX_LENGTH - 1) {
1✔
UNCOV
249
            $this->reportBufferShortage($this->nodeWordBegin[$node], $currentNodeWordLength);
×
250
        }
251

252
        // calculate cheapest match
253
        $best = $maxErrors + 42;
1✔
254
        $iBest = 0;
1✔
255
        $jBest = 0;
1✔
256

257
        for ($k = 0; $k <= $currentLength; $k++) {
1✔
258
            $i = $currentLength - $k;
1✔
259
            $j = $currentLength;
1✔
260

261
            if ($this->edBuffer[$i][$j] < $best) {
1✔
262
                $best = $this->edBuffer[$i][$j];
1✔
263
                $iBest = $i;
1✔
264
                $jBest = $j;
1✔
265
            }
266

267
            $i = $currentLength;
1✔
268
            $j = $currentLength - $k;
1✔
269

270
            if ($this->edBuffer[$i][$j] < $best) {
1✔
271
                $best = $this->edBuffer[$i][$j];
1✔
272
                $iBest = $i;
1✔
273
                $jBest = $j;
1✔
274
            }
275
        }
276

277
        while (
278
            $wordPosition + $iBest < count($this->word)
1✔
279
            && $jBest < $currentNodeWordLength
1✔
280
            && $this->word[$wordPosition + $iBest] != $this->word[$this->nodeWordBegin[$node] + $jBest]
1✔
281
            && $this->word[$wordPosition + $iBest]->equals($this->word[$this->nodeWordBegin[$node] + $jBest])
1✔
282
        ) {
UNCOV
283
            $iBest++;
×
UNCOV
284
            $jBest++;
×
285
        }
286

287
        $numReported = 0;
1✔
288

289
        if ($currentLength == $currentNodeWordLength) {
1✔
290
            // we may proceed
291
            for ($e = $this->nodeChildFirst[$node]; $e >= 0; $e = $this->nodeChildNext[$e]) {
1✔
292
                if (
293
                    $this->matchWord(
1✔
294
                        $wordStart,
1✔
295
                        $wordPosition + $iBest,
1✔
296
                        $this->nodeChildNode[$e],
1✔
297
                        $nodeWordLength + $jBest,
1✔
298
                        $maxErrors - $best
1✔
299
                    )
1✔
300
                ) {
301
                    $numReported++;
1✔
302
                }
303
            }
304
        }
305

306
        // do not report locally if had reports in exactly one subtree (would be
307
        // pure subclone)
308
        if ($numReported == 1) {
1✔
309
            return true;
1✔
310
        }
311

312
        // disallow tail changes
313
        while (
314
            $iBest > 0
1✔
315
            && $jBest > 0
1✔
316
            && !$this->word[$wordPosition + $iBest - 1]->equals($this->word[$this->nodeWordBegin[$node] + $jBest - 1])
1✔
317
        ) {
318
            if (
319
                $iBest > 1
1✔
320
                &&
321
                    $this->word[$wordPosition + $iBest - 2]->equals(
1✔
322
                        $this->word[$this->nodeWordBegin[$node] + $jBest - 1]
1✔
323
                    )
1✔
324
            ) {
UNCOV
325
                $iBest--;
×
326
            } elseif (
327
                $jBest > 1
1✔
328
                && $this->word[$wordPosition + $iBest - 1]->equals(
1✔
329
                    $this->word[$this->nodeWordBegin[$node] + $jBest - 2]
1✔
330
                )
1✔
331
            ) {
UNCOV
332
                $jBest--;
×
333
            } else {
334
                $iBest--;
1✔
335
                $jBest--;
1✔
336
            }
337
        }
338

339
        // report if real clone
340
        if ($iBest > 0 && $jBest > 0) {
1✔
341
            $numReported++;
1✔
342
            $this->reportClone($wordStart, $wordPosition + $iBest, $node, $jBest, $nodeWordLength + $jBest);
1✔
343
        }
344

345
        return $numReported > 0;
1✔
346
    }
347

348
    /**
349
     * Calculates the maximum length we may take along the word to the current
350
     * $node (respecting the number of errors to make). *.
351
     *
352
     * @param int $wordStart             the start position of the currently matched word (position in
353
     *                                   the input word)
354
     * @param int $wordPosition          the current position along the input word
355
     * @param int $node                  the node we are currently at (i.e. the edge leading to this
356
     *                                   node is relevant to us).
357
     * @param int $maxErrors             the number of errors still allowed
358
     * @param int $currentNodeWordLength the length of the word found along the nodes (this may be
359
     *                                   different from the actual length due to buffer limits)
360
     *
361
     * @return int the maximal length that can be taken
362
     */
363
    private function calculateMaxLength(
364
        int $wordStart,
365
        int $wordPosition,
366
        int $node,
367
        int $maxErrors,
368
        int $currentNodeWordLength
369
    ): int {
370
        $this->edBuffer[0][0] = 0;
1✔
371
        $currentLength = 1;
1✔
372

373
        for (; $currentLength <= $currentNodeWordLength; $currentLength++) {
1✔
374
            /** @var int<1, max> $best */
375
            $best = $currentLength;
1✔
376
            $this->edBuffer[0][$currentLength] = $currentLength;
1✔
377
            $this->edBuffer[$currentLength][0] = $currentLength;
1✔
378

379
            if ($wordPosition + $currentLength >= count($this->word)) {
1✔
380
                break;
1✔
381
            }
382

383
            // deal with case that character may not be matched (sentinel!)
384
            $iChar = $this->word[$wordPosition + $currentLength - 1];
1✔
385
            $jChar = $this->word[$this->nodeWordBegin[$node] + $currentLength - 1];
1✔
386

387
            if ($this->mayNotMatch($iChar) || $this->mayNotMatch($jChar)) {
1✔
388
                break;
1✔
389
            }
390

391
            // usual matrix completion for edit distance
392
            for ($k = 1; $k < $currentLength; $k++) {
1✔
393
                $best = min(
1✔
394
                    $best,
1✔
395
                    $this->fillEDBuffer(
1✔
396
                        $k,
1✔
397
                        $currentLength,
1✔
398
                        $wordPosition,
1✔
399
                        $this->nodeWordBegin[$node]
1✔
400
                    )
1✔
401
                );
1✔
402
            }
403

404
            for ($k = 1; $k < $currentLength; $k++) {
1✔
405
                $best = min(
1✔
406
                    $best,
1✔
407
                    $this->fillEDBuffer(
1✔
408
                        $currentLength,
1✔
409
                        $k,
1✔
410
                        $wordPosition,
1✔
411
                        $this->nodeWordBegin[$node]
1✔
412
                    )
1✔
413
                );
1✔
414
            }
415
            $best = min(
1✔
416
                $best,
1✔
417
                $this->fillEDBuffer(
1✔
418
                    $currentLength,
1✔
419
                    $currentLength,
1✔
420
                    $wordPosition,
1✔
421
                    $this->nodeWordBegin[$node]
1✔
422
                )
1✔
423
            );
1✔
424

425
            if (
426
                $best > $maxErrors
1✔
427
                || $wordPosition - $wordStart + $currentLength <= $this->headEquality && $best > 0
1✔
428
            ) {
429
                break;
1✔
430
            }
431
        }
432
        $currentLength--;
1✔
433

434
        return $currentLength;
1✔
435
    }
436

437
    private function reportClone(
438
        int $wordBegin,
439
        int $wordEnd,
440
        int $currentNode,
441
        int $nodeWordPos,
442
        int $nodeWordLength
443
    ): void {
444
        $length = $wordEnd - $wordBegin;
1✔
445

446
        if ($length < $this->minLength || $nodeWordLength < $this->minLength) {
1✔
447
            return;
1✔
448
        }
449

450
        // NB: 0 and 0 are two indicate the template S and T for Psalm, in lack of generics.
451
        $otherClones = new PairList(16); // , 0, 0
1✔
452
        $this->findRemainingClones(
1✔
453
            $otherClones,
1✔
454
            $nodeWordLength,
1✔
455
            $currentNode,
1✔
456
            $this->nodeWordEnd[$currentNode] - $this->nodeWordBegin[$currentNode] - $nodeWordPos,
1✔
457
            $wordBegin
1✔
458
        );
1✔
459

460
        $occurrences = 1 + $otherClones->size();
1✔
461

462
        // check whether we may start from here
463
        $t = $this->word[$wordBegin];
1✔
464
        $newInfo = new CloneInfo($length, $wordBegin, $occurrences, $t, $otherClones);
1✔
465

466
        for ($index = max(0, $wordBegin - $this->INDEX_SPREAD + 1); $index <= $wordBegin; $index++) {
1✔
467
            $existingClones = $this->cloneInfos[$index] ?? null;
1✔
468

469
            if ($existingClones != null) {
1✔
470
                //for (CloneInfo cloneInfo : $existingClones) {
471
                foreach ($existingClones as $cloneInfo) {
1✔
472
                    if ($cloneInfo->dominates($newInfo, $wordBegin - $index)) {
1✔
473
                        // we already have a dominating clone, so ignore
474
                        return;
1✔
475
                    }
476
                }
477
            }
478
        }
479

480
        // add clone to $otherClones to avoid getting more duplicates
481
        for ($i = $wordBegin; $i < $wordEnd; $i += $this->INDEX_SPREAD) {
1✔
482
            $this->cloneInfos[$i][]
1✔
483
                = new CloneInfo($length - ($i - $wordBegin), $wordBegin, $occurrences, $t, $otherClones);
1✔
484
        }
485
        $t = $this->word[$wordBegin];
1✔
486

487
        for ($clone = 0; $clone < $otherClones->size(); $clone++) {
1✔
488
            $start = $otherClones->getFirst($clone);
1✔
489
            $otherLength = $otherClones->getSecond($clone);
1✔
490

491
            for ($i = 0; $i < $otherLength; $i += $this->INDEX_SPREAD) {
1✔
492
                $this->cloneInfos[$start + $i][]
1✔
493
                    = new CloneInfo($otherLength - $i, $wordBegin, $occurrences, $t, $otherClones);
1✔
494
            }
495
        }
496
    }
497

498
    /**
499
     * Fills the edit distance buffer at position (i,j).
500
     *
501
     * @param int $i       the first index of the buffer
502
     * @param int $j       the second index of the buffer
503
     * @param int $iOffset the offset where the word described by $i starts
504
     * @param int $jOffset the offset where the word described by $j starts
505
     *
506
     * @return int the value inserted into the buffer
507
     */
508
    private function fillEDBuffer(int $i, int $j, int $iOffset, int $jOffset): int
509
    {
510
        $iChar = $this->word[$iOffset + $i - 1];
1✔
511
        $jChar = $this->word[$jOffset + $j - 1];
1✔
512

513
        $insertDelete = 1 + min($this->edBuffer[$i - 1][$j], $this->edBuffer[$i][$j - 1]);
1✔
514
        $change = $this->edBuffer[$i - 1][$j - 1] + ($iChar->equals($jChar) ? 0 : 1);
1✔
515

516
        return $this->edBuffer[$i][$j] = min($insertDelete, $change);
1✔
517
    }
518

519
    /**
520
     * Fills a list of pairs giving the start positions and lengths of the
521
     * remaining clones.
522
     *
523
     * @param PairList $clonePositions the clone positions being filled (start position and length)
524
     * @param int      $nodeWordLength the length of the word along the nodes
525
     * @param int      $currentNode    the node we are currently at
526
     * @param int      $distance       the distance along the word leading to the current node
527
     * @param int      $wordStart      the start of the currently searched word
528
     */
529
    private function findRemainingClones(
530
        PairList $clonePositions,
531
        int $nodeWordLength,
532
        int $currentNode,
533
        int $distance,
534
        int $wordStart
535
    ): void {
536
        for (
537
            $nextNode = $this->nodeChildFirst[$currentNode];
1✔
538
            $nextNode >= 0;
1✔
539
            $nextNode = $this->nodeChildNext[$nextNode]
1✔
540
        ) {
UNCOV
541
            $node = $this->nodeChildNode[$nextNode];
×
UNCOV
542
            $this->findRemainingClones($clonePositions, $nodeWordLength, $node, $distance
×
543
                    + $this->nodeWordEnd[$node] - $this->nodeWordBegin[$node], $wordStart);
×
544
        }
545

546
        if ($this->nodeChildFirst[$currentNode] < 0) {
1✔
547
            $start = count($this->word) - $distance - $nodeWordLength;
1✔
548

549
            if ($start != $wordStart) {
1✔
550
                $clonePositions->add($start, $nodeWordLength);
1✔
551
            }
552
        }
553
    }
554
}
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2026 Coveralls, Inc