• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

ducks-project / encoding-repair / 21393690412

27 Jan 2026 10:30AM UTC coverage: 91.282% (-1.2%) from 92.453%
21393690412

push

github

donaldinou
feat : normalization

3 of 4 new or added lines in 1 file covered. (75.0%)

7 existing lines in 1 file now uncovered.

534 of 585 relevant lines covered (91.28%)

21.53 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

96.35
/CharsetProcessor.php
1
<?php
2

3
/**
4
 * Part of EncodingRepair package.
5
 *
6
 * (c) Adrien Loyant <donald_duck@team-df.org>
7
 *
8
 * For the full copyright and license information, please view the LICENSE
9
 * file that was distributed with this source code.
10
 */
11

12
declare(strict_types=1);
13

14
namespace Ducks\Component\EncodingRepair;
15

16
use Ducks\Component\EncodingRepair\Detector\CachedDetector;
17
use Ducks\Component\EncodingRepair\Detector\DetectorChain;
18
use Ducks\Component\EncodingRepair\Detector\DetectorInterface;
19
use Ducks\Component\EncodingRepair\Detector\FileInfoDetector;
20
use Ducks\Component\EncodingRepair\Detector\MbStringDetector;
21
use Ducks\Component\EncodingRepair\Interpreter\ArrayInterpreter;
22
use Ducks\Component\EncodingRepair\Interpreter\InterpreterChain;
23
use Ducks\Component\EncodingRepair\Interpreter\ObjectInterpreter;
24
use Ducks\Component\EncodingRepair\Interpreter\PropertyMapperInterface;
25
use Ducks\Component\EncodingRepair\Interpreter\StringInterpreter;
26
use Ducks\Component\EncodingRepair\Interpreter\TypeInterpreterInterface;
27
use Ducks\Component\EncodingRepair\Transcoder\IconvTranscoder;
28
use Ducks\Component\EncodingRepair\Transcoder\MbStringTranscoder;
29
use Ducks\Component\EncodingRepair\Transcoder\TranscoderChain;
30
use Ducks\Component\EncodingRepair\Transcoder\TranscoderInterface;
31
use Ducks\Component\EncodingRepair\Transcoder\UConverterTranscoder;
32
use InvalidArgumentException;
33
use Normalizer;
34
use RuntimeException;
35

36
/**
37
 * Charset processing service.
38
 *
39
 * @final
40
 */
41
final class CharsetProcessor implements CharsetProcessorInterface
42
{
43
    private const DEFAULT_ENCODINGS = [
44
        self::ENCODING_UTF8,
45
        self::WINDOWS_1252,
46
        self::ENCODING_ISO,
47
        self::ENCODING_ASCII,
48
    ];
49

50
    private const MAX_REPAIR_DEPTH = 5;
51
    private const JSON_DEFAULT_DEPTH = 512;
52
    private const DEFAULT_MAX_SAMPLES = 1;
53

54
    /**
55
     * @var TranscoderChain
56
     */
57
    private TranscoderChain $transcoderChain;
58

59
    /**
60
     * @var DetectorChain
61
     */
62
    private DetectorChain $detectorChain;
63

64
    /**
65
     * @var InterpreterChain
66
     */
67
    private InterpreterChain $interpreterChain;
68

69
    /**
     * Whitelist of encoding names accepted by validateEncoding().
     *
     * @var list<string>
     */
    private array $allowedEncodings;
73

74
    /**
     * Builds a processor with the default encodings, transcoders, detectors
     * and interpreters.
     *
     * Every reset*() method creates and fills its own collaborator, so nothing
     * is instantiated here directly; this also keeps the default encoding list
     * in a single place (resetEncodings()).
     */
    public function __construct()
    {
        $this->resetEncodings();
        $this->resetTranscoders();
        $this->resetDetectors();
        $this->resetInterpreters();
    }
93

94
    /**
     * Adds a transcoder to the transcoder chain, optionally with a priority.
     *
     * @inheritDoc
     */
    public function registerTranscoder(TranscoderInterface $transcoder, ?int $priority = null): self
    {
        $this->transcoderChain->register($transcoder, $priority);

        return $this;
    }
103

104
    /**
     * Removes a transcoder from the transcoder chain.
     *
     * @inheritDoc
     */
    public function unregisterTranscoder(TranscoderInterface $transcoder): self
    {
        $this->transcoderChain->unregister($transcoder);

        return $this;
    }
113

114
    /**
     * Registers several transcoders in the order given, each without an
     * explicit priority.
     *
     * @inheritDoc
     */
    public function queueTranscoders(TranscoderInterface ...$transcoders): self
    {
        foreach ($transcoders as $queued) {
            $this->registerTranscoder($queued);
        }

        return $this;
    }
125

126
    /**
     * Replaces the transcoder chain with a fresh one holding the three
     * built-in transcoders (UConverter, iconv, mbstring), registered in
     * that order.
     *
     * @inheritDoc
     */
    public function resetTranscoders(): self
    {
        $chain = new TranscoderChain();
        $chain->register(new UConverterTranscoder());
        $chain->register(new IconvTranscoder());
        $chain->register(new MbStringTranscoder());

        $this->transcoderChain = $chain;

        return $this;
    }
138

139
    /**
     * Adds a detector to the detector chain, optionally with a priority.
     *
     * @inheritDoc
     */
    public function registerDetector(DetectorInterface $detector, ?int $priority = null): self
    {
        $this->detectorChain->register($detector, $priority);

        return $this;
    }
148

149
    /**
     * Removes a detector from the detector chain.
     *
     * @inheritDoc
     */
    public function unregisterDetector(DetectorInterface $detector): self
    {
        $this->detectorChain->unregister($detector);

        return $this;
    }
158

159
    /**
     * Registers several detectors in the order given, each without an
     * explicit priority.
     *
     * @inheritDoc
     */
    public function queueDetectors(DetectorInterface ...$detectors): self
    {
        foreach ($detectors as $queued) {
            $this->registerDetector($queued);
        }

        return $this;
    }
170

171
    /**
     * Replaces the detector chain with a fresh one holding the built-in
     * detectors: an MbStringDetector wrapped in a CachedDetector, followed
     * by a FileInfoDetector.
     *
     * @inheritDoc
     */
    public function resetDetectors(): self
    {
        $chain = new DetectorChain();
        $chain->register(new CachedDetector(new MbStringDetector()));
        $chain->register(new FileInfoDetector());

        $this->detectorChain = $chain;

        return $this;
    }
184

185
    /**
     * Appends encoding names to the whitelist, skipping names that are
     * already present (strict comparison).
     *
     * @inheritDoc
     */
    public function addEncodings(string ...$encodings): self
    {
        foreach ($encodings as $name) {
            if (\in_array($name, $this->allowedEncodings, true)) {
                continue;
            }

            $this->allowedEncodings[] = $name;
        }

        return $this;
    }
198

199
    /**
     * Drops the given encoding names from the whitelist and re-indexes it
     * so it stays a list.
     *
     * @inheritDoc
     */
    public function removeEncodings(string ...$encodings): self
    {
        $remaining = \array_diff($this->allowedEncodings, $encodings);
        $this->allowedEncodings = \array_values($remaining);

        return $this;
    }
210

211
    /**
     * Returns the current encoding whitelist.
     *
     * @inheritDoc
     */
    public function getEncodings(): array
    {
        return $this->allowedEncodings;
    }
218

219
    /**
     * Restores the encoding whitelist to its built-in default set.
     *
     * @inheritDoc
     */
    public function resetEncodings(): self
    {
        $defaults = [
            self::AUTO,
            self::ENCODING_UTF8,
            self::WINDOWS_1252,
            self::ENCODING_ISO,
            self::ENCODING_ASCII,
            self::ENCODING_UTF16,
            self::ENCODING_UTF32,
        ];

        $this->allowedEncodings = $defaults;

        return $this;
    }
236

237
    /**
     * Adds a type interpreter to the interpreter chain, optionally with a
     * priority.
     *
     * @inheritDoc
     */
    public function registerInterpreter(TypeInterpreterInterface $interpreter, ?int $priority = null): self
    {
        $this->interpreterChain->register($interpreter, $priority);

        return $this;
    }
246

247
    /**
     * Removes a type interpreter from the interpreter chain.
     *
     * @inheritDoc
     */
    public function unregisterInterpreter(TypeInterpreterInterface $interpreter): self
    {
        $this->interpreterChain->unregister($interpreter);

        return $this;
    }
256

257
    /**
     * Attaches a property mapper for a class to the chain's object
     * interpreter.
     *
     * @inheritDoc
     *
     * @throws RuntimeException If the chain has no object interpreter
     */
    public function registerPropertyMapper(string $className, PropertyMapperInterface $mapper): self
    {
        $interpreter = $this->interpreterChain->getObjectInterpreter();

        if (null !== $interpreter) {
            $interpreter->registerMapper($className, $mapper);

            return $this;
        }

        throw new RuntimeException('ObjectInterpreter not registered in chain');
    }
272

273
    /**
     * Replaces the interpreter chain with a fresh one holding the built-in
     * string (priority 100), array (50) and object (30) interpreters.
     *
     * The array and object interpreters receive the new chain itself.
     *
     * @inheritDoc
     */
    public function resetInterpreters(): self
    {
        $chain = new InterpreterChain();
        $chain->register(new StringInterpreter(), 100);
        $chain->register(new ArrayInterpreter($chain), 50);
        $chain->register(new ObjectInterpreter($chain), 30);

        $this->interpreterChain = $chain;

        return $this;
    }
285

286
    /**
     * Reports UTF-8 for any string that passes the UTF-8 validity check;
     * otherwise asks the detector chain and falls back to ENCODING_ISO when
     * the chain returns null.
     *
     * @inheritDoc
     */
    public function detect(string $string, array $options = []): string
    {
        if (!$this->isValidUtf8($string)) {
            return $this->detectorChain->detect($string, $options) ?? self::ENCODING_ISO;
        }

        return self::ENCODING_UTF8;
    }
299

300
    /**
     * Detects one encoding for a whole collection.
     *
     * Collects up to $options['maxSamples'] non-empty strings (default 1;
     * non-integer or < 1 values fall back to the default) and runs detect()
     * on the first sample, or on the longest one when several were requested.
     * Returns UTF-8 when no non-empty string is found.
     *
     * @inheritDoc
     */
    public function detectBatch(iterable $items, array $options = []): string
    {
        /** @var mixed $limit */
        $limit = $options['maxSamples'] ?? self::DEFAULT_MAX_SAMPLES;
        if (!\is_int($limit) || $limit < 1) {
            $limit = self::DEFAULT_MAX_SAMPLES;
        }

        /** @var list<string> $samples */
        $samples = [];

        /** @var mixed $candidate */
        foreach ($items as $candidate) {
            if (!\is_string($candidate) || '' === $candidate) {
                continue;
            }

            $samples[] = $candidate;
            if (\count($samples) >= $limit) {
                break;
            }
        }

        // Nothing usable to inspect.
        if ([] === $samples) {
            return self::ENCODING_UTF8;
        }

        // Default behaviour: a single sample decides.
        if (1 === $limit) {
            return $this->detect($samples[0], $options);
        }

        // Several samples: the longest one wins (first one on ties).
        $longest = $samples[0];
        foreach ($samples as $sample) {
            if (\strlen($sample) > \strlen($longest)) {
                $longest = $sample;
            }
        }

        return $this->detect($longest, $options);
    }
346

347
    /**
     * Validates both encoding names, merges the options with the defaults,
     * then converts every string found in $data via the interpreter chain.
     *
     * @inheritDoc
     */
    public function toCharset(
        $data,
        string $to = self::ENCODING_UTF8,
        string $from = self::ENCODING_ISO,
        array $options = []
    ) {
        $this->validateEncoding($to, 'target');
        $this->validateEncoding($from, 'source');

        $settings = $this->configureOptions($options);

        // Per-value conversion handed to the interpreters.
        /**
         * @psalm-suppress MissingClosureParamType
         * @psalm-suppress MissingClosureReturnType
         */
        $convert = function ($value) use ($to, $from, $settings) {
            return $this->convertValue($value, $to, $from, $settings);
        };

        return $this->applyRecursive($data, $convert);
    }
370

371
    /**
     * Shortcut for toCharset() with UTF-8 as the target encoding.
     *
     * @param mixed $data Data to convert (string, array, object, ...)
     * @param string $from Source encoding
     * @param array<string, mixed> $options Conversion options
     *                                      - 'normalize': bool (default: true)
     *                                      - 'translit': bool (default: true)
     *                                      - 'ignore': bool (default: true)
     *
     * @return mixed
     *
     * @throws InvalidArgumentException If encoding is invalid
     *
     * @psalm-api
     */
    public function toUtf8($data, string $from = self::WINDOWS_1252, array $options = [])
    {
        $target = self::ENCODING_UTF8;

        return $this->toCharset($data, $target, $from, $options);
    }
391

392
    /**
     * Shortcut for toCharset() with WINDOWS_1252 as the target encoding.
     *
     * @param mixed $data Data to convert (string, array, object, ...)
     * @param string $from Source encoding
     * @param array<string, mixed> $options Conversion options
     *                                      - 'normalize': bool (default: true)
     *                                      - 'translit': bool (default: true)
     *                                      - 'ignore': bool (default: true)
     *
     * @return mixed
     *
     * @throws InvalidArgumentException If encoding is invalid
     *
     * @psalm-api
     */
    public function toIso($data, string $from = self::ENCODING_UTF8, array $options = [])
    {
        $target = self::WINDOWS_1252;

        return $this->toCharset($data, $target, $from, $options);
    }
412

413
    /**
     * Converts every item of an array, keeping the original keys.
     *
     * When $from is AUTO the source encoding is detected once for the whole
     * array (detectBatch()) and reused for every item.
     *
     * @inheritDoc
     */
    public function toCharsetBatch(
        array $items,
        string $to = self::ENCODING_UTF8,
        string $from = self::ENCODING_ISO,
        array $options = []
    ): array {
        $this->validateEncoding($to, 'target');
        $this->validateEncoding($from, 'source');

        $source = self::AUTO === $from ? $this->detectBatch($items, $options) : $from;

        $converted = [];
        /** @var mixed $item */
        foreach ($items as $key => $item) {
            /** @psalm-suppress MixedAssignment */
            $converted[$key] = $this->toCharset($item, $to, $source, $options);
        }

        return $converted;
    }
432

433
    /**
     * Batch-converts array items to UTF-8.
     *
     * Shortcut for toCharsetBatch() with UTF-8 as the target: with AUTO as
     * source, the encoding is detected once for the whole array instead of
     * once per item.
     *
     * @param array<mixed> $items Items to convert
     * @param string $from Source encoding (use AUTO for detection)
     * @param array<string, mixed> $options Conversion options
     *
     * @return array<mixed> Converted items
     *
     * @throws InvalidArgumentException If encoding is invalid
     *
     * @psalm-api
     */
    public function toUtf8Batch(
        array $items,
        string $from = self::WINDOWS_1252,
        array $options = []
    ): array {
        $target = self::ENCODING_UTF8;

        return $this->toCharsetBatch($items, $target, $from, $options);
    }
456

457
    /**
     * Batch-converts array items to WINDOWS_1252.
     *
     * Shortcut for toCharsetBatch() with WINDOWS_1252 as the target: with
     * AUTO as source, the encoding is detected once for the whole array
     * instead of once per item.
     *
     * @param array<mixed> $items Items to convert
     * @param string $from Source encoding (use AUTO for detection)
     * @param array<string, mixed> $options Conversion options
     *
     * @return array<mixed> Converted items
     *
     * @throws InvalidArgumentException If encoding is invalid
     *
     * @psalm-api
     */
    public function toIsoBatch(
        array $items,
        string $from = self::ENCODING_UTF8,
        array $options = []
    ): array {
        $target = self::WINDOWS_1252;

        return $this->toCharsetBatch($items, $target, $from, $options);
    }
480

481
    /**
     * Repairs every string found in $data via the interpreter chain.
     *
     * The options are merged with the defaults plus a 'maxDepth' default of
     * MAX_REPAIR_DEPTH; a caller-supplied 'maxDepth' takes precedence.
     *
     * @inheritDoc
     */
    public function repair(
        $data,
        string $to = self::ENCODING_UTF8,
        string $from = self::ENCODING_ISO,
        array $options = []
    ) {
        $settings = $this->configureOptions($options, ['maxDepth' => self::MAX_REPAIR_DEPTH]);

        /**
         * @psalm-suppress MissingClosureParamType
         * @psalm-suppress MissingClosureReturnType
         */
        $fix = function ($value) use ($to, $from, $settings) {
            return $this->repairValue($value, $to, $from, $settings);
        };

        return $this->applyRecursive($data, $fix);
    }
500

501
    /**
     * Repairs $data to UTF-8, then JSON-encodes it with JSON_THROW_ON_ERROR
     * always added to the caller's flags.
     *
     * @inheritDoc
     */
    public function safeJsonEncode(
        $data,
        int $flags = 0,
        int $depth = self::JSON_DEFAULT_DEPTH,
        string $from = self::WINDOWS_1252
    ): string {
        /** @var mixed $repaired */
        $repaired = $this->repair($data, self::ENCODING_UTF8, $from);

        // Failures must surface as exceptions, never as a false return.
        $effectiveFlags = $flags | \JSON_THROW_ON_ERROR;

        return \json_encode($repaired, $effectiveFlags, $depth);
    }
516

517
    /**
     * Repairs the JSON text to UTF-8, decodes it with JSON_THROW_ON_ERROR
     * always added to the caller's flags, then converts the decoded data
     * from UTF-8 to $to.
     *
     * @inheritDoc
     */
    public function safeJsonDecode(
        string $json,
        ?bool $associative = null,
        int $depth = self::JSON_DEFAULT_DEPTH,
        int $flags = 0,
        string $to = self::ENCODING_UTF8,
        string $from = self::WINDOWS_1252
    ) {
        // The decoder needs valid UTF-8 input.
        /** @var string $utf8Json */
        $utf8Json = $this->repair($json, self::ENCODING_UTF8, $from);

        // Failures must surface as exceptions, never as a null return.
        $effectiveFlags = $flags | \JSON_THROW_ON_ERROR;

        /** @var mixed $decoded */
        $decoded = \json_decode($utf8Json, $associative, $depth, $effectiveFlags);

        return $this->toCharset($decoded, $to, self::ENCODING_UTF8);
    }
538

539
    /**
     * Hands $data and the per-value callback to the interpreter chain,
     * passing an empty options array.
     *
     * @param mixed $data Data to process
     * @param callable $callback Callback given to the interpreters
     *
     * @return mixed Whatever the interpreter chain returns
     */
    private function applyRecursive($data, callable $callback)
    {
        $interpreterOptions = [];

        return $this->interpreterChain->interpret($data, $callback, $interpreterOptions);
    }
551

552
    /**
     * Converts one value to the target encoding.
     *
     * Non-strings are returned untouched. For strings, in order:
     * 1. target is not UTF-8 but the value is valid UTF-8: convert from
     *    UTF-8, ignoring $from;
     * 2. the value already checks out as $to: only normalize it;
     * 3. otherwise convert from $from.
     *
     * @param mixed $value Value to convert
     * @param string $to Target encoding
     * @param string $from Source encoding
     * @param array<string, mixed> $options Conversion configuration
     *
     * @return mixed
     */
    private function convertValue($value, string $to, string $from, array $options)
    {
        if (!\is_string($value)) {
            // @codeCoverageIgnoreStart
            return $value;
            // @codeCoverageIgnoreEnd
        }

        // mb_check_encoding() against a non-UTF-8 target can succeed for
        // text that is really UTF-8, so the UTF-8 case is decided first.
        $isUtf8Source = self::ENCODING_UTF8 !== $to && $this->isValidUtf8($value);
        if ($isUtf8Source) {
            return $this->convertString($value, $to, self::ENCODING_UTF8, $options);
        }

        return \mb_check_encoding($value, $to)
            ? $this->normalize($value, $to, $options)
            : $this->convertString($value, $to, $from, $options);
    }
584

585
    /**
     * Transcodes a string, falling back to the input when transcoding fails.
     *
     * @param string $data String to convert
     * @param string $to Target encoding
     * @param string $from Source encoding
     * @param array<string, mixed> $options Conversion options
     *
     * @return string Converted string, or $data if the conversion failed
     */
    private function convertString(string $data, string $to, string $from, array $options): string
    {
        $converted = $this->transcodeString($data, $to, $from, $options);

        return null === $converted ? $data : $converted;
    }
599

600
    /**
     * Resolves AUTO encodings, runs the transcoder chain and normalizes the
     * result when the resolved target is UTF-8.
     *
     * @param string $data String to transcode
     * @param string $to Target encoding (AUTO is replaced by the detected one)
     * @param string $from Source encoding (AUTO is replaced by the detected one)
     * @param array<string, mixed> $options Conversion options
     *
     * @return ?string Converted string, or null if the chain failed
     */
    private function transcodeString(string $data, string $to, string $from, array $options): ?string
    {
        $autoTarget = self::AUTO === $to;
        $autoSource = self::AUTO === $from;

        // Detection runs at most once, even when both sides are AUTO.
        $detected = null;
        if ($autoTarget || $autoSource) {
            // @codeCoverageIgnoreStart
            $detected = $this->detect($data, $options);
            // @codeCoverageIgnoreEnd
        }

        /** @var string $target */
        $target = $autoTarget ? $detected : $to;
        /** @var string $source */
        $source = $autoSource ? $detected : $from;

        $transcoded = $this->transcoderChain->transcode($data, $target, $source, $options);

        if (null === $transcoded || self::ENCODING_UTF8 !== $target) {
            return $transcoded;
        }

        return $this->normalize($transcoded, $target, $options);
    }
633

634
    /**
     * Repairs one value: peels stacked encoding layers off a string, then
     * converts the result to the target encoding.
     *
     * Non-strings are returned untouched. A non-integer 'maxDepth' option is
     * replaced by MAX_REPAIR_DEPTH.
     *
     * @param mixed $value Value to repair
     * @param string $to Target encoding
     * @param string $from Glitch encoding
     * @param array<string, mixed> $options Configuration
     *
     * @return mixed
     */
    private function repairValue($value, string $to, string $from, array $options)
    {
        if (!\is_string($value)) {
            // @codeCoverageIgnoreStart
            return $value;
            // @codeCoverageIgnoreEnd
        }

        /** @var mixed $depth */
        $depth = $options['maxDepth'] ?? self::MAX_REPAIR_DEPTH;
        $depth = \is_int($depth) ? $depth : self::MAX_REPAIR_DEPTH;

        $peeled = $this->peelEncodingLayers($value, $from, $depth);

        // A peeled string that is valid UTF-8 is converted from UTF-8;
        // anything else is still treated as $from.
        $source = $this->isValidUtf8($peeled) ? self::ENCODING_UTF8 : $from;

        return $this->toCharset($peeled, $to, $source, $options);
    }
663

664
    /**
     * Attempts to remove multiple encoding layers.
     *
     * Steps: scrub invalid UTF-8 sequences; return early when neither
     * "\xC3\x82" nor "\xC3\x83" occurs; otherwise repeatedly transcode
     * UTF-8 -> $from while each pass yields a strictly shorter, still valid
     * UTF-8 string (at most $maxDepth passes); finally fall back to pattern
     * replacement when no pass succeeded or "\xC3\x82" is still present.
     *
     * @param string $value String to repair
     * @param string $from Encoding to reverse
     * @param int $maxDepth Maximum iterations
     *
     * @return string Repaired string
     */
    private function peelEncodingLayers(string $value, string $from, int $maxDepth): string
    {
        // Scrub invalid UTF-8 sequences first; mb_scrub() is preferred when available.
        $scrubbed = \function_exists('mb_scrub')
            ? \mb_scrub($value, 'UTF-8')
            : \mb_convert_encoding($value, 'UTF-8', 'UTF-8');

        if (false !== $scrubbed) {
            $value = $scrubbed;
        }

        // Without either marker sequence there is nothing to peel.
        $hasMarker = false !== \strpos($value, "\xC3\x82") || false !== \strpos($value, "\xC3\x83");
        if (!$hasMarker) {
            return $value;
        }

        $current = $value;
        $rawOptions = ['normalize' => false, 'translit' => false, 'ignore' => false];

        for ($pass = 0; $pass < $maxDepth && $this->isValidUtf8($current); ++$pass) {
            // Reverse one layer (UTF-8 -> $from).
            $candidate = $this->transcodeString($current, $from, self::ENCODING_UTF8, $rawOptions);

            // Stop on failure, on no progress, when the result did not
            // shrink (guards against looping), or when it is no longer UTF-8.
            $rejected = null === $candidate
                || $candidate === $current
                || \strlen($candidate) >= \strlen($current)
                || !$this->isValidUtf8($candidate);

            if ($rejected) {
                break;
            }

            $current = $candidate;
        }

        // Pattern-based repair (ForceUTF8 approach) as a last resort.
        $needsPatternRepair = $current === $value || false !== \strpos($current, "\xC3\x82");
        if ($needsPatternRepair) {
            $current = $this->repairByPatternReplacement($current);
        }

        return $current;
    }
717

718
    /**
     * Repairs UTF-8 strings using pattern replacement (ForceUTF8 approach).
     *
     * Two rules are applied in order within a single preg_replace() call:
     * every "\xC3\x82" sequence is removed, then "\xC3\x83\xC2" followed by
     * a byte in the range A0-FF is collapsed to "\xC3" plus that byte.
     *
     * @param string $value String to repair
     *
     * @return string Repaired string, or $value if preg_replace() fails
     */
    private function repairByPatternReplacement(string $value): string
    {
        $patterns = ['/\xC3\x82/', '/\xC3\x83\xC2([\xA0-\xFF])/'];
        $replacements = ['', "\xC3$1"];

        $repaired = \preg_replace($patterns, $replacements, $value);

        return null === $repaired ? $value : $repaired;
    }
734

735
    /**
     * Applies Normalizer::normalize() to a string when appropriate.
     *
     * The string is returned unchanged when the target is not UTF-8, when
     * the 'normalize' option is exactly false, when the Normalizer class is
     * missing, or when normalization itself fails.
     *
     * @param string $value String to normalize
     * @param string $to Target encoding
     * @param array<string, mixed> $options Configuration
     *
     * @return string Normalized or original string
     *
     * @codeCoverageIgnore
     */
    private function normalize(string $value, string $to, array $options): string
    {
        $wanted = self::ENCODING_UTF8 === $to && false !== ($options['normalize'] ?? true);

        if (!$wanted || !\class_exists(Normalizer::class)) {
            return $value;
        }

        $normalized = Normalizer::normalize($value);

        return false === $normalized ? $value : $normalized;
    }
761

762
    /**
     * Checks if string is valid UTF-8.
     *
     * Please note that this relies on mb_check_encoding(), which may also
     * return true for strings that were not really written as UTF-8.
     *
     * @param string $string String to check
     *
     * @return bool True if valid UTF-8
     */
    private function isValidUtf8(string $string): bool
    {
        $encoding = self::ENCODING_UTF8;

        return \mb_check_encoding($string, $encoding);
    }
776

777
    /**
     * Validates an encoding name against the whitelist.
     *
     * The name is accepted when either its exact spelling or its upper-cased
     * form is whitelisted.
     *
     * @param string $encoding Encoding to validate
     * @param string $type Type for error message (e.g., 'source', 'target')
     *
     * @throws InvalidArgumentException If encoding is not allowed
     */
    private function validateEncoding(string $encoding, string $type): void
    {
        $allowed = $this->allowedEncodings;

        if (\in_array($encoding, $allowed, true) || \in_array(\strtoupper($encoding), $allowed, true)) {
            return;
        }

        $message = \sprintf(
            'Invalid %s encoding: "%s". Allowed: %s',
            $type,
            $encoding,
            \implode(', ', $allowed)
        );

        throw new InvalidArgumentException($message);
    }
803

804
    /**
     * Builds conversion configuration with defaults.
     *
     * Layers are applied with array_replace(), later layers overriding
     * earlier ones: built-in defaults first, then each $replacements layer
     * in order, and finally the user options, which therefore always win.
     *
     * @param array<string, mixed> $options User-provided options
     * @param array<string, mixed> ...$replacements Additional default layers
     *
     * @return array<string, mixed> Merged configuration
     *
     * @example
     * // Basic usage
     * $config = $this->configureOptions(['normalize' => false]);
     *
     * // With additional defaults
     * $config = $this->configureOptions(
     *     ['normalize' => false],
     *     ['maxDepth' => 10]
     * );
     */
    private function configureOptions(array $options, array ...$replacements): array
    {
        $defaults = [
            'normalize' => true,
            'translit' => true,
            'ignore' => true,
            'encodings' => self::DEFAULT_ENCODINGS,
        ];

        // User options go last so they override every other layer.
        $layers = $replacements;
        $layers[] = $options;

        return \array_replace($defaults, ...$layers);
    }
833
}
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2026 Coveralls, Inc