• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

ducks-project / encoding-repair / 21251220455

22 Jan 2026 01:58PM UTC coverage: 95.165% (+3.6%) from 91.599%
21251220455

push

github

donaldinou
feat : add test

374 of 393 relevant lines covered (95.17%)

15.14 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

88.76
/CharsetProcessor.php
1
<?php
2

3
/**
4
 * Part of EncodingRepair package.
5
 *
6
 * (c) Adrien Loyant <donald_duck@team-df.org>
7
 *
8
 * For the full copyright and license information, please view the LICENSE
9
 * file that was distributed with this source code.
10
 */
11

12
declare(strict_types=1);
13

14
namespace Ducks\Component\EncodingRepair;
15

16
use Ducks\Component\EncodingRepair\Detector\CachedDetector;
17
use Ducks\Component\EncodingRepair\Detector\DetectorChain;
18
use Ducks\Component\EncodingRepair\Detector\DetectorInterface;
19
use Ducks\Component\EncodingRepair\Detector\FileInfoDetector;
20
use Ducks\Component\EncodingRepair\Detector\MbStringDetector;
21
use Ducks\Component\EncodingRepair\Transcoder\IconvTranscoder;
22
use Ducks\Component\EncodingRepair\Transcoder\MbStringTranscoder;
23
use Ducks\Component\EncodingRepair\Transcoder\TranscoderChain;
24
use Ducks\Component\EncodingRepair\Transcoder\TranscoderInterface;
25
use Ducks\Component\EncodingRepair\Transcoder\UConverterTranscoder;
26
use InvalidArgumentException;
27
use Normalizer;
28
use RuntimeException;
29

30
/**
31
 * Charset processing service.
32
 *
33
 * @final
34
 */
35
final class CharsetProcessor implements CharsetProcessorInterface
36
{
37
    private const DEFAULT_ENCODINGS = [
38
        self::ENCODING_UTF8,
39
        self::WINDOWS_1252,
40
        self::ENCODING_ISO,
41
        self::ENCODING_ASCII,
42
    ];
43

44
    private const MAX_REPAIR_DEPTH = 5;
45
    private const JSON_DEFAULT_DEPTH = 512;
46

47
    /**
48
     * @var TranscoderChain
49
     */
50
    private TranscoderChain $transcoderChain;
51

52
    /**
53
     * @var DetectorChain
54
     */
55
    private DetectorChain $detectorChain;
56

57
    /**
58
     * @var list<string>
59
     */
60
    private $allowedEncodings;
61

62
    /**
     * Builds a processor configured with the default transcoders, detectors
     * and encoding whitelist.
     *
     * Delegates to the reset*() methods so that each default is declared in a
     * single place and each chain is instantiated only once.
     */
    public function __construct()
    {
        $this->resetTranscoders();
        $this->resetDetectors();
        $this->resetEncodings();
    }
79

80
    /**
     * Registers a transcoder on the internal transcoder chain.
     *
     * @inheritDoc
     */
    public function registerTranscoder(TranscoderInterface $transcoder, ?int $priority = null): self
    {
        $chain = $this->transcoderChain;
        $chain->register($transcoder, $priority);

        return $this;
    }
89

90
    /**
     * Removes a transcoder from the internal transcoder chain.
     *
     * @inheritDoc
     */
    public function unregisterTranscoder(TranscoderInterface $transcoder): self
    {
        $chain = $this->transcoderChain;
        $chain->unregister($transcoder);

        return $this;
    }
99

100
    /**
     * Registers several transcoders in the order given, each without an
     * explicit priority.
     *
     * @inheritDoc
     */
    public function queueTranscoders(TranscoderInterface ...$transcoders): self
    {
        foreach ($transcoders as $candidate) {
            // Same call registerTranscoder() makes when no priority is supplied.
            $this->transcoderChain->register($candidate, null);
        }

        return $this;
    }
111

112
    /**
     * Replaces the transcoder chain with a fresh one holding the built-in
     * transcoders (UConverter, iconv, mbstring), registered in that order.
     *
     * @inheritDoc
     */
    public function resetTranscoders(): self
    {
        $chain = new TranscoderChain();
        $this->transcoderChain = $chain;

        $chain->register(new UConverterTranscoder());
        $chain->register(new IconvTranscoder());
        $chain->register(new MbStringTranscoder());

        return $this;
    }
124

125
    /**
     * Registers a detector on the internal detector chain.
     *
     * @inheritDoc
     */
    public function registerDetector(DetectorInterface $detector, ?int $priority = null): self
    {
        $chain = $this->detectorChain;
        $chain->register($detector, $priority);

        return $this;
    }
134

135
    /**
     * Removes a detector from the internal detector chain.
     *
     * @inheritDoc
     */
    public function unregisterDetector(DetectorInterface $detector): self
    {
        $chain = $this->detectorChain;
        $chain->unregister($detector);

        return $this;
    }
144

145
    /**
     * Registers several detectors in the order given, each without an
     * explicit priority.
     *
     * @inheritDoc
     */
    public function queueDetectors(DetectorInterface ...$detectors): self
    {
        foreach ($detectors as $candidate) {
            // Same call registerDetector() makes when no priority is supplied.
            $this->detectorChain->register($candidate, null);
        }

        return $this;
    }
156

157
    /**
     * Replaces the detector chain with a fresh one holding the built-in
     * detectors: an MbStringDetector wrapped in a CachedDetector, followed by
     * a FileInfoDetector.
     *
     * @inheritDoc
     */
    public function resetDetectors(): self
    {
        $chain = new DetectorChain();
        $this->detectorChain = $chain;

        $chain->register(new CachedDetector(new MbStringDetector()));
        $chain->register(new FileInfoDetector());

        return $this;
    }
170

171
    /**
     * Appends the given encodings to the whitelist, skipping any that are
     * already present (strict comparison).
     *
     * @inheritDoc
     */
    public function addEncodings(string ...$encodings): self
    {
        foreach ($encodings as $candidate) {
            if (\in_array($candidate, $this->allowedEncodings, true)) {
                continue;
            }

            $this->allowedEncodings[] = $candidate;
        }

        return $this;
    }
184

185
    /**
     * Drops the given encodings from the whitelist and re-indexes it so it
     * stays a sequential list.
     *
     * @inheritDoc
     */
    public function removeEncodings(string ...$encodings): self
    {
        $remaining = \array_diff($this->allowedEncodings, $encodings);
        $this->allowedEncodings = \array_values($remaining);

        return $this;
    }
196

197
    /**
     * Returns the current encoding whitelist.
     *
     * @inheritDoc
     */
    public function getEncodings(): array
    {
        $whitelist = $this->allowedEncodings;

        return $whitelist;
    }
204

205
    /**
     * Restores the encoding whitelist to its built-in default list.
     *
     * @inheritDoc
     */
    public function resetEncodings(): self
    {
        $defaults = [
            self::AUTO,
            self::ENCODING_UTF8,
            self::WINDOWS_1252,
            self::ENCODING_ISO,
            self::ENCODING_ASCII,
            self::ENCODING_UTF16,
            self::ENCODING_UTF32,
        ];

        $this->allowedEncodings = $defaults;

        return $this;
    }
222

223
    /**
     * Detects the encoding of a string.
     *
     * Valid UTF-8 input short-circuits to self::ENCODING_UTF8. Otherwise the
     * detector chain is consulted, and self::ENCODING_ISO is returned when
     * the chain yields null.
     *
     * @inheritDoc
     */
    public function detect(string $string, array $options = []): string
    {
        if (!$this->isValidUtf8($string)) {
            $guess = $this->detectorChain->detect($string, $options);

            return $guess ?? self::ENCODING_ISO;
        }

        return self::ENCODING_UTF8;
    }
236

237
    /**
     * Converts data to the target encoding, walking arrays and objects
     * recursively.
     *
     * Both encodings are checked against the whitelist before any work is
     * done; the target is validated first.
     *
     * @inheritDoc
     */
    public function toCharset(
        $data,
        string $to = self::ENCODING_UTF8,
        string $from = self::ENCODING_ISO,
        array $options = []
    ) {
        $this->validateEncoding($to, 'target');
        $this->validateEncoding($from, 'source');

        $config = $this->configureOptions($options);

        // Per-leaf conversion applied to every non-array, non-object value.
        /**
         * @psalm-suppress MissingClosureParamType
         * @psalm-suppress MissingClosureReturnType
         */
        $convert = function ($value) use ($to, $from, $config) {
            return $this->convertValue($value, $to, $from, $config);
        };

        return $this->applyRecursive($data, $convert);
    }
260

261
    /**
     * Shortcut for toCharset() with self::ENCODING_UTF8 as the target.
     *
     * @inheritDoc
     */
    public function toUtf8($data, string $from = self::WINDOWS_1252, array $options = [])
    {
        $target = self::ENCODING_UTF8;

        return $this->toCharset($data, $target, $from, $options);
    }
268

269
    /**
     * Shortcut for toCharset() with self::WINDOWS_1252 as the target.
     *
     * @inheritDoc
     */
    public function toIso($data, string $from = self::ENCODING_UTF8, array $options = [])
    {
        $target = self::WINDOWS_1252;

        return $this->toCharset($data, $target, $from, $options);
    }
276

277
    /**
     * Repairs data recursively by running repairValue() on every leaf.
     *
     * self::MAX_REPAIR_DEPTH is used for 'maxDepth' unless the caller's
     * options provide their own value.
     *
     * @inheritDoc
     */
    public function repair(
        $data,
        string $to = self::ENCODING_UTF8,
        string $from = self::ENCODING_ISO,
        array $options = []
    ) {
        $config = $this->configureOptions($options, ['maxDepth' => self::MAX_REPAIR_DEPTH]);

        /**
         * @psalm-suppress MissingClosureParamType
         * @psalm-suppress MissingClosureReturnType
         */
        $fix = function ($value) use ($to, $from, $config) {
            return $this->repairValue($value, $to, $from, $config);
        };

        return $this->applyRecursive($data, $fix);
    }
296

297
    /**
     * Repairs data to UTF-8 and then JSON-encodes it.
     *
     * @inheritDoc
     *
     * @throws RuntimeException When json_encode() returns false
     */
    public function safeJsonEncode(
        $data,
        int $flags = 0,
        int $depth = self::JSON_DEFAULT_DEPTH,
        string $from = self::WINDOWS_1252
    ): string {
        /** @var mixed $repaired */
        $repaired = $this->repair($data, self::ENCODING_UTF8, $from);

        /** @var string|false $encoded */
        $encoded = \json_encode($repaired, $flags, $depth);

        if (false !== $encoded) {
            return $encoded;
        }

        throw new RuntimeException('JSON Encode Error: ' . \json_last_error_msg());
    }
317

318
    /**
     * Repairs a JSON document to UTF-8, decodes it, and converts the decoded
     * data to the requested target encoding.
     *
     * @inheritDoc
     *
     * @throws RuntimeException When decoding yields null and json_last_error() reports an error
     */
    public function safeJsonDecode(
        string $json,
        ?bool $associative = null,
        int $depth = self::JSON_DEFAULT_DEPTH,
        int $flags = 0,
        string $to = self::ENCODING_UTF8,
        string $from = self::WINDOWS_1252
    ) {
        // The payload is repaired to UTF-8 before it is handed to json_decode().
        /** @var string $repaired */
        $repaired = $this->repair($json, self::ENCODING_UTF8, $from);

        /** @var mixed $decoded */
        $decoded = \json_decode($repaired, $associative, $depth, $flags);

        // A null result is only a failure when the JSON extension also reports an error.
        $failed = null === $decoded && \JSON_ERROR_NONE !== \json_last_error();
        if ($failed) {
            throw new RuntimeException('JSON Decode Error: ' . \json_last_error_msg());
        }

        return $this->toCharset($decoded, $to, self::ENCODING_UTF8);
    }
341

342
    /**
     * Walks a value and applies the callback to every leaf.
     *
     * Arrays are rebuilt element by element with their keys preserved,
     * objects are delegated to applyToObject(), and anything else is passed
     * straight to the callback.
     *
     * @param mixed $data Value to walk
     * @param callable $callback Callback invoked for each leaf value
     *
     * @return mixed
     */
    private function applyRecursive($data, callable $callback)
    {
        if (\is_object($data)) {
            return $this->applyToObject($data, $callback);
        }

        if (!\is_array($data)) {
            return $callback($data);
        }

        $processed = [];

        /** @var mixed $item */
        foreach ($data as $index => $item) {
            /** @psalm-suppress MixedAssignment */
            $processed[$index] = $this->applyRecursive($item, $callback);
        }

        return $processed;
    }
366

367
    /**
     * Processes the properties of an object without touching the original.
     *
     * The object is cloned first; every property returned by
     * get_object_vars() for the clone is then replaced by its recursively
     * processed value.
     *
     * @param object $data Object to process
     * @param callable $callback Callback invoked for each leaf value
     *
     * @return object The processed clone
     */
    private function applyToObject(object $data, callable $callback): object
    {
        $clone = clone $data;

        /** @var mixed $current */
        foreach (\get_object_vars($clone) as $property => $current) {
            $clone->$property = $this->applyRecursive($current, $callback);
        }

        return $clone;
    }
387

388
    /**
     * Converts one leaf value to the target encoding.
     *
     * Non-strings are returned untouched. For a non-UTF-8 target, a value
     * that is valid UTF-8 is converted from UTF-8 regardless of $from. A
     * value that already validates in the target encoding is only
     * normalized. Everything else is converted from $from.
     *
     * @param mixed $value Value to convert
     * @param string $to Target encoding
     * @param string $from Source encoding
     * @param array<string, mixed> $options Conversion configuration
     *
     * @return mixed
     */
    private function convertValue($value, string $to, string $from, array $options)
    {
        if (!\is_string($value)) {
            return $value;
        }

        $targetIsUtf8 = self::ENCODING_UTF8 === $to;

        if (!$targetIsUtf8 && $this->isValidUtf8($value)) {
            return $this->convertString($value, $to, self::ENCODING_UTF8, $options);
        }

        return \mb_check_encoding($value, $to)
            ? $this->normalize($value, $to, $options)
            : $this->convertString($value, $to, $from, $options);
    }
414

415
    /**
     * Converts a string, falling back to the input when transcoding fails.
     *
     * @param string $data String to convert
     * @param string $to Target encoding
     * @param string $from Source encoding
     * @param array<string, mixed> $options Conversion options
     *
     * @return string Converted string, or $data unchanged if the conversion failed
     */
    private function convertString(string $data, string $to, string $from, array $options): string
    {
        $converted = $this->transcodeString($data, $to, $from, $options);

        return null === $converted ? $data : $converted;
    }
429

430
    /**
     * Transcodes a string through the transcoder chain.
     *
     * Both encodings are resolved first (AUTO becomes a detected encoding).
     * A successful result whose resolved target is UTF-8 is additionally
     * passed through normalize().
     *
     * @param string $data String to transcode
     * @param string $to Target encoding
     * @param string $from Source encoding
     * @param array<string, mixed> $options Conversion options
     *
     * @return ?string Transcoded string, or null when the chain returned null
     */
    private function transcodeString(string $data, string $to, string $from, array $options): ?string
    {
        $target = $this->resolveEncoding($to, $data, $options);
        $source = $this->resolveEncoding($from, $data, $options);

        $transcoded = $this->transcoderChain->transcode($data, $target, $source, $options);

        if (null === $transcoded || self::ENCODING_UTF8 !== $target) {
            return $transcoded;
        }

        return $this->normalize($transcoded, $target, $options);
    }
453

454
    /**
     * Repairs one leaf value that may have been encoded more than once.
     *
     * Non-strings are returned untouched. Strings first have their extra
     * encoding layers peeled off, then are converted to the target encoding
     * using UTF-8 as the source when the peeled string is valid UTF-8, and
     * $from otherwise.
     *
     * @param mixed $value Value to repair
     * @param string $to Target encoding
     * @param string $from Glitch encoding
     * @param array<string, mixed> $options Configuration ('maxDepth' is honoured only when it is an int)
     *
     * @return mixed
     */
    private function repairValue($value, string $to, string $from, array $options)
    {
        if (!\is_string($value)) {
            // @codeCoverageIgnoreStart
            return $value;
            // @codeCoverageIgnoreEnd
        }

        /** @var mixed $requestedDepth */
        $requestedDepth = $options['maxDepth'] ?? null;
        $depth = \is_int($requestedDepth) ? $requestedDepth : self::MAX_REPAIR_DEPTH;

        $peeled = $this->peelEncodingLayers($value, $from, $depth);

        $source = $from;
        if ($this->isValidUtf8($peeled)) {
            $source = self::ENCODING_UTF8;
        }

        return $this->toCharset($peeled, $to, $source, $options);
    }
483

484
    /**
     * Strips repeated encoding layers from a string.
     *
     * While the current string is valid UTF-8, it is transcoded from UTF-8
     * back to $from. The result is kept only if the transcoding succeeded,
     * changed the string, and the result is itself valid UTF-8; otherwise the
     * loop stops. At most $maxDepth layers are removed.
     *
     * @param string $value String to repair
     * @param string $from Encoding to reverse
     * @param int $maxDepth Maximum iterations
     *
     * @return string Repaired string
     */
    private function peelEncodingLayers(string $value, string $from, int $maxDepth): string
    {
        // Normalization, transliteration and ignore are all switched off for the reverse transcoding.
        $rawOptions = ['normalize' => false, 'translit' => false, 'ignore' => false];
        $current = $value;

        for ($pass = 0; $pass < $maxDepth; $pass++) {
            if (!$this->isValidUtf8($current)) {
                break;
            }

            // Reverse step: UTF-8 -> $from.
            $candidate = $this->transcodeString($current, $from, self::ENCODING_UTF8, $rawOptions);

            $usable = null !== $candidate
                && $candidate !== $current
                && $this->isValidUtf8($candidate);

            if (!$usable) {
                break;
            }

            $current = $candidate;
        }

        return $current;
    }
515

516
    /**
     * Turns the AUTO placeholder into a concrete encoding.
     *
     * @param string $encoding Encoding constant
     * @param string $data String used for detection when $encoding is AUTO
     * @param array<string, mixed> $options Detection options
     *
     * @return string The detected encoding for AUTO, otherwise $encoding unchanged
     *
     * @codeCoverageIgnore
     */
    private function resolveEncoding(string $encoding, string $data, array $options): string
    {
        if (self::AUTO !== $encoding) {
            return $encoding;
        }

        return $this->detect($data, $options);
    }
531

532
    /**
     * Applies Unicode normalization to a UTF-8 string when enabled.
     *
     * The string is returned unchanged when the target is not UTF-8, when the
     * 'normalize' option is explicitly false, when the intl Normalizer class
     * is unavailable, or when Normalizer::normalize() fails.
     *
     * @param string $value String to normalize
     * @param string $to Target encoding
     * @param array<string, mixed> $options Configuration ('normalize' defaults to true)
     *
     * @return string Normalized or original string
     *
     * @codeCoverageIgnore
     */
    private function normalize(string $value, string $to, array $options): string
    {
        // Skip only when the target is not UTF-8 or the caller opted out.
        // The previous "false !==" test was inverted: it skipped normalization
        // by default and ran it only when it had been explicitly disabled.
        if (self::ENCODING_UTF8 !== $to || false === ($options['normalize'] ?? true)) {
            return $value;
        }

        // ext-intl is optional; without it the string is passed through as-is.
        if (!\class_exists(Normalizer::class)) {
            return $value;
        }

        $normalized = Normalizer::normalize($value);

        // Normalizer::normalize() reports failure with false.
        return false !== $normalized ? $normalized : $value;
    }
557

558
    /**
     * Tells whether a string is valid UTF-8.
     *
     * Please note that this relies on mb_check_encoding() internally, so it
     * may also return true for a string that is not really a full UTF-8
     * string.
     *
     * @param string $string String to check
     *
     * @return bool True if valid UTF-8
     */
    private function isValidUtf8(string $string): bool
    {
        $isUtf8 = \mb_check_encoding($string, self::ENCODING_UTF8);

        return $isUtf8;
    }
572

573
    /**
     * Ensures an encoding name is on the whitelist.
     *
     * The name is accepted when either the name as given or its upper-cased
     * form is found in the whitelist (strict comparison).
     *
     * @param string $encoding Encoding to validate
     * @param string $type Label used in the error message (e.g., 'source', 'target')
     *
     * @throws InvalidArgumentException If encoding is not allowed
     */
    private function validateEncoding(string $encoding, string $type): void
    {
        $whitelist = $this->allowedEncodings;

        $isAllowed = \in_array($encoding, $whitelist, true)
            || \in_array(\strtoupper($encoding), $whitelist, true);

        if ($isAllowed) {
            return;
        }

        $message = \sprintf(
            'Invalid %s encoding: "%s". Allowed: %s',
            $type,
            $encoding,
            \implode(', ', $whitelist)
        );

        throw new InvalidArgumentException($message);
    }
599

600
    /**
     * Builds the effective conversion configuration.
     *
     * Layers are applied in this order, later ones overriding earlier ones:
     * built-in defaults, each $replacements array in the order given, and
     * finally the user-provided $options.
     *
     * @param array<string, mixed> $options User-provided options (highest precedence)
     * @param array<string, mixed> ...$replacements Extra default layers applied before $options
     *
     * @return array<string, mixed> Merged configuration
     *
     * @example
     * // Basic usage
     * $config = $this->configureOptions(['normalize' => false]);
     *
     * // With additional defaults
     * $config = $this->configureOptions(
     *     ['normalize' => false],
     *     ['maxDepth' => 10]
     * );
     */
    private function configureOptions(array $options, array ...$replacements): array
    {
        $config = [
            'normalize' => true,
            'translit' => true,
            'ignore' => true,
            'encodings' => self::DEFAULT_ENCODINGS,
        ];

        foreach ($replacements as $layer) {
            $config = \array_replace($config, $layer);
        }

        return \array_replace($config, $options);
    }
629
}
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2026 Coveralls, Inc