• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

saitoha / libsixel / 19776286632

29 Nov 2025 12:20AM UTC coverage: 41.017% (-0.3%) from 41.338%
19776286632

push

github

saitoha
build: remove unused status in reversible snap test

9964 of 36344 branches covered (27.42%)

13002 of 31699 relevant lines covered (41.02%)

1178071.22 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

63.41
/src/scale.c
1
/*
2
 * SPDX-License-Identifier: MIT
3
 *
4
 * Copyright (c) 2021-2025 libsixel developers. See `AUTHORS`.
5
 * Copyright (c) 2014-2016 Hayaki Saito
6
 *
7
 * Permission is hereby granted, free of charge, to any person obtaining a copy of
8
 * this software and associated documentation files (the "Software"), to deal in
9
 * the Software without restriction, including without limitation the rights to
10
 * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
11
 * the Software, and to permit persons to whom the Software is furnished to do so,
12
 * subject to the following conditions:
13
 *
14
 * The above copyright notice and this permission notice shall be included in all
15
 * copies or substantial portions of the Software.
16
 *
17
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
19
 * FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
20
 * COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
21
 * IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
22
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
23
 */
24

25
#include "config.h"
26

27
/* STDC_HEADERS */
28
#include <stdlib.h>
29

30
#if HAVE_ERRNO_H
31
# include <errno.h>
32
#endif  /* HAVE_ERRNO_H */
33
#if HAVE_LIMITS_H
34
# include <limits.h>
35
#endif  /* HAVE_LIMITS_H */
36
#if HAVE_STRING_H
37
# include <string.h>
38
#endif  /* HAVE_STRING_H */
39
#if HAVE_STDINT_H
40
# include <stdint.h>
41
#endif  /* HAVE_STDINT_H */
42

43
#if HAVE_MATH_H
44
# define _USE_MATH_DEFINES  /* for MSVC */
45
# include <math.h>
46
#endif  /* HAVE_MATH_H */
47
#ifndef M_PI
48
# define M_PI 3.14159265358979323846
49
#endif
50

51
#include <sixel.h>
52

53
#include "cpu.h"
54
#include "logger.h"
55

56
#if SIXEL_ENABLE_THREADS
57
# include "sixel_threads_config.h"
58
# include "threadpool.h"
59
#endif
60

61
#if defined(HAVE_IMMINTRIN_H) && \
62
    (defined(__x86_64__) || defined(_M_X64) || defined(__i386) || \
63
     defined(_M_IX86))
64
# define SIXEL_HAS_X86_INTRIN 1
65
# include <immintrin.h>
66
#endif
67

68
#if defined(HAVE_SSE2)
69
# if defined(__SSE2__)
70
#  if defined(HAVE_EMMINTRIN_H)
71
#   include <emmintrin.h>
72
#   define SIXEL_USE_SSE2 1
73
#  endif
74
# endif
75
#endif
76

77
#if defined(SIXEL_HAS_X86_INTRIN)
78
# if defined(__GNUC__)
79
#  if !defined(__clang__)
80
#   define SIXEL_TARGET_AVX __attribute__((target("avx")))
81
#   define SIXEL_TARGET_AVX2 __attribute__((target("avx2")))
82
#   define SIXEL_TARGET_AVX512 __attribute__((target("avx512f")))
83
#   define SIXEL_USE_AVX 1
84
#   define SIXEL_USE_AVX2 1
85
#   define SIXEL_USE_AVX512 1
86
#  else
87
/*
88
 * clang rejects returning AVX vectors when the translation unit target
89
 * does not already include the corresponding ISA.  Guard runtime AVX
90
 * helpers with compile-time ISA availability to keep non-AVX builds
91
 * warning-free while still using AVX when the compiler enables it.
92
 */
93
#   define SIXEL_TARGET_AVX
94
#   define SIXEL_TARGET_AVX2
95
#   define SIXEL_TARGET_AVX512
96
#   if defined(__AVX__)
97
#    define SIXEL_USE_AVX 1
98
#   endif
99
#   if defined(__AVX2__)
100
#    define SIXEL_USE_AVX2 1
101
#   endif
102
#   if defined(__AVX512F__)
103
#    define SIXEL_USE_AVX512 1
104
#   endif
105
#  endif
106
# else
107
#  define SIXEL_TARGET_AVX
108
#  define SIXEL_TARGET_AVX2
109
#  define SIXEL_TARGET_AVX512
110
#  if defined(__AVX__)
111
#   define SIXEL_USE_AVX 1
112
#  endif
113
#  if defined(__AVX2__)
114
#   define SIXEL_USE_AVX2 1
115
#  endif
116
#  if defined(__AVX512F__)
117
#   define SIXEL_USE_AVX512 1
118
#  endif
119
# endif
120
#endif
121

122
#if defined(__GNUC__) && !defined(__clang__)
123
# pragma GCC diagnostic push
124
# pragma GCC diagnostic ignored "-Wpsabi"
125
#endif
126

127
#if defined(HAVE_NEON)
128
# if (defined(__ARM_NEON) || defined(__ARM_NEON__))
129
#  if defined(HAVE_ARM_NEON_H)
130
#   include <arm_neon.h>
131
#   define SIXEL_USE_NEON 1
132
#  endif
133
# endif
134
#endif
135

136
#if !defined(MAX)
137
# define MAX(l, r) ((l) > (r) ? (l) : (r))
138
#endif
139
#if !defined(MIN)
140
#define MIN(l, r) ((l) < (r) ? (l) : (r))
141
#endif
142

143

144
#if 0
145
/* function Nearest Neighbor */
146
/*
 * Nearest-neighbour kernel (currently compiled out): full weight for taps
 * no further than half a pixel away, zero weight otherwise.
 */
static double
nearest_neighbor(double const d)
{
    return (d <= 0.5) ? 1.0 : 0.0;
}
154
#endif
155

156

157
/* function Bi-linear */
158
/*
 * Bi-linear (triangle) kernel: the weight is 1 - d for d below 1 and
 * zero for every other input.
 */
static double
bilinear(double const d)
{
    return (d < 1.0) ? 1.0 - d : 0.0;
}
12,953,456✔
166

167

168
/* function Welsh */
169
/*
 * Welsh kernel: parabolic falloff 1 - d^2 for d below 1, zero otherwise.
 */
static double
welsh(double const d)
{
    return (d < 1.0) ? 1.0 - d * d : 0.0;
}
837,400✔
177

178

179
/* function Bi-cubic */
180
/*
 * Bi-cubic kernel, evaluated piecewise:
 *   d <= 1      : 1 + (d - 2) * d^2
 *   1 < d <= 2  : 4 + d * (-8 + d * (5 - d))
 *   otherwise   : 0
 */
static double
bicubic(double const d)
{
    double weight = 0.0;

    if (d <= 1.0) {
        weight = 1.0 + (d - 2.0) * d * d;
    } else if (d <= 2.0) {
        weight = 4.0 + d * (-8.0 + d * (5.0 - d));
    }

    return weight;
}
1,298,200✔
191

192

193
/* function sinc
194
 * sinc(x) = sin(PI * x) / (PI * x)
195
 */
196
static double
197
sinc(double const x)
76,142,860✔
198
{
199
    return sin(M_PI * x) / (M_PI * x);
76,142,860✔
200
}
201

202

203
/* function Lanczos-2
204
 * Lanczos(x) = sinc(x) * sinc(x / 2) , |x| <= 2
205
 *            = 0, |x| > 2
206
 */
207
/*
 * Lanczos kernel with two lobes: sinc(d) * sinc(d / 2) for d below 2.
 * The centre tap (d == 0) is handled separately and weighs 1; everything
 * else weighs 0.
 */
static double
lanczos2(double const d)
{
    if (d != 0.0) {
        return (d < 2.0) ? sinc(d) * sinc(d / 2.0) : 0.0;
    }
    return 1.0;
}
3,103,998✔
218

219

220
/* function Lanczos-3
221
 * Lanczos(x) = sinc(x) * sinc(x / 3) , |x| <= 3
222
 *            = 0, |x| > 3
223
 */
224
/*
 * Lanczos kernel with three lobes: sinc(d) * sinc(d / 3) for d below 3.
 * The centre tap (d == 0) is handled separately and weighs 1; everything
 * else weighs 0.
 */
static double
lanczos3(double const d)
{
    if (d != 0.0) {
        return (d < 3.0) ? sinc(d) * sinc(d / 3.0) : 0.0;
    }
    return 1.0;
}
2,026,598✔
235

236
/* function Lanczos-4
237
 * Lanczos(x) = sinc(x) * sinc(x / 4) , |x| <= 4
238
 *            = 0, |x| > 4
239
 */
240
/*
 * Lanczos kernel with four lobes: sinc(d) * sinc(d / 4) for d below 4.
 * The centre tap (d == 0) is handled separately and weighs 1; everything
 * else weighs 0.
 */
static double
lanczos4(double const d)
{
    if (d != 0.0) {
        return (d < 4.0) ? sinc(d) * sinc(d / 4.0) : 0.0;
    }
    return 1.0;
}
2,903,724✔
251

252

253
static double
254
gaussian(double const d)
3,393,250✔
255
{
256
    return exp(-2.0 * d * d) * sqrt(2.0 / M_PI);
3,393,250✔
257
}
258

259

260
static double
261
hanning(double const d)
3,646,760✔
262
{
263
    return 0.5 + 0.5 * cos(d * M_PI);
3,646,760✔
264
}
265

266

267
static double
268
hamming(const double d)
4,187,000✔
269
{
270
    return 0.54 + 0.46 * cos(d * M_PI);
4,187,000✔
271
}
272

273

274
/*
 * Divide an accumulated channel value by the weight total and squeeze the
 * result into an 8-bit sample.
 *
 *   x     - weighted channel sum
 *   total - sum of the weights
 *
 * Returns the quotient rounded down and saturated to 0..255.
 *
 * The range check happens on the double before any integer conversion:
 * converting an out-of-range or NaN double to an integer type is undefined
 * behaviour. For quotients inside 0..255 truncation equals floor(), and all
 * negative quotients saturate to 0, so in-range results are unchanged.
 */
static unsigned char
normalize(double x, double total)
{
    double quotient;

    quotient = x / total;

    /* written as a negated test so that NaN also lands here */
    if (!(quotient > 0.0)) {
        return 0x00;
    }
    if (quotient >= 255.0) {
        return 0xff;
    }

    return (unsigned char)quotient;
}
288

289
/*
 * Return the SIMD level reported by sixel_cpu_simd_level(), querying it on
 * the first call and reusing the stored answer afterwards. The value -2
 * marks "not queried yet".
 */
static int
sixel_scale_simd_level(void)
{
    static int cached_level = -2;

    if (cached_level != -2) {
        return cached_level;
    }

    cached_level = sixel_cpu_simd_level();
    return cached_level;
}
300

301
/*
 * Clamp a float sample to the unit interval [0, 1].
 *
 * Kernels with negative lobes can overshoot that range; values inside it
 * are returned unchanged.
 */
static float
sixel_clamp_unit_f32(float value)
{
    return (value < 0.0f) ? 0.0f
         : (value > 1.0f) ? 1.0f
         : value;
}
318

319
#if defined(HAVE_IMMINTRIN_H)
320
#if defined(SIXEL_USE_AVX)
321
static SIXEL_TARGET_AVX __m256
322
sixel_avx_load_rgb_ps(unsigned char const *psrc)
323
{
324
    __m128i pixi128;
325
    __m128 pixf128;
326
    __m256 pixf256;
327

328
    /*
329
     * Build the byte vector explicitly so the AVX path never accumulates
330
     * garbage data when widening to 32-bit lanes.
331
     */
332
    pixi128 = _mm_setr_epi8((char)psrc[0],
333
                            (char)psrc[1],
334
                            (char)psrc[2],
335
                            0,
336
                            0, 0, 0, 0,
337
                            0, 0, 0, 0,
338
                            0, 0, 0, 0);
339
    pixf128 = _mm_cvtepi32_ps(pixi128);
340
    pixf256 = _mm256_castps128_ps256(pixf128);
341
    pixf256 = _mm256_insertf128_ps(pixf256, _mm_setzero_ps(), 1);
342
    return pixf256;
343
}
344

345
/*
 * Finish an accumulated pixel: scale every lane by 1 / total, clamp to
 * 0..255, convert to integers and write lanes 0..2 to dst[0..2].
 */
static SIXEL_TARGET_AVX void
sixel_avx_store_rgb_u8(__m256 acc, double total, unsigned char *dst)
{
    int lanes[8];
    int c;
    __m256 const inv_total = _mm256_set1_ps((float)(1.0 / total));
    __m256 const lower = _mm256_set1_ps(0.0f);
    __m256 const upper = _mm256_set1_ps(255.0f);
    __m256 scaled;

    scaled = _mm256_mul_ps(acc, inv_total);
    scaled = _mm256_min_ps(scaled, upper);
    scaled = _mm256_max_ps(lower, scaled);
    _mm256_storeu_si256((__m256i *)lanes, _mm256_cvtps_epi32(scaled));

    for (c = 0; c < 3; c++) {
        dst[c] = (unsigned char)lanes[c];
    }
}
365

366
static SIXEL_TARGET_AVX __m256
367
sixel_avx_zero_ps(void)
368
{
369
    return _mm256_setzero_ps();
370
}
371

372
static SIXEL_TARGET_AVX __m256
373
sixel_avx_muladd_ps(__m256 acc, __m256 pix, float weight)
374
{
375
    __m256 wv;
376

377
    wv = _mm256_set1_ps(weight);
378
    return _mm256_add_ps(acc, _mm256_mul_ps(pix, wv));
379
}
380

381
static SIXEL_TARGET_AVX __m256
382
sixel_avx_load_rgb_f32(float const *psrc)
383
{
384
    __m256 pixf;
385

386
    pixf = _mm256_set_ps(0.0f, 0.0f, 0.0f, 0.0f,
387
                         psrc[2], psrc[1], psrc[0], 0.0f);
388
    return pixf;
389
}
390

391
/*
 * Finish an accumulated float pixel: scale every lane by 1 / total, clamp
 * to the unit interval and write lanes 0..2 to dst[0..2].
 */
static SIXEL_TARGET_AVX void
sixel_avx_store_rgb_f32(__m256 acc, double total, float *dst)
{
    float lanes[8];
    int c;
    __m256 const inv_total = _mm256_set1_ps((float)(1.0 / total));
    __m256 const lower = _mm256_set1_ps(0.0f);
    __m256 const upper = _mm256_set1_ps(1.0f);
    __m256 scaled;

    scaled = _mm256_mul_ps(acc, inv_total);
    scaled = _mm256_min_ps(scaled, upper);
    scaled = _mm256_max_ps(lower, scaled);
    _mm256_storeu_ps(lanes, scaled);

    for (c = 0; c < 3; c++) {
        dst[c] = lanes[c];
    }
}
409
#endif  /* SIXEL_USE_AVX */
410

411
#if defined(SIXEL_USE_AVX2)
412
static SIXEL_TARGET_AVX2 __m256
413
sixel_avx2_load_rgb_ps(unsigned char const *psrc)
101,475,112✔
414
{
415
    __m128i pixi128;
416
    __m256i pixi256;
417

418
    /*
419
     * Keep the unused bytes zeroed so widening to epi32 does not pull in
420
     * stack junk and bias every output channel toward white.
421
     */
422
    pixi128 = _mm_setr_epi8((char)psrc[0],
101,475,112✔
423
                            (char)psrc[1],
101,475,112✔
424
                            (char)psrc[2],
101,475,112✔
425
                            0,
426
                            0, 0, 0, 0,
427
                            0, 0, 0, 0,
428
                            0, 0, 0, 0);
429
    pixi256 = _mm256_cvtepu8_epi32(pixi128);
101,475,112✔
430
    return _mm256_cvtepi32_ps(pixi256);
101,475,112✔
431
}
432

433
/*
 * Finish an accumulated pixel: scale every lane by 1 / total, clamp to
 * 0..255, convert to integers and write lanes 0..2 to dst[0..2].
 */
static SIXEL_TARGET_AVX2 void
sixel_avx2_store_rgb_u8(__m256 acc, double total, unsigned char *dst)
{
    int lanes[8];
    int c;
    __m256 const inv_total = _mm256_set1_ps((float)(1.0 / total));
    __m256 const lower = _mm256_set1_ps(0.0f);
    __m256 const upper = _mm256_set1_ps(255.0f);
    __m256 scaled;

    scaled = _mm256_mul_ps(acc, inv_total);
    scaled = _mm256_min_ps(scaled, upper);
    scaled = _mm256_max_ps(lower, scaled);
    _mm256_storeu_si256((__m256i *)lanes, _mm256_cvtps_epi32(scaled));

    for (c = 0; c < 3; c++) {
        dst[c] = (unsigned char)lanes[c];
    }
}
11,765,024✔
453

454
static SIXEL_TARGET_AVX2 __m256
455
sixel_avx2_zero_ps(void)
11,765,024✔
456
{
457
    return _mm256_setzero_ps();
11,765,024✔
458
}
459

460
static SIXEL_TARGET_AVX2 __m256
461
sixel_avx2_muladd_ps(__m256 acc, __m256 pix, float weight)
101,475,112✔
462
{
463
    __m256 wv;
464

465
    wv = _mm256_set1_ps(weight);
101,475,112✔
466
    return _mm256_add_ps(acc, _mm256_mul_ps(pix, wv));
202,950,224✔
467
}
468

469
static SIXEL_TARGET_AVX2 __m256
470
sixel_avx2_load_rgb_f32(float const *psrc)
471
{
472
    __m256 pixf;
473

474
    pixf = _mm256_set_ps(0.0f, 0.0f, 0.0f, 0.0f,
475
                         psrc[2], psrc[1], psrc[0], 0.0f);
476
    return pixf;
477
}
478

479
/*
 * Finish an accumulated float pixel: scale every lane by 1 / total, clamp
 * to the unit interval and write lanes 0..2 to dst[0..2].
 */
static SIXEL_TARGET_AVX2 void
sixel_avx2_store_rgb_f32(__m256 acc, double total, float *dst)
{
    float lanes[8];
    int c;
    __m256 const inv_total = _mm256_set1_ps((float)(1.0 / total));
    __m256 const lower = _mm256_set1_ps(0.0f);
    __m256 const upper = _mm256_set1_ps(1.0f);
    __m256 scaled;

    scaled = _mm256_mul_ps(acc, inv_total);
    scaled = _mm256_min_ps(scaled, upper);
    scaled = _mm256_max_ps(lower, scaled);
    _mm256_storeu_ps(lanes, scaled);

    for (c = 0; c < 3; c++) {
        dst[c] = lanes[c];
    }
}
497
#endif  /* SIXEL_USE_AVX2 */
498

499
#if defined(SIXEL_USE_AVX512)
500
static SIXEL_TARGET_AVX512 __m512
501
sixel_avx512_load_rgb_ps(unsigned char const *psrc)
502
{
503
    __m128i pixi128;
504
    __m512i pixi512;
505

506
    pixi128 = _mm_setr_epi8((char)psrc[0],
507
                            (char)psrc[1],
508
                            (char)psrc[2],
509
                            0,
510
                            0, 0, 0, 0,
511
                            0, 0, 0, 0,
512
                            0, 0, 0, 0);
513
    pixi512 = _mm512_cvtepu8_epi32(pixi128);
514
    return _mm512_cvtepi32_ps(pixi512);
515
}
516

517
/*
 * Finish an accumulated pixel: scale every lane by 1 / total, clamp to
 * 0..255, convert to integers and write lanes 0..2 to dst[0..2].
 */
static SIXEL_TARGET_AVX512 void
sixel_avx512_store_rgb_u8(__m512 acc, double total, unsigned char *dst)
{
    int lanes[16];
    int c;
    __m512 const inv_total = _mm512_set1_ps((float)(1.0 / total));
    __m512 const lower = _mm512_set1_ps(0.0f);
    __m512 const upper = _mm512_set1_ps(255.0f);
    __m512 scaled;

    scaled = _mm512_mul_ps(acc, inv_total);
    scaled = _mm512_min_ps(scaled, upper);
    scaled = _mm512_max_ps(lower, scaled);
    _mm512_storeu_si512((void *)lanes, _mm512_cvtps_epi32(scaled));

    for (c = 0; c < 3; c++) {
        dst[c] = (unsigned char)lanes[c];
    }
}
537

538
static SIXEL_TARGET_AVX512 __m512
539
sixel_avx512_zero_ps(void)
540
{
541
    return _mm512_setzero_ps();
542
}
543

544
static SIXEL_TARGET_AVX512 __m512
545
sixel_avx512_muladd_ps(__m512 acc, __m512 pix, float weight)
546
{
547
    __m512 wv;
548

549
    wv = _mm512_set1_ps(weight);
550
    return _mm512_add_ps(acc, _mm512_mul_ps(pix, wv));
551
}
552

553
static SIXEL_TARGET_AVX512 __m512
554
sixel_avx512_load_rgb_f32(float const *psrc)
555
{
556
    __m512 pixf;
557

558
    pixf = _mm512_set_ps(0.0f, 0.0f, 0.0f, 0.0f,
559
                         0.0f, 0.0f, 0.0f, 0.0f,
560
                         0.0f, 0.0f, 0.0f, 0.0f,
561
                         psrc[2], psrc[1], psrc[0], 0.0f);
562
    return pixf;
563
}
564

565
/*
 * Finish an accumulated float pixel: scale every lane by 1 / total, clamp
 * to the unit interval and write lanes 0..2 to dst[0..2].
 */
static SIXEL_TARGET_AVX512 void
sixel_avx512_store_rgb_f32(__m512 acc, double total, float *dst)
{
    float lanes[16];
    int c;
    __m512 const inv_total = _mm512_set1_ps((float)(1.0 / total));
    __m512 const lower = _mm512_set1_ps(0.0f);
    __m512 const upper = _mm512_set1_ps(1.0f);
    __m512 scaled;

    scaled = _mm512_mul_ps(acc, inv_total);
    scaled = _mm512_min_ps(scaled, upper);
    scaled = _mm512_max_ps(lower, scaled);
    _mm512_storeu_ps(lanes, scaled);

    for (c = 0; c < 3; c++) {
        dst[c] = lanes[c];
    }
}
583
#endif  /* SIXEL_USE_AVX512 */
584
#endif /* HAVE_IMMINTRIN_H */
585

586

587
/*
 * Nearest-sample scaling of an 8-bit image without any filtering.
 *
 *   dst   - output, dstw * dsth pixels of `depth` bytes, tightly packed
 *   src   - input,  srcw * srch pixels of `depth` bytes, tightly packed
 *   depth - bytes per pixel
 *
 * Destination pixel (w, h) copies source pixel
 * (w * srcw / dstw, h * srch / dsth), using integer division.
 *
 * Pixel offsets are computed in size_t rather than int so that large
 * images do not overflow signed arithmetic. An empty source or a
 * non-positive depth copies nothing.
 */
static void
scale_without_resampling(
    unsigned char *dst,
    unsigned char const *src,
    int const srcw,
    int const srch,
    int const dstw,
    int const dsth,
    int const depth)
{
    int w;
    int h;
    int i;
    size_t sx;
    size_t sy;
    unsigned char const *src_row;
    unsigned char const *psrc;
    unsigned char *pdst;

    if (srcw <= 0 || srch <= 0 || depth <= 0) {
        return;
    }

    /* dst is filled strictly in row-major order, so a cursor suffices */
    pdst = dst;
    for (h = 0; h < dsth; h++) {
        /* the source row depends only on h, so resolve it once per row */
        sy = (size_t)h * (size_t)srch / (size_t)dsth;
        src_row = src + sy * (size_t)srcw * (size_t)depth;
        for (w = 0; w < dstw; w++) {
            sx = (size_t)w * (size_t)srcw / (size_t)dstw;
            psrc = src_row + sx * (size_t)depth;
            for (i = 0; i < depth; i++) {
                *pdst++ = psrc[i];
            }
        }
    }
}
30✔
615

616
/*
 * Nearest-sample scaling of a float image without any filtering.
 *
 *   dst   - output, dstw * dsth pixels of `depth` floats, tightly packed
 *   src   - input,  srcw * srch pixels of `depth` floats, tightly packed
 *   depth - floats per pixel
 *
 * Destination pixel (w, h) copies source pixel
 * (w * srcw / dstw, h * srch / dsth), using integer division.
 *
 * Pixel offsets are computed in size_t rather than int so that large
 * images do not overflow signed arithmetic. An empty source or a
 * non-positive depth copies nothing.
 */
static void
scale_without_resampling_float32(
    float *dst,
    float const *src,
    int const srcw,
    int const srch,
    int const dstw,
    int const dsth,
    int const depth)
{
    int w;
    int h;
    int i;
    size_t sx;
    size_t sy;
    float const *src_row;
    float const *psrc;
    float *pdst;

    if (srcw <= 0 || srch <= 0 || depth <= 0) {
        return;
    }

    /* dst is filled strictly in row-major order, so a cursor suffices */
    pdst = dst;
    for (h = 0; h < dsth; h++) {
        /* the source row depends only on h, so resolve it once per row */
        sy = (size_t)h * (size_t)srch / (size_t)dsth;
        src_row = src + sy * (size_t)srcw * (size_t)depth;
        for (w = 0; w < dstw; w++) {
            sx = (size_t)w * (size_t)srcw / (size_t)dstw;
            psrc = src_row + sx * (size_t)depth;
            for (i = 0; i < depth; i++) {
                *pdst++ = psrc[i];
            }
        }
    }
}
×
644

645

646
/*
 * Pointer to a resampling kernel. The row helpers in this file call it
 * with fabs() of a tap offset and use the returned double as that tap's
 * weight.
 */
typedef double (*resample_fn_t)(double const d);
647

648
/*
649
 * Two-pass separable filter helpers. Each function processes a single row so
650
 * the caller may invoke them serially or from a threadpool worker.
651
 */
652
static void
653
scale_horizontal_row(
53,250✔
654
    unsigned char *tmp,
655
    unsigned char const *src,
656
    int const srcw,
657
    int const dstw,
658
    int const depth,
659
    int const y,
660
    resample_fn_t const f_resample,
661
    double const n,
662
    int const simd_level)
663
{
664
    int w;
665
    int x;
666
    int i;
667
    int pos;
668
    int x_first;
669
    int x_last;
670
    double center_x;
671
    double diff_x;
672
    double weight;
673
    double total;
674
    double offsets[8];
675
#if defined(SIXEL_USE_AVX512)
676
    __m512 acc512;
677
#endif
678
#if defined(SIXEL_USE_AVX2) || defined(SIXEL_USE_AVX)
679
    __m256 acc256;
680
#endif
681
#if defined(SIXEL_USE_SSE2)
682
    __m128 acc128;
683
    __m128 minv128;
684
    __m128 maxv128;
685
    __m128 scalev128;
686
    __m128 wv128;
687
    __m128 pixf128;
688
    __m128i pixi128;
689
    __m128i acci128;
690
    __m128i acc16_128;
691
    unsigned int pixel128;
692
#endif
693
#if defined(SIXEL_USE_NEON)
694
    float32x4_t acc_neon;
695
    float32x4_t minv_neon;
696
    float32x4_t maxv_neon;
697
    float32x4_t scalev_neon;
698
    float32x4_t wv_neon;
699
    float32x4_t pixf_neon;
700
    uint32x4_t pix32_neon;
701
    uint32x4_t acci_neon;
702
    uint16x4_t acc16_neon;
703
    uint8x8_t acc8_neon;
704
    uint8_t outb_neon[8];
705
#endif
706

707
    for (w = 0; w < dstw; w++) {
9,437,250✔
708
        total = 0.0;
9,384,000✔
709
        for (i = 0; i < depth; i++) {
37,536,000✔
710
            offsets[i] = 0;
28,152,000✔
711
        }
5,630,400✔
712

713
        if (dstw >= srcw) {
9,384,000✔
714
            center_x = (w + 0.5) * srcw / dstw;
960,000✔
715
            x_first = MAX(center_x - n, 0);
960,000✔
716
            x_last = MIN(center_x + n, srcw - 1);
960,000✔
717
        } else {
192,000✔
718
            center_x = w + 0.5;
8,424,000✔
719
            x_first = MAX(floor((center_x - n) * srcw / dstw), 0);
8,424,000✔
720
            x_last = MIN(floor((center_x + n) * srcw / dstw), srcw - 1);
8,424,000✔
721
        }
722

723
#if defined(SIXEL_USE_AVX512)
724
        if (depth == 3 && simd_level >= SIXEL_SIMD_LEVEL_AVX512) {
7,507,200!
725
            acc512 = sixel_avx512_zero_ps();
726

727
            for (x = x_first; x <= x_last; x++) {
×
728
                diff_x = (dstw >= srcw)
729
                             ? (x + 0.5) - center_x
730
                             : (x + 0.5) * dstw / srcw - center_x;
×
731
                weight = f_resample(fabs(diff_x));
732
                pos = (y * srcw + x) * depth;
733
                acc512 = sixel_avx512_muladd_ps(
734
                    acc512,
735
                    sixel_avx512_load_rgb_ps(src + pos),
736
                    (float)weight);
737
                total += weight;
738
            }
739
            if (total > 0.0) {
×
740
                pos = (y * dstw + w) * depth;
741
                sixel_avx512_store_rgb_u8(acc512, total, tmp + pos);
742
            }
743
            continue;
744
        }
745
#endif
746
#if defined(SIXEL_USE_AVX2)
747
        if (depth == 3 && simd_level >= SIXEL_SIMD_LEVEL_AVX2) {
7,507,200!
748
            acc256 = sixel_avx2_zero_ps();
7,507,200✔
749

750
            for (x = x_first; x <= x_last; x++) {
83,050,200✔
751
                diff_x = (dstw >= srcw)
75,543,000✔
752
                             ? (x + 0.5) - center_x
2,299,200✔
753
                             : (x + 0.5) * dstw / srcw - center_x;
75,543,000✔
754
                weight = f_resample(fabs(diff_x));
75,543,000✔
755
                pos = (y * srcw + x) * depth;
75,543,000✔
756
                acc256 = sixel_avx2_muladd_ps(
75,543,000✔
757
                    acc256,
758
                    sixel_avx2_load_rgb_ps(src + pos),
759
                    (float)weight);
760
                total += weight;
75,543,000✔
761
            }
762
            if (total > 0.0) {
7,507,200!
763
                pos = (y * dstw + w) * depth;
7,507,200✔
764
                sixel_avx2_store_rgb_u8(acc256, total, tmp + pos);
7,507,200✔
765
            }
766
            continue;
7,507,200✔
767
        }
768
#endif
769
#if defined(SIXEL_USE_AVX)
770
        if (depth == 3 && simd_level >= SIXEL_SIMD_LEVEL_AVX) {
×
771
            acc256 = sixel_avx_zero_ps();
772

773
            for (x = x_first; x <= x_last; x++) {
×
774
                diff_x = (dstw >= srcw)
775
                             ? (x + 0.5) - center_x
776
                             : (x + 0.5) * dstw / srcw - center_x;
×
777
                weight = f_resample(fabs(diff_x));
778
                pos = (y * srcw + x) * depth;
779
                acc256 = sixel_avx_muladd_ps(
780
                    acc256,
781
                    sixel_avx_load_rgb_ps(src + pos),
782
                    (float)weight);
783
                total += weight;
784
            }
785
            if (total > 0.0) {
×
786
                pos = (y * dstw + w) * depth;
787
                sixel_avx_store_rgb_u8(acc256, total, tmp + pos);
788
            }
789
            continue;
790
        }
791
#endif
792
#if defined(SIXEL_USE_SSE2) || defined(SIXEL_USE_NEON)
793
        if (depth == 3
1,876,800!
794
# if defined(SIXEL_USE_SSE2)
795
            && simd_level >= SIXEL_SIMD_LEVEL_SSE2
×
796
# elif defined(SIXEL_USE_NEON)
797
            && simd_level >= SIXEL_SIMD_LEVEL_NEON
1,876,800!
798
# endif
799
            ) {
800
#if defined(SIXEL_USE_SSE2)
801
            acc128 = _mm_setzero_ps();
802
#elif defined(SIXEL_USE_NEON)
803
            acc_neon = vdupq_n_f32(0.0f);
1,876,800✔
804
#endif
805
            for (x = x_first; x <= x_last; x++) {
20,762,550!
806
                diff_x = (dstw >= srcw)
18,885,750✔
807
                             ? (x + 0.5) - center_x
574,800✔
808
                             : (x + 0.5) * dstw / srcw - center_x;
18,310,950!
809
                weight = f_resample(fabs(diff_x));
18,885,750✔
810
                pos = (y * srcw + x) * depth;
18,885,750✔
811
                const unsigned char *psrc = src + pos;
18,885,750✔
812
#if defined(SIXEL_USE_SSE2)
813
                pixel128 = psrc[0] | (psrc[1] << 8) | (psrc[2] << 16);
814
                pixi128 = _mm_cvtsi32_si128((int)pixel128);
815
                pixi128 = _mm_unpacklo_epi8(pixi128, _mm_setzero_si128());
816
                pixi128 = _mm_unpacklo_epi16(pixi128, _mm_setzero_si128());
817
                pixf128 = _mm_cvtepi32_ps(pixi128);
818
                wv128 = _mm_set1_ps((float)weight);
819
                acc128 = _mm_add_ps(acc128, _mm_mul_ps(pixf128, wv128));
820
#else /* NEON */
821
                pix32_neon = (uint32x4_t){psrc[0], psrc[1], psrc[2], 0};
18,885,750✔
822
                pixf_neon = vcvtq_f32_u32(pix32_neon);
18,885,750✔
823
                wv_neon = vdupq_n_f32((float)weight);
18,885,750✔
824
                acc_neon = vmlaq_f32(acc_neon, pixf_neon, wv_neon);
18,885,750✔
825
#endif
826
                total += weight;
18,885,750✔
827
            }
18,885,750✔
828
            if (total > 0.0) {
1,876,800!
829
#if defined(SIXEL_USE_SSE2)
830
                scalev128 = _mm_set1_ps((float)(1.0 / total));
831
                acc128 = _mm_mul_ps(acc128, scalev128);
832
                minv128 = _mm_set1_ps(0.0f);
833
                maxv128 = _mm_set1_ps(255.0f);
834
                acc128 = _mm_max_ps(minv128, _mm_min_ps(acc128, maxv128));
835
                acci128 = _mm_cvtps_epi32(acc128);
836
                acc16_128 = _mm_packs_epi32(acci128, _mm_setzero_si128());
837
                acc16_128 = _mm_packus_epi16(acc16_128, _mm_setzero_si128());
838
                pos = (y * dstw + w) * depth;
839
                pixel128 = (unsigned int)_mm_cvtsi128_si32(acc16_128);
840
                tmp[pos + 0] = (unsigned char)pixel128;
841
                tmp[pos + 1] = (unsigned char)(pixel128 >> 8);
842
                tmp[pos + 2] = (unsigned char)(pixel128 >> 16);
843
#else /* NEON */
844
                scalev_neon = vdupq_n_f32((float)(1.0 / total));
1,876,800✔
845
                acc_neon = vmulq_f32(acc_neon, scalev_neon);
1,876,800✔
846
                minv_neon = vdupq_n_f32(0.0f);
1,876,800✔
847
                maxv_neon = vdupq_n_f32(255.0f);
1,876,800✔
848
                acc_neon = vmaxq_f32(minv_neon,
3,753,600✔
849
                                     vminq_f32(acc_neon, maxv_neon));
1,876,800✔
850
                acci_neon = vcvtq_u32_f32(acc_neon);
1,876,800✔
851
                acc16_neon = vmovn_u32(acci_neon);
1,876,800✔
852
                acc8_neon = vmovn_u16(vcombine_u16(acc16_neon, acc16_neon));
1,876,800✔
853

854
                vst1_u8(outb_neon, acc8_neon);
1,876,800✔
855
                pos = (y * dstw + w) * depth;
1,876,800✔
856
                tmp[pos + 0] = outb_neon[0];
1,876,800✔
857
                tmp[pos + 1] = outb_neon[1];
1,876,800✔
858
                tmp[pos + 2] = outb_neon[2];
1,876,800✔
859
#endif
860
            }
1,876,800✔
861
            continue;
1,876,800✔
862
        }
863
#endif /* SIMD paths */
864

865
        for (x = x_first; x <= x_last; x++) {
×
866
            diff_x = (dstw >= srcw)
×
867
                         ? (x + 0.5) - center_x
×
868
                         : (x + 0.5) * dstw / srcw - center_x;
×
869
            weight = f_resample(fabs(diff_x));
×
870
            for (i = 0; i < depth; i++) {
×
871
                pos = (y * srcw + x) * depth + i;
×
872
                offsets[i] += src[pos] * weight;
×
873
            }
874
            total += weight;
×
875
        }
876

877
        if (total > 0.0) {
×
878
            for (i = 0; i < depth; i++) {
×
879
                pos = (y * dstw + w) * depth + i;
×
880
                tmp[pos] = normalize(offsets[i], total);
×
881
            }
882
        }
883
    }
884
}
53,250✔
885

886
static void
887
scale_vertical_row(
19,945✔
888
    unsigned char *dst,
889
    unsigned char const *tmp,
890
    int const dstw,
891
    int const dsth,
892
    int const depth,
893
    int const srch,
894
    int const h,
895
    resample_fn_t const f_resample,
896
    double const n,
897
    int const simd_level)
898
{
899
    int w;
900
    int y;
901
    int i;
902
    int pos;
903
    int y_first;
904
    int y_last;
905
    double center_y;
906
    double diff_y;
907
    double weight;
908
    double total;
909
    double offsets[8];
910
#if defined(SIXEL_USE_AVX512)
911
    __m512 acc512;
912
#endif
913
#if defined(SIXEL_USE_AVX2) || defined(SIXEL_USE_AVX)
914
    __m256 acc256;
915
#endif
916
#if defined(SIXEL_USE_SSE2)
917
    __m128 acc128;
918
    __m128 minv128;
919
    __m128 maxv128;
920
    __m128 scalev128;
921
    __m128 wv128;
922
    __m128 pixf128;
923
    __m128i pixi128;
924
    __m128i acci128;
925
    __m128i acc16_128;
926
    unsigned int pixel128;
927
#endif
928
#if defined(SIXEL_USE_NEON)
929
    float32x4_t acc_neon;
930
    float32x4_t minv_neon;
931
    float32x4_t maxv_neon;
932
    float32x4_t scalev_neon;
933
    float32x4_t wv_neon;
934
    float32x4_t pixf_neon;
935
    uint32x4_t pix32_neon;
936
    uint32x4_t acci_neon;
937
    uint16x4_t acc16_neon;
938
    uint8x8_t acc8_neon;
939
    uint8_t outb_neon[8];
940
#endif
941

942
    for (w = 0; w < dstw; w++) {
5,342,225✔
943
        total = 0.0;
5,322,280✔
944
        for (i = 0; i < depth; i++) {
21,289,120✔
945
            offsets[i] = 0;
15,966,840✔
946
        }
3,193,368✔
947

948
        if (dsth >= srch) {
5,322,280✔
949
            center_y = (h + 0.5) * srch / dsth;
2,932,500✔
950
            y_first = MAX(center_y - n, 0);
2,932,500✔
951
            y_last = MIN(center_y + n, srch - 1);
2,932,500✔
952
        } else {
586,500✔
953
            center_y = h + 0.5;
2,389,780✔
954
            y_first = MAX(floor((center_y - n) * srch / dsth), 0);
2,389,780✔
955
            y_last = MIN(floor((center_y + n) * srch / dsth), srch - 1);
2,389,780✔
956
        }
957

958
#if defined(SIXEL_USE_AVX512)
959
        if (depth == 3 && simd_level >= SIXEL_SIMD_LEVEL_AVX512) {
4,257,824!
960
            acc512 = sixel_avx512_zero_ps();
961

962
            for (y = y_first; y <= y_last; y++) {
×
963
                diff_y = (dsth >= srch)
964
                             ? (y + 0.5) - center_y
965
                             : (y + 0.5) * dsth / srch - center_y;
×
966
                weight = f_resample(fabs(diff_y));
967
                pos = (y * dstw + w) * depth;
968
                acc512 = sixel_avx512_muladd_ps(
969
                    acc512,
970
                    sixel_avx512_load_rgb_ps(tmp + pos),
971
                    (float)weight);
972
                total += weight;
973
            }
974
            if (total > 0.0) {
×
975
                pos = (h * dstw + w) * depth;
976
                sixel_avx512_store_rgb_u8(acc512, total, dst + pos);
977
            }
978
            continue;
979
        }
980
#endif
981
#if defined(SIXEL_USE_AVX2)
982
        if (depth == 3 && simd_level >= SIXEL_SIMD_LEVEL_AVX2) {
4,257,824!
983
            acc256 = sixel_avx2_zero_ps();
4,257,824✔
984

985
            for (y = y_first; y <= y_last; y++) {
30,189,936✔
986
                diff_y = (dsth >= srch)
25,932,112✔
987
                             ? (y + 0.5) - center_y
7,010,080✔
988
                             : (y + 0.5) * dsth / srch - center_y;
25,932,112✔
989
                weight = f_resample(fabs(diff_y));
25,932,112✔
990
                pos = (y * dstw + w) * depth;
25,932,112✔
991
                acc256 = sixel_avx2_muladd_ps(
25,932,112✔
992
                    acc256,
993
                    sixel_avx2_load_rgb_ps(tmp + pos),
994
                    (float)weight);
995
                total += weight;
25,932,112✔
996
            }
997
            if (total > 0.0) {
4,257,824!
998
                pos = (h * dstw + w) * depth;
4,257,824✔
999
                sixel_avx2_store_rgb_u8(acc256, total, dst + pos);
4,257,824✔
1000
            }
1001
            continue;
4,257,824✔
1002
        }
1003
#endif
1004
#if defined(SIXEL_USE_AVX)
1005
        if (depth == 3 && simd_level >= SIXEL_SIMD_LEVEL_AVX) {
×
1006
            acc256 = sixel_avx_zero_ps();
1007

1008
            for (y = y_first; y <= y_last; y++) {
×
1009
                diff_y = (dsth >= srch)
1010
                             ? (y + 0.5) - center_y
1011
                             : (y + 0.5) * dsth / srch - center_y;
×
1012
                weight = f_resample(fabs(diff_y));
1013
                pos = (y * dstw + w) * depth;
1014
                acc256 = sixel_avx_muladd_ps(
1015
                    acc256,
1016
                    sixel_avx_load_rgb_ps(tmp + pos),
1017
                    (float)weight);
1018
                total += weight;
1019
            }
1020
            if (total > 0.0) {
×
1021
                pos = (h * dstw + w) * depth;
1022
                sixel_avx_store_rgb_u8(acc256, total, dst + pos);
1023
            }
1024
            continue;
1025
        }
1026
#endif
1027
#if defined(SIXEL_USE_SSE2) || defined(SIXEL_USE_NEON)
1028
        if (depth == 3
1,064,456!
1029
# if defined(SIXEL_USE_SSE2)
1030
            && simd_level >= SIXEL_SIMD_LEVEL_SSE2
×
1031
# elif defined(SIXEL_USE_NEON)
1032
            && simd_level >= SIXEL_SIMD_LEVEL_NEON
1,064,456!
1033
# endif
1034
            ) {
1035
#if defined(SIXEL_USE_SSE2)
1036
            acc128 = _mm_setzero_ps();
1037
#elif defined(SIXEL_USE_NEON)
1038
            acc_neon = vdupq_n_f32(0.0f);
1,064,456✔
1039
#endif
1040
            for (y = y_first; y <= y_last; y++) {
7,547,484!
1041
                diff_y = (dsth >= srch)
6,483,028✔
1042
                             ? (y + 0.5) - center_y
1,752,520✔
1043
                             : (y + 0.5) * dsth / srch - center_y;
4,730,508!
1044
                weight = f_resample(fabs(diff_y));
6,483,028✔
1045
                pos = (y * dstw + w) * depth;
6,483,028✔
1046
                const unsigned char *psrc = tmp + pos;
6,483,028✔
1047
#if defined(SIXEL_USE_SSE2)
1048
                pixel128 = psrc[0] | (psrc[1] << 8) | (psrc[2] << 16);
1049
                pixi128 = _mm_cvtsi32_si128((int)pixel128);
1050
                pixi128 = _mm_unpacklo_epi8(pixi128, _mm_setzero_si128());
1051
                pixi128 = _mm_unpacklo_epi16(pixi128, _mm_setzero_si128());
1052
                pixf128 = _mm_cvtepi32_ps(pixi128);
1053
                wv128 = _mm_set1_ps((float)weight);
1054
                acc128 = _mm_add_ps(acc128, _mm_mul_ps(pixf128, wv128));
1055
#else /* NEON */
1056
                pix32_neon = (uint32x4_t){psrc[0], psrc[1], psrc[2], 0};
6,483,028✔
1057
                pixf_neon = vcvtq_f32_u32(pix32_neon);
6,483,028✔
1058
                wv_neon = vdupq_n_f32((float)weight);
6,483,028✔
1059
                acc_neon = vmlaq_f32(acc_neon, pixf_neon, wv_neon);
6,483,028✔
1060
#endif
1061
                total += weight;
6,483,028✔
1062
            }
6,483,028✔
1063
            if (total > 0.0) {
1,064,456!
1064
#if defined(SIXEL_USE_SSE2)
1065
                scalev128 = _mm_set1_ps((float)(1.0 / total));
1066
                acc128 = _mm_mul_ps(acc128, scalev128);
1067
                minv128 = _mm_set1_ps(0.0f);
1068
                maxv128 = _mm_set1_ps(255.0f);
1069
                acc128 = _mm_max_ps(minv128, _mm_min_ps(acc128, maxv128));
1070
                acci128 = _mm_cvtps_epi32(acc128);
1071
                acc16_128 = _mm_packs_epi32(acci128, _mm_setzero_si128());
1072
                acc16_128 = _mm_packus_epi16(acc16_128, _mm_setzero_si128());
1073
                pos = (h * dstw + w) * depth;
1074
                pixel128 = (unsigned int)_mm_cvtsi128_si32(acc16_128);
1075
                dst[pos + 0] = (unsigned char)pixel128;
1076
                dst[pos + 1] = (unsigned char)(pixel128 >> 8);
1077
                dst[pos + 2] = (unsigned char)(pixel128 >> 16);
1078
#else /* NEON */
1079
                scalev_neon = vdupq_n_f32((float)(1.0 / total));
1,064,456✔
1080
                acc_neon = vmulq_f32(acc_neon, scalev_neon);
1,064,456✔
1081
                minv_neon = vdupq_n_f32(0.0f);
1,064,456✔
1082
                maxv_neon = vdupq_n_f32(255.0f);
1,064,456✔
1083
                acc_neon = vmaxq_f32(minv_neon,
2,128,912✔
1084
                                     vminq_f32(acc_neon, maxv_neon));
1,064,456✔
1085
                acci_neon = vcvtq_u32_f32(acc_neon);
1,064,456✔
1086
                acc16_neon = vmovn_u32(acci_neon);
1,064,456✔
1087
                acc8_neon = vmovn_u16(vcombine_u16(acc16_neon, acc16_neon));
1,064,456✔
1088

1089
                vst1_u8(outb_neon, acc8_neon);
1,064,456✔
1090
                pos = (h * dstw + w) * depth;
1,064,456✔
1091
                dst[pos + 0] = outb_neon[0];
1,064,456✔
1092
                dst[pos + 1] = outb_neon[1];
1,064,456✔
1093
                dst[pos + 2] = outb_neon[2];
1,064,456✔
1094
#endif
1095
            }
1,064,456✔
1096
            continue;
1,064,456✔
1097
        }
1098
#endif /* SIMD paths */
1099
        for (y = y_first; y <= y_last; y++) {
×
1100
            diff_y = (dsth >= srch)
×
1101
                         ? (y + 0.5) - center_y
×
1102
                         : (y + 0.5) * dsth / srch - center_y;
×
1103
            weight = f_resample(fabs(diff_y));
×
1104
            for (i = 0; i < depth; i++) {
×
1105
                pos = (y * dstw + w) * depth + i;
×
1106
                offsets[i] += tmp[pos] * weight;
×
1107
            }
1108
            total += weight;
×
1109
        }
1110

1111
        if (total > 0.0) {
×
1112
            for (i = 0; i < depth; i++) {
×
1113
                pos = (h * dstw + w) * depth + i;
×
1114
                dst[pos] = normalize(offsets[i], total);
×
1115
            }
1116
        }
1117
    }
1118
}
19,945✔
1119

1120
static void
1121
scale_with_resampling_serial(
125✔
1122
    unsigned char *dst,
1123
    unsigned char const *src,
1124
    int const srcw,
1125
    int const srch,
1126
    int const dstw,
1127
    int const dsth,
1128
    int const depth,
1129
    resample_fn_t const f_resample,
1130
    double const n,
1131
    unsigned char *tmp)
1132
{
1133
    int y;
1134
    int h;
1135
    int simd_level;
1136

1137
    simd_level = sixel_scale_simd_level();
125✔
1138

1139
    for (y = 0; y < srch; y++) {
53,375✔
1140
        scale_horizontal_row(tmp,
63,900✔
1141
                             src,
10,650✔
1142
                             srcw,
10,650✔
1143
                             dstw,
10,650✔
1144
                             depth,
10,650✔
1145
                             y,
10,650✔
1146
                             f_resample,
10,650✔
1147
                             n,
10,650✔
1148
                             simd_level);
10,650✔
1149
    }
10,650✔
1150

1151
    for (h = 0; h < dsth; h++) {
20,070✔
1152
        scale_vertical_row(dst,
23,934✔
1153
                           tmp,
3,989✔
1154
                           dstw,
3,989✔
1155
                           dsth,
3,989✔
1156
                           depth,
3,989✔
1157
                           srch,
3,989✔
1158
                           h,
3,989✔
1159
                           f_resample,
3,989✔
1160
                           n,
3,989✔
1161
                           simd_level);
3,989✔
1162
    }
3,989✔
1163
}
125✔
1164

1165
#if SIXEL_ENABLE_THREADS
1166
/*
 * Selects which of the two resampling passes a worker job executes.
 * scale_parallel_worker() treats any value other than
 * SCALE_PASS_HORIZONTAL as the vertical pass.
 */
enum scale_parallel_pass {
    SCALE_PASS_HORIZONTAL = 0,  /* jobs are indexed by source row */
    SCALE_PASS_VERTICAL         /* == 1; jobs are indexed by output row */
};

typedef enum scale_parallel_pass scale_parallel_pass_t;
1170

1171
typedef struct scale_parallel_context {
1172
    unsigned char *dst;
1173
    unsigned char const *src;
1174
    unsigned char *tmp;
1175
    int srcw;
1176
    int srch;
1177
    int dstw;
1178
    int dsth;
1179
    int depth;
1180
    resample_fn_t f_resample;
1181
    double n;
1182
    scale_parallel_pass_t pass;
1183
    int simd_level;
1184
    sixel_logger_t *logger;
1185
} scale_parallel_context_t;
1186

1187
/*
1188
 * Emit timeline entries for every band so downstream aggregation can compute
1189
 * first/last activity windows per thread without losing information.
1190
 */
1191
static int
1192
scale_parallel_should_log(scale_parallel_context_t const *ctx, int index)
1193
{
1194
    int span;
1195

1196
    if (ctx == NULL || ctx->logger == NULL || !ctx->logger->active) {
×
1197
        return 0;
1198
    }
1199

1200
    if (index < 0) {
×
1201
        return 0;
1202
    }
1203

1204
    if (ctx->pass == SCALE_PASS_HORIZONTAL) {
×
1205
        span = ctx->srch;
1206
    } else {
1207
        span = ctx->dsth;
1208
    }
1209

1210
    if (span <= 0 || index >= span) {
×
1211
        return 0;
1212
    }
1213

1214
    return 1;
1215
}
1216

1217
/*
 * Return the minimum source image size, in bytes, below which the parallel
 * scaler is skipped.
 *
 * The value comes from the SIXEL_SCALE_PARALLEL_MIN_BYTES environment
 * variable and is parsed as a base-10 unsigned integer.  The default of
 * zero preserves the previous eager behavior while permitting deployments
 * to defer threading on tiny inputs.
 *
 * Unset, empty, malformed, out-of-range or negative settings all yield the
 * default of zero.  Values larger than SIZE_MAX are clamped to SIZE_MAX.
 *
 * The environment is consulted only on the first call; the result is cached
 * in function-local statics.  No synchronization is performed here.
 */
static size_t
scale_parallel_min_bytes(void)
{
    static int initialized = 0;
    static size_t threshold = 0;
    char const *text;
    char const *cursor;
    char *endptr;
    unsigned long long parsed;

    if (initialized) {
        return threshold;
    }

    initialized = 1;
    text = getenv("SIXEL_SCALE_PARALLEL_MIN_BYTES");
    if (text == NULL || text[0] == '\0') {
        return threshold;
    }

    /*
     * strtoull() accepts a leading '-' and returns the negated value as an
     * unsigned number without reporting ERANGE, so "-1" would turn into
     * ULLONG_MAX and silently disable threading.  Skip the same leading
     * whitespace strtoull() skips in the "C" locale and reject a minus
     * sign explicitly.
     */
    cursor = text;
    while (*cursor == ' ' || (*cursor >= '\t' && *cursor <= '\r')) {
        cursor++;
    }
    if (*cursor == '-') {
        return threshold;
    }

    errno = 0;
    endptr = NULL;
    parsed = strtoull(text, &endptr, 10);
    if (endptr == text || *endptr != '\0' || errno == ERANGE) {
        /* No digits, trailing garbage, or overflow: keep the default. */
        return threshold;
    }

    if (parsed > (unsigned long long)SIZE_MAX) {
        threshold = SIZE_MAX;
    } else {
        threshold = (size_t)parsed;
    }

    return threshold;
}
25✔
1256

1257
static int
1258
scale_parallel_worker(tp_job_t job, void *userdata, void *workspace)
1259
{
1260
    scale_parallel_context_t *ctx;
1261
    int index;
1262
    char const *role;
1263
    int y0;
1264
    int y1;
1265
    int in0;
1266
    int in1;
1267

1268
    (void)workspace;
1269
    ctx = (scale_parallel_context_t *)userdata;
1270
    if (ctx == NULL) {
×
1271
        return SIXEL_BAD_ARGUMENT;
1272
    }
1273

1274
    role = "horizontal";
1275
    y0 = 0;
1276
    y1 = 0;
1277
    in0 = 0;
1278
    in1 = 0;
1279
    index = job.band_index;
1280
    if (ctx->pass == SCALE_PASS_HORIZONTAL) {
×
1281
        if (index < 0 || index >= ctx->srch) {
×
1282
            return SIXEL_BAD_ARGUMENT;
1283
        }
1284
        y0 = index;
1285
        y1 = index + 1;
1286
        in1 = ctx->dstw;
1287
        if (scale_parallel_should_log(ctx, index)) {
×
1288
            sixel_logger_logf(ctx->logger,
1289
                              role,
1290
                              "scale",
1291
                              "start",
1292
                              index,
1293
                              index,
1294
                              y0,
1295
                              y1,
1296
                              in0,
1297
                              in1,
1298
                              "horizontal pass");
1299
        }
1300
        scale_horizontal_row(ctx->tmp,
1301
                             ctx->src,
1302
                             ctx->srcw,
1303
                             ctx->dstw,
1304
                             ctx->depth,
1305
                             index,
1306
                             ctx->f_resample,
1307
                             ctx->n,
1308
                             ctx->simd_level);
1309
    } else {
1310
        if (index < 0 || index >= ctx->dsth) {
×
1311
            return SIXEL_BAD_ARGUMENT;
1312
        }
1313
        role = "vertical";
1314
        y0 = index;
1315
        y1 = index + 1;
1316
        in1 = ctx->srch;
1317
        if (scale_parallel_should_log(ctx, index)) {
×
1318
            sixel_logger_logf(ctx->logger,
1319
                              role,
1320
                              "scale",
1321
                              "start",
1322
                              index,
1323
                              index,
1324
                              y0,
1325
                              y1,
1326
                              in0,
1327
                              in1,
1328
                              "vertical pass");
1329
        }
1330
        scale_vertical_row(ctx->dst,
1331
                           ctx->tmp,
1332
                           ctx->dstw,
1333
                           ctx->dsth,
1334
                           ctx->depth,
1335
                           ctx->srch,
1336
                           index,
1337
                           ctx->f_resample,
1338
                           ctx->n,
1339
                           ctx->simd_level);
1340
    }
1341

1342
    if (scale_parallel_should_log(ctx, index)) {
×
1343
        sixel_logger_logf(ctx->logger,
1344
                          role,
1345
                          "scale",
1346
                          "finish",
1347
                          index,
1348
                          index,
1349
                          y0,
1350
                          y1,
1351
                          in0,
1352
                          in1,
1353
                          "pass complete");
1354
    }
1355

1356
    return SIXEL_OK;
1357
}
1358

1359
/*
1360
 * Parallel path mirrors the encoder and dither thread selection through
1361
 * sixel_threads_resolve(). Rows become individual jobs for both passes so the
1362
 * caller can saturate the threadpool without altering the filtering math.
1363
 */
1364
static int
1365
scale_with_resampling_parallel(
100✔
1366
    unsigned char *dst,
1367
    unsigned char const *src,
1368
    int const srcw,
1369
    int const srch,
1370
    int const dstw,
1371
    int const dsth,
1372
    int const depth,
1373
    resample_fn_t const f_resample,
1374
    double const n,
1375
    unsigned char *tmp,
1376
    sixel_logger_t *logger)
1377
{
1378
    scale_parallel_context_t ctx;
1379
    threadpool_t *pool;
1380
    tp_job_t job;
1381
    size_t image_bytes;
1382
    int threads;
1383
    int queue_depth;
1384
    int y;
1385
    int rc;
1386
    int logger_ready;
1387

1388
    image_bytes = (size_t)srcw * (size_t)srch * (size_t)depth;
100✔
1389
    if (image_bytes < scale_parallel_min_bytes()) {
100!
1390
        if (logger != NULL) {
×
1391
            sixel_logger_logf(logger,
1392
                              "controller",
1393
                              "scale",
1394
                              "skip",
1395
                              -1,
1396
                              -1,
1397
                              0,
1398
                              0,
1399
                              0,
1400
                              0,
1401
                              "below threshold bytes=%zu",
1402
                              image_bytes);
1403
        }
1404
        return SIXEL_BAD_ARGUMENT;
1405
    }
1406

1407
    threads = sixel_threads_resolve();
100✔
1408
    if (threads < 2) {
100!
1409
        if (logger != NULL) {
100!
1410
            sixel_logger_logf(logger,
1411
                              "controller",
1412
                              "scale",
1413
                              "skip",
1414
                              -1,
1415
                              -1,
1416
                              0,
1417
                              0,
1418
                              0,
1419
                              0,
1420
                              "threads=%d",
1421
                              threads);
1422
        }
1423
        return SIXEL_BAD_ARGUMENT;
100✔
1424
    }
1425

1426
    logger_ready = logger != NULL && logger->active;
×
1427
    if (logger_ready) {
×
1428
        sixel_logger_logf(logger,
1429
                          "controller",
1430
                          "scale",
1431
                          "start",
1432
                          -1,
1433
                          -1,
1434
                          0,
1435
                          srch,
1436
                          0,
1437
                          dsth,
1438
                          "parallel scale src=%dx%d dst=%dx%d",
1439
                          srcw,
1440
                          srch,
1441
                          dstw,
1442
                          dsth);
1443
    }
1444

1445
    ctx.dst = dst;
1446
    ctx.src = src;
1447
    ctx.tmp = tmp;
1448
    ctx.srcw = srcw;
1449
    ctx.srch = srch;
1450
    ctx.dstw = dstw;
1451
    ctx.dsth = dsth;
1452
    ctx.depth = depth;
1453
    ctx.f_resample = f_resample;
1454
    ctx.n = n;
1455
    ctx.simd_level = sixel_scale_simd_level();
1456
    ctx.logger = logger_ready ? logger : NULL;
×
1457

1458
    queue_depth = threads * 3;
1459
    if (queue_depth > srch) {
×
1460
        queue_depth = srch;
1461
    }
1462
    if (queue_depth < 1) {
×
1463
        queue_depth = 1;
1464
    }
1465

1466
    ctx.pass = SCALE_PASS_HORIZONTAL;
1467
    if (logger_ready) {
×
1468
        sixel_logger_logf(logger,
1469
                          "controller",
1470
                          "scale",
1471
                          "pass_start",
1472
                          -1,
1473
                          0,
1474
                          0,
1475
                          srch,
1476
                          0,
1477
                          ctx.dstw,
1478
                          "horizontal queue=%d threads=%d",
1479
                          queue_depth,
1480
                          threads);
1481
    }
1482
    pool = threadpool_create(threads,
1483
                             queue_depth,
1484
                             0,
1485
                             scale_parallel_worker,
1486
                             &ctx);
1487
    if (pool == NULL) {
×
1488
        return SIXEL_BAD_ALLOCATION;
1489
    }
1490

1491
    for (y = 0; y < srch; y++) {
×
1492
        job.band_index = y;
1493
        threadpool_push(pool, job);
1494
    }
1495
    threadpool_finish(pool);
1496
    rc = threadpool_get_error(pool);
1497
    threadpool_destroy(pool);
1498
    if (rc != SIXEL_OK) {
×
1499
        return rc;
1500
    }
1501

1502
    if (logger_ready) {
×
1503
        sixel_logger_logf(logger,
1504
                          "controller",
1505
                          "scale",
1506
                          "pass_finish",
1507
                          -1,
1508
                          srch - 1,
1509
                          0,
1510
                          srch,
1511
                          0,
1512
                          ctx.dstw,
1513
                          "horizontal complete");
1514
    }
1515

1516
    queue_depth = threads * 3;
1517
    if (queue_depth > dsth) {
×
1518
        queue_depth = dsth;
1519
    }
1520
    if (queue_depth < 1) {
×
1521
        queue_depth = 1;
1522
    }
1523

1524
    ctx.pass = SCALE_PASS_VERTICAL;
1525
    if (logger_ready) {
×
1526
        sixel_logger_logf(logger,
1527
                          "controller",
1528
                          "scale",
1529
                          "pass_start",
1530
                          -1,
1531
                          0,
1532
                          0,
1533
                          dsth,
1534
                          0,
1535
                          ctx.srch,
1536
                          "vertical queue=%d threads=%d",
1537
                          queue_depth,
1538
                          threads);
1539
    }
1540
    pool = threadpool_create(threads,
1541
                             queue_depth,
1542
                             0,
1543
                             scale_parallel_worker,
1544
                             &ctx);
1545
    if (pool == NULL) {
×
1546
        return SIXEL_BAD_ALLOCATION;
1547
    }
1548

1549
    for (y = 0; y < dsth; y++) {
×
1550
        job.band_index = y;
1551
        threadpool_push(pool, job);
1552
    }
1553
    threadpool_finish(pool);
1554
    rc = threadpool_get_error(pool);
1555
    threadpool_destroy(pool);
1556

1557
    if (logger_ready) {
×
1558
        sixel_logger_logf(logger,
1559
                          "controller",
1560
                          "scale",
1561
                          "pass_finish",
1562
                          -1,
1563
                          dsth - 1,
1564
                          0,
1565
                          dsth,
1566
                          0,
1567
                          ctx.srch,
1568
                          "vertical complete rc=%d",
1569
                          rc);
1570
        sixel_logger_logf(logger,
1571
                          "controller",
1572
                          "scale",
1573
                          "finish",
1574
                          -1,
1575
                          dsth - 1,
1576
                          0,
1577
                          dsth,
1578
                          0,
1579
                          ctx.srch,
1580
                          "parallel scale status=%d",
1581
                          rc);
1582
    }
1583

1584
    return rc;
1585
}
25✔
1586
#endif /* SIXEL_ENABLE_THREADS */
1587

1588
/*
1589
 * Allocate shared scratch storage and attempt the parallel pipeline first so
1590
 * larger inputs benefit from threading while smaller ones retain the serial
1591
 * behavior.
1592
 */
1593
static void
1594
scale_with_resampling(
125✔
1595
    unsigned char *dst,
1596
    unsigned char const *src,
1597
    int const srcw,
1598
    int const srch,
1599
    int const dstw,
1600
    int const dsth,
1601
    int const depth,
1602
    resample_fn_t const f_resample,
1603
    double n,
1604
    sixel_allocator_t *allocator)
1605
{
1606
    unsigned char *tmp;
1607
    size_t tmp_size;
1608
#if SIXEL_ENABLE_THREADS
1609
    int rc;
1610
    sixel_logger_t logger;
1611
    int logger_prepared;
1612
#endif
1613

1614
#if SIXEL_ENABLE_THREADS
1615
    sixel_logger_init(&logger);
100✔
1616
    logger_prepared = 0;
100✔
1617
    (void)sixel_logger_prepare_env(&logger);
100✔
1618
    logger_prepared = logger.active;
100✔
1619
#endif
1620

1621
    tmp_size = (size_t)dstw * (size_t)srch * (size_t)depth;
125✔
1622
    tmp = (unsigned char *)sixel_allocator_malloc(allocator, tmp_size);
125✔
1623
    if (tmp == NULL) {
125!
1624
#if SIXEL_ENABLE_THREADS
1625
        if (logger_prepared) {
×
1626
            sixel_logger_close(&logger);
1627
        }
1628
#endif
1629
        return;
×
1630
    }
1631

1632
#if SIXEL_ENABLE_THREADS
1633
    rc = scale_with_resampling_parallel(dst,
125!
1634
                                        src,
25✔
1635
                                        srcw,
25✔
1636
                                        srch,
25✔
1637
                                        dstw,
25✔
1638
                                        dsth,
25✔
1639
                                        depth,
25✔
1640
                                        f_resample,
25✔
1641
                                        n,
25✔
1642
                                        tmp,
25✔
1643
                                        logger_prepared
25!
1644
                                            ? &logger
1645
                                            : NULL);
1646
    if (rc == SIXEL_OK) {
100!
1647
        sixel_allocator_free(allocator, tmp);
1648
        if (logger_prepared) {
×
1649
            sixel_logger_close(&logger);
1650
        }
1651
        return;
1652
    }
1653

1654
    if (logger_prepared) {
100!
1655
        sixel_logger_logf(&logger,
1656
                          "controller",
1657
                          "scale",
1658
                          "fallback",
1659
                          -1,
1660
                          -1,
1661
                          0,
1662
                          dsth,
1663
                          0,
1664
                          srch,
1665
                          "parallel rc=%d",
1666
                          rc);
1667
    }
1668
#endif
1669

1670
    scale_with_resampling_serial(dst,
150✔
1671
                                 src,
25✔
1672
                                 srcw,
25✔
1673
                                 srch,
25✔
1674
                                 dstw,
25✔
1675
                                 dsth,
25✔
1676
                                 depth,
25✔
1677
                                 f_resample,
25✔
1678
                                 n,
25✔
1679
                                 tmp);
25✔
1680

1681
    sixel_allocator_free(allocator, tmp);
125✔
1682
#if SIXEL_ENABLE_THREADS
1683
    if (logger_prepared) {
100!
1684
        sixel_logger_close(&logger);
1685
    }
1686
#endif
1687
}
25✔
1688

1689
static void
1690
scale_with_resampling_float32(
×
1691
    float *dst,
1692
    float const *src,
1693
    int const srcw,
1694
    int const srch,
1695
    int const dstw,
1696
    int const dsth,
1697
    int const depth,
1698
    resample_fn_t const f_resample,
1699
    double n,
1700
    sixel_allocator_t *allocator)
1701
{
1702
    int w;
1703
    int h;
1704
    int x;
1705
    int y;
1706
    int i;
1707
    int pos;
1708
    int x_first;
1709
    int x_last;
1710
    int y_first;
1711
    int y_last;
1712
    double center_x;
1713
    double center_y;
1714
    double diff_x;
1715
    double diff_y;
1716
    double weight;
1717
    double total;
1718
    double offsets[8];
1719
    float *tmp;
1720
#if defined(SIXEL_USE_SSE2) || defined(SIXEL_USE_NEON)
1721
    float vecbuf[4];
1722
#endif
1723
    int simd_level;
1724
#if defined(SIXEL_USE_AVX512)
1725
    __m512 acc512;
1726
#endif
1727
#if defined(SIXEL_USE_AVX2) || defined(SIXEL_USE_AVX)
1728
    __m256 acc256;
1729
#endif
1730
#if defined(SIXEL_USE_SSE2)
1731
    __m128 acc128;
1732
    __m128 pixf128;
1733
    __m128 wv128;
1734
    __m128 scalev128;
1735
    __m128 minv128;
1736
    __m128 maxv128;
1737
#elif defined(SIXEL_USE_NEON)
1738
    float32x4_t acc_neon;
1739
    float32x4_t pixf_neon;
1740
    float32x4_t wv_neon;
1741
    float32x4_t scalev_neon;
1742
    float32x4_t minv_neon;
1743
    float32x4_t maxv_neon;
1744
#endif
1745

1746
    tmp = (float *)sixel_allocator_malloc(
×
1747
        allocator,
1748
        (size_t)(dstw * srch * depth * (int)sizeof(float)));
×
1749
    if (tmp == NULL) {
×
1750
        return;
×
1751
    }
1752

1753
    simd_level = sixel_scale_simd_level();
×
1754

1755
    for (y = 0; y < srch; y++) {
×
1756
        for (w = 0; w < dstw; w++) {
×
1757
            total = 0.0;
×
1758
            for (i = 0; i < depth; i++) {
×
1759
                offsets[i] = 0.0;
×
1760
            }
1761

1762
            if (dstw >= srcw) {
×
1763
                center_x = (w + 0.5) * srcw / dstw;
×
1764
                x_first = MAX(center_x - n, 0);
×
1765
                x_last = MIN(center_x + n, srcw - 1);
×
1766
            } else {
1767
                center_x = w + 0.5;
×
1768
                x_first = MAX(floor((center_x - n) * srcw / dstw), 0);
×
1769
                x_last = MIN(floor((center_x + n) * srcw / dstw),
×
1770
                             srcw - 1);
1771
            }
1772

1773
#if defined(SIXEL_USE_AVX512)
1774
            if (depth == 3 &&
×
1775
                simd_level >= SIXEL_SIMD_LEVEL_AVX512) {
1776
                acc512 = sixel_avx512_zero_ps();
1777

1778
                for (x = x_first; x <= x_last; x++) {
×
1779
                    diff_x = (dstw >= srcw)
1780
                                 ? (x + 0.5) - center_x
1781
                                 : (x + 0.5) * srcw / dstw - center_x;
×
1782
                    weight = f_resample(fabs(diff_x));
1783
                    pos = (y * srcw + x) * depth;
1784
                    acc512 = sixel_avx512_muladd_ps(
1785
                        acc512,
1786
                        sixel_avx512_load_rgb_f32(src + pos),
1787
                        (float)weight);
1788
                    total += weight;
1789
                }
1790
                if (total > 0.0) {
×
1791
                    pos = (y * dstw + w) * depth;
1792
                    sixel_avx512_store_rgb_f32(acc512, total, tmp + pos);
1793
                }
1794
            } else
1795
#endif
1796
#if defined(SIXEL_USE_AVX2)
1797
            if (depth == 3 && simd_level >= SIXEL_SIMD_LEVEL_AVX2) {
×
1798
                acc256 = sixel_avx2_zero_ps();
1799

1800
                for (x = x_first; x <= x_last; x++) {
×
1801
                    diff_x = (dstw >= srcw)
1802
                                 ? (x + 0.5) - center_x
1803
                                 : (x + 0.5) * srcw / dstw - center_x;
×
1804
                    weight = f_resample(fabs(diff_x));
1805
                    pos = (y * srcw + x) * depth;
1806
                    acc256 = sixel_avx2_muladd_ps(
1807
                        acc256,
1808
                        sixel_avx2_load_rgb_f32(src + pos),
1809
                        (float)weight);
1810
                    total += weight;
1811
                }
1812
                if (total > 0.0) {
×
1813
                    pos = (y * dstw + w) * depth;
1814
                    sixel_avx2_store_rgb_f32(acc256, total, tmp + pos);
1815
                }
1816
            } else
1817
#endif
1818
#if defined(SIXEL_USE_AVX)
1819
            if (depth == 3 && simd_level >= SIXEL_SIMD_LEVEL_AVX) {
×
1820
                acc256 = sixel_avx_zero_ps();
1821

1822
                for (x = x_first; x <= x_last; x++) {
×
1823
                    diff_x = (dstw >= srcw)
1824
                                 ? (x + 0.5) - center_x
1825
                                 : (x + 0.5) * srcw / dstw - center_x;
×
1826
                    weight = f_resample(fabs(diff_x));
1827
                    pos = (y * srcw + x) * depth;
1828
                    acc256 = sixel_avx_muladd_ps(
1829
                        acc256,
1830
                        sixel_avx_load_rgb_f32(src + pos),
1831
                        (float)weight);
1832
                    total += weight;
1833
                }
1834
                if (total > 0.0) {
×
1835
                    pos = (y * dstw + w) * depth;
1836
                    sixel_avx_store_rgb_f32(acc256, total, tmp + pos);
1837
                }
1838
            } else
1839
#endif
1840
#if defined(SIXEL_USE_SSE2) || defined(SIXEL_USE_NEON)
1841
            if (depth == 3
×
1842
# if defined(SIXEL_USE_SSE2)
1843
                && simd_level >= SIXEL_SIMD_LEVEL_SSE2
×
1844
# elif defined(SIXEL_USE_NEON)
1845
                && simd_level >= SIXEL_SIMD_LEVEL_NEON
×
1846
# endif
1847
                ) {
1848
#if defined(SIXEL_USE_SSE2)
1849
                acc128 = _mm_setzero_ps();
1850
                minv128 = _mm_set1_ps(0.0f);
1851
                maxv128 = _mm_set1_ps(1.0f);
1852
#elif defined(SIXEL_USE_NEON)
1853
                acc_neon = vdupq_n_f32(0.0f);
1854
                minv_neon = vdupq_n_f32(0.0f);
1855
                maxv_neon = vdupq_n_f32(1.0f);
1856
#endif
1857
                for (x = x_first; x <= x_last; x++) {
×
1858
                    diff_x = (dstw >= srcw)
×
1859
                                 ? (x + 0.5) - center_x
1860
                                 : (x + 0.5) * srcw / dstw - center_x;
×
1861
                    weight = f_resample(fabs(diff_x));
1862
                    pos = (y * srcw + x) * depth;
1863
                    const float *psrc = src + pos;
1864
#if defined(SIXEL_USE_SSE2)
1865
                    pixf128 = _mm_set_ps(
1866
                        0.0f, psrc[2], psrc[1], psrc[0]);
1867
                    wv128 = _mm_set1_ps((float)weight);
1868
                    acc128 = _mm_add_ps(acc128,
1869
                                        _mm_mul_ps(pixf128, wv128));
1870
#else /* NEON */
1871
                    /*
1872
                     * Expand the RGB triple into a NEON vector without
1873
                     * brace initialization to keep older toolchains
1874
                     * happy.
1875
                     */
1876
                    pixf_neon = vdupq_n_f32(0.0f);
1877
                    pixf_neon = vsetq_lane_f32(psrc[0], pixf_neon, 0);
1878
                    pixf_neon = vsetq_lane_f32(psrc[1], pixf_neon, 1);
1879
                    pixf_neon = vsetq_lane_f32(psrc[2], pixf_neon, 2);
1880
                    wv_neon = vdupq_n_f32((float)weight);
1881
                    acc_neon = vmlaq_f32(acc_neon, pixf_neon, wv_neon);
1882
#endif
1883
                    total += weight;
1884
                }
1885
                if (total > 0.0) {
×
1886
#if defined(SIXEL_USE_SSE2)
1887
                    scalev128 = _mm_set1_ps((float)(1.0 / total));
1888
                    acc128 = _mm_mul_ps(acc128, scalev128);
1889
                    acc128 = _mm_max_ps(minv128,
1890
                                        _mm_min_ps(acc128, maxv128));
1891
                    _mm_storeu_ps(vecbuf, acc128);
1892
#else /* NEON */
1893
                    scalev_neon = vdupq_n_f32(
1894
                        (float)(1.0 / total));
1895
                    acc_neon = vmulq_f32(acc_neon, scalev_neon);
1896
                    acc_neon = vmaxq_f32(minv_neon,
1897
                                         vminq_f32(acc_neon, maxv_neon));
1898
                    vst1q_f32(vecbuf, acc_neon);
1899
#endif
1900
                    pos = (y * dstw + w) * depth;
1901
                    tmp[pos + 0] = vecbuf[0];
1902
                    tmp[pos + 1] = vecbuf[1];
1903
                    tmp[pos + 2] = vecbuf[2];
1904
                }
1905
            } else
1906
#endif
1907
            {
1908
                for (x = x_first; x <= x_last; x++) {
×
1909
                    diff_x = (dstw >= srcw)
×
1910
                                 ? (x + 0.5) - center_x
×
1911
                                 : (x + 0.5) * srcw / dstw - center_x;
×
1912
                    weight = f_resample(fabs(diff_x));
×
1913
                    for (i = 0; i < depth; i++) {
×
1914
                        pos = (y * srcw + x) * depth + i;
×
1915
                        offsets[i] += src[pos] * weight;
×
1916
                    }
1917
                    total += weight;
×
1918
                }
1919

1920
                if (total > 0.0) {
×
1921
                    for (i = 0; i < depth; i++) {
×
1922
                        pos = (y * dstw + w) * depth + i;
×
1923
                        tmp[pos] = sixel_clamp_unit_f32(
×
1924
                            (float)(offsets[i] / total));
×
1925
                    }
1926
                }
1927
            }
1928
        }
1929
    }
1930

1931
    for (h = 0; h < dsth; h++) {
×
1932
        for (w = 0; w < dstw; w++) {
×
1933
            total = 0.0;
×
1934
            for (i = 0; i < depth; i++) {
×
1935
                offsets[i] = 0.0;
×
1936
            }
1937

1938
            if (dsth >= srch) {
×
1939
                center_y = (h + 0.5) * srch / dsth;
×
1940
                y_first = MAX(center_y - n, 0);
×
1941
                y_last = MIN(center_y + n, srch - 1);
×
1942
            } else {
1943
                center_y = h + 0.5;
×
1944
                y_first = MAX(floor((center_y - n) * srch / dsth), 0);
×
1945
                y_last = MIN(floor((center_y + n) * srch / dsth),
×
1946
                             srch - 1);
1947
            }
1948

1949
#if defined(SIXEL_USE_AVX512)
1950
            if (depth == 3 &&
×
1951
                simd_level >= SIXEL_SIMD_LEVEL_AVX512) {
1952
                acc512 = sixel_avx512_zero_ps();
1953

1954
                for (y = y_first; y <= y_last; y++) {
×
1955
                    diff_y = (dsth >= srch)
1956
                                 ? (y + 0.5) - center_y
1957
                                 : (y + 0.5) * dsth / srch - center_y;
×
1958
                    weight = f_resample(fabs(diff_y));
1959
                    pos = (y * dstw + w) * depth;
1960
                    acc512 = sixel_avx512_muladd_ps(
1961
                        acc512,
1962
                        sixel_avx512_load_rgb_f32(tmp + pos),
1963
                        (float)weight);
1964
                    total += weight;
1965
                }
1966
                if (total > 0.0) {
×
1967
                    pos = (h * dstw + w) * depth;
1968
                    sixel_avx512_store_rgb_f32(acc512, total, dst + pos);
1969
                }
1970
            } else
1971
#endif
1972
#if defined(SIXEL_USE_AVX2)
1973
            if (depth == 3 && simd_level >= SIXEL_SIMD_LEVEL_AVX2) {
×
1974
                acc256 = sixel_avx2_zero_ps();
1975

1976
                for (y = y_first; y <= y_last; y++) {
×
1977
                    diff_y = (dsth >= srch)
1978
                                 ? (y + 0.5) - center_y
1979
                                 : (y + 0.5) * dsth / srch - center_y;
×
1980
                    weight = f_resample(fabs(diff_y));
1981
                    pos = (y * dstw + w) * depth;
1982
                    acc256 = sixel_avx2_muladd_ps(
1983
                        acc256,
1984
                        sixel_avx2_load_rgb_f32(tmp + pos),
1985
                        (float)weight);
1986
                    total += weight;
1987
                }
1988
                if (total > 0.0) {
×
1989
                    pos = (h * dstw + w) * depth;
1990
                    sixel_avx2_store_rgb_f32(acc256, total, dst + pos);
1991
                }
1992
            } else
1993
#endif
1994
#if defined(SIXEL_USE_AVX)
1995
            if (depth == 3 && simd_level >= SIXEL_SIMD_LEVEL_AVX) {
×
1996
                acc256 = sixel_avx_zero_ps();
1997

1998
                for (y = y_first; y <= y_last; y++) {
×
1999
                    diff_y = (dsth >= srch)
2000
                                 ? (y + 0.5) - center_y
2001
                                 : (y + 0.5) * dsth / srch - center_y;
×
2002
                    weight = f_resample(fabs(diff_y));
2003
                    pos = (y * dstw + w) * depth;
2004
                    acc256 = sixel_avx_muladd_ps(
2005
                        acc256,
2006
                        sixel_avx_load_rgb_f32(tmp + pos),
2007
                        (float)weight);
2008
                    total += weight;
2009
                }
2010
                if (total > 0.0) {
×
2011
                    pos = (h * dstw + w) * depth;
2012
                    sixel_avx_store_rgb_f32(acc256, total, dst + pos);
2013
                }
2014
            } else
2015
#endif
2016
#if defined(SIXEL_USE_SSE2) || defined(SIXEL_USE_NEON)
2017
            if (depth == 3
×
2018
# if defined(SIXEL_USE_SSE2)
2019
                && simd_level >= SIXEL_SIMD_LEVEL_SSE2
×
2020
# elif defined(SIXEL_USE_NEON)
2021
                && simd_level >= SIXEL_SIMD_LEVEL_NEON
×
2022
# endif
2023
                ) {
2024
#if defined(SIXEL_USE_SSE2)
2025
                acc128 = _mm_setzero_ps();
2026
                minv128 = _mm_set1_ps(0.0f);
2027
                maxv128 = _mm_set1_ps(1.0f);
2028
#elif defined(SIXEL_USE_NEON)
2029
                acc_neon = vdupq_n_f32(0.0f);
2030
                minv_neon = vdupq_n_f32(0.0f);
2031
                maxv_neon = vdupq_n_f32(1.0f);
2032
#endif
2033
                for (y = y_first; y <= y_last; y++) {
×
2034
                    diff_y = (dsth >= srch)
×
2035
                                 ? (y + 0.5) - center_y
2036
                                 : (y + 0.5) * dsth / srch - center_y;
×
2037
                    weight = f_resample(fabs(diff_y));
2038
                    pos = (y * dstw + w) * depth;
2039
                    const float *psrc = tmp + pos;
2040
#if defined(SIXEL_USE_SSE2)
2041
                    pixf128 = _mm_set_ps(
2042
                        0.0f, psrc[2], psrc[1], psrc[0]);
2043
                    wv128 = _mm_set1_ps((float)weight);
2044
                    acc128 = _mm_add_ps(acc128,
2045
                                        _mm_mul_ps(pixf128, wv128));
2046
#else /* NEON */
2047
                    /*
2048
                     * Expand the RGB triple into a NEON vector without
2049
                     * brace initialization to keep older toolchains
2050
                     * happy.
2051
                     */
2052
                    pixf_neon = vdupq_n_f32(0.0f);
2053
                    pixf_neon = vsetq_lane_f32(psrc[0], pixf_neon, 0);
2054
                    pixf_neon = vsetq_lane_f32(psrc[1], pixf_neon, 1);
2055
                    pixf_neon = vsetq_lane_f32(psrc[2], pixf_neon, 2);
2056
                    wv_neon = vdupq_n_f32((float)weight);
2057
                    acc_neon = vmlaq_f32(acc_neon, pixf_neon, wv_neon);
2058
#endif
2059
                    total += weight;
2060
                }
2061
                if (total > 0.0) {
×
2062
#if defined(SIXEL_USE_SSE2)
2063
                    scalev128 = _mm_set1_ps((float)(1.0 / total));
2064
                    acc128 = _mm_mul_ps(acc128, scalev128);
2065
                    acc128 = _mm_max_ps(minv128,
2066
                                        _mm_min_ps(acc128, maxv128));
2067
                    _mm_storeu_ps(vecbuf, acc128);
2068
#else /* NEON */
2069
                    scalev_neon = vdupq_n_f32(
2070
                        (float)(1.0 / total));
2071
                    acc_neon = vmulq_f32(acc_neon, scalev_neon);
2072
                    acc_neon = vmaxq_f32(minv_neon,
2073
                                         vminq_f32(acc_neon, maxv_neon));
2074
                    vst1q_f32(vecbuf, acc_neon);
2075
#endif
2076
                    pos = (h * dstw + w) * depth;
2077
                    dst[pos + 0] = vecbuf[0];
2078
                    dst[pos + 1] = vecbuf[1];
2079
                    dst[pos + 2] = vecbuf[2];
2080
                }
2081
            } else
2082
#endif
2083
            {
2084
                for (y = y_first; y <= y_last; y++) {
×
2085
                    diff_y = (dsth >= srch)
×
2086
                                 ? (y + 0.5) - center_y
×
2087
                                 : (y + 0.5) * dsth / srch - center_y;
×
2088
                    weight = f_resample(fabs(diff_y));
×
2089
                    for (i = 0; i < depth; i++) {
×
2090
                        pos = (y * dstw + w) * depth + i;
×
2091
                        offsets[i] += tmp[pos] * weight;
×
2092
                    }
2093
                    total += weight;
×
2094
                }
2095

2096
                if (total > 0.0) {
×
2097
                    for (i = 0; i < depth; i++) {
×
2098
                        pos = (h * dstw + w) * depth + i;
×
2099
                        dst[pos] = sixel_clamp_unit_f32(
×
2100
                            (float)(offsets[i] / total));
×
2101
                    }
2102
                }
2103
            }
2104
        }
2105
    }
2106

2107
    sixel_allocator_free(allocator, tmp);
×
2108
}
2109

2110

2111
SIXELAPI int
2112
sixel_helper_scale_image(
155✔
2113
    unsigned char       /* out */ *dst,
2114
    unsigned char const /* in */  *src,                   /* source image data */
2115
    int                 /* in */  srcw,                   /* source image width */
2116
    int                 /* in */  srch,                   /* source image height */
2117
    int                 /* in */  pixelformat,            /* one of enum pixelFormat */
2118
    int                 /* in */  dstw,                   /* destination image width */
2119
    int                 /* in */  dsth,                   /* destination image height */
2120
    int                 /* in */  method_for_resampling,  /* one of methodForResampling */
2121
    sixel_allocator_t   /* in */  *allocator)             /* allocator object */
2122
{
2123
    /*
2124
     * Convert the source image to RGB24 if necessary and scale it to the
2125
     * requested destination size.  The caller supplies an allocator used
2126
     * for any temporary buffers required during conversion or filtering.
2127
     */
2128
    int const depth = sixel_helper_compute_depth(pixelformat);
155✔
2129
    unsigned char *new_src = NULL;  /* optional converted source buffer */
155✔
2130
    int nret;
2131
    int new_pixelformat;
2132

2133
    /* ensure the scaler operates on RGB triples */
2134
    if (depth != 3) {
155!
2135
        new_src = (unsigned char *)sixel_allocator_malloc(allocator,
×
2136
                                                          (size_t)(srcw * srch * 3));
×
2137
        if (new_src == NULL) {
×
2138
            return (-1);
×
2139
        }
2140
        nret = sixel_helper_normalize_pixelformat(new_src,
×
2141
                                                  &new_pixelformat,
2142
                                                  src, pixelformat,
2143
                                                  srcw, srch);
2144
        if (nret != 0) {
×
2145
            sixel_allocator_free(allocator, new_src);
×
2146
            return (-1);
×
2147
        }
2148

2149
        src = new_src;  /* use converted buffer from here on */
×
2150
    } else {
2151
        new_pixelformat = pixelformat;
155✔
2152
    }
2153

2154
    /* choose re-sampling strategy */
2155
    switch (method_for_resampling) {
155!
2156
    case SIXEL_RES_NEAREST:
24✔
2157
        scale_without_resampling(dst, src, srcw, srch, dstw, dsth, depth);
30✔
2158
        break;
30✔
2159
    case SIXEL_RES_GAUSSIAN:
4✔
2160
        scale_with_resampling(dst, src, srcw, srch, dstw, dsth, depth,
6✔
2161
                              gaussian, 1.0, allocator);
1✔
2162
        break;
5✔
2163
    case SIXEL_RES_HANNING:
4✔
2164
        scale_with_resampling(dst, src, srcw, srch, dstw, dsth, depth,
6✔
2165
                              hanning, 1.0, allocator);
1✔
2166
        break;
5✔
2167
    case SIXEL_RES_HAMMING:
4✔
2168
        scale_with_resampling(dst, src, srcw, srch, dstw, dsth, depth,
6✔
2169
                              hamming, 1.0, allocator);
1✔
2170
        break;
5✔
2171
    case SIXEL_RES_WELSH:
4✔
2172
        scale_with_resampling(dst, src, srcw, srch, dstw, dsth, depth,
6✔
2173
                              welsh, 1.0, allocator);
1✔
2174
        break;
5✔
2175
    case SIXEL_RES_BICUBIC:
4✔
2176
        scale_with_resampling(dst, src, srcw, srch, dstw, dsth, depth,
6✔
2177
                              bicubic, 2.0, allocator);
1✔
2178
        break;
5✔
2179
    case SIXEL_RES_LANCZOS2:
8✔
2180
        scale_with_resampling(dst, src, srcw, srch, dstw, dsth, depth,
12✔
2181
                              lanczos2, 2.0, allocator);
2✔
2182
        break;
10✔
2183
    case SIXEL_RES_LANCZOS3:
4✔
2184
        scale_with_resampling(dst, src, srcw, srch, dstw, dsth, depth,
6✔
2185
                              lanczos3, 3.0, allocator);
1✔
2186
        break;
5✔
2187
    case SIXEL_RES_LANCZOS4:
4✔
2188
        scale_with_resampling(dst, src, srcw, srch, dstw, dsth, depth,
6✔
2189
                              lanczos4, 4.0, allocator);
1✔
2190
        break;
5✔
2191
    case SIXEL_RES_BILINEAR:
80✔
2192
    default:
2193
        scale_with_resampling(dst, src, srcw, srch, dstw, dsth, depth,
96✔
2194
                              bilinear, 1.0, allocator);
16✔
2195
        break;
80✔
2196
    }
2197

2198
    /* release temporary copy created for pixel-format normalization */
2199
    sixel_allocator_free(allocator, new_src);
155✔
2200
    return 0;
155✔
2201
}
31✔
2202

2203
SIXELAPI int
2204
sixel_helper_scale_image_float32(
×
2205
    float             /* out */ *dst,
2206
    float const       /* in */  *src,
2207
    int               /* in */  srcw,
2208
    int               /* in */  srch,
2209
    int               /* in */  pixelformat,
2210
    int               /* in */  dstw,
2211
    int               /* in */  dsth,
2212
    int               /* in */  method_for_resampling,
2213
    sixel_allocator_t /* in */  *allocator)
2214
{
2215
    int depth;
2216
    int depth_bytes;
2217

2218
    depth_bytes = sixel_helper_compute_depth(pixelformat);
×
2219
    if (depth_bytes <= 0) {
×
2220
        return (-1);
×
2221
    }
2222

2223
    depth = depth_bytes / (int)sizeof(float);
×
2224
    if (depth * (int)sizeof(float) != depth_bytes) {
×
2225
        return (-1);
×
2226
    }
2227

2228
    switch (method_for_resampling) {
×
2229
    case SIXEL_RES_NEAREST:
2230
        scale_without_resampling_float32(
×
2231
            dst, src, srcw, srch, dstw, dsth, depth);
2232
        break;
×
2233
    case SIXEL_RES_GAUSSIAN:
2234
        scale_with_resampling_float32(
×
2235
            dst, src, srcw, srch, dstw, dsth, depth,
2236
            gaussian, 1.0, allocator);
2237
        break;
×
2238
    case SIXEL_RES_HANNING:
2239
        scale_with_resampling_float32(
×
2240
            dst, src, srcw, srch, dstw, dsth, depth,
2241
            hanning, 1.0, allocator);
2242
        break;
×
2243
    case SIXEL_RES_HAMMING:
2244
        scale_with_resampling_float32(
×
2245
            dst, src, srcw, srch, dstw, dsth, depth,
2246
            hamming, 1.0, allocator);
2247
        break;
×
2248
    case SIXEL_RES_WELSH:
2249
        scale_with_resampling_float32(
×
2250
            dst, src, srcw, srch, dstw, dsth, depth,
2251
            welsh, 1.0, allocator);
2252
        break;
×
2253
    case SIXEL_RES_BICUBIC:
2254
        scale_with_resampling_float32(
×
2255
            dst, src, srcw, srch, dstw, dsth, depth,
2256
            bicubic, 2.0, allocator);
2257
        break;
×
2258
    case SIXEL_RES_LANCZOS2:
2259
        scale_with_resampling_float32(
×
2260
            dst, src, srcw, srch, dstw, dsth, depth,
2261
            lanczos2, 2.0, allocator);
2262
        break;
×
2263
    case SIXEL_RES_LANCZOS3:
2264
        scale_with_resampling_float32(
×
2265
            dst, src, srcw, srch, dstw, dsth, depth,
2266
            lanczos3, 3.0, allocator);
2267
        break;
×
2268
    case SIXEL_RES_LANCZOS4:
2269
        scale_with_resampling_float32(
×
2270
            dst, src, srcw, srch, dstw, dsth, depth,
2271
            lanczos4, 4.0, allocator);
2272
        break;
×
2273
    case SIXEL_RES_BILINEAR:
×
2274
    default:
2275
        scale_with_resampling_float32(
×
2276
            dst, src, srcw, srch, dstw, dsth, depth,
2277
            bilinear, 1.0, allocator);
2278
        break;
×
2279
    }
2280

2281
    return 0;
×
2282
}
2283

2284
#if HAVE_TESTS
2285

2286
/*
 * Straightforward nearest-neighbour scaler used as the expected-output
 * oracle in the unit tests.  Each destination pixel (w, h) copies `depth`
 * bytes from the source pixel at (w * srcw / dstw, h * srch / dsth).
 */
static void
reference_scale(
    unsigned char *dst,
    unsigned char const *src,
    int const srcw,
    int const srch,
    int const dstw,
    int const dsth,
    int const depth)
{
    int row;
    int col;
    int src_row;
    int src_col;
    int channel;
    int src_base;
    int dst_base;

    for (row = 0; row < dsth; row++) {
        /* the source row depends only on the destination row */
        src_row = (long)row * srch / dsth;
        for (col = 0; col < dstw; col++) {
            src_col = (long)col * srcw / dstw;
            src_base = (src_row * srcw + src_col) * depth;
            dst_base = (row * dstw + col) * depth;
            for (channel = 0; channel < depth; channel++) {
                dst[dst_base + channel] = src[src_base + channel];
            }
        }
    }
}
×
2314

2315
/*
 * Run scale_without_resampling() on a synthetic srcw x srch image with
 * `depth` bytes per pixel and compare the result byte-for-byte with
 * reference_scale().  Returns EXIT_SUCCESS when both outputs match and
 * EXIT_FAILURE on a mismatch or when a buffer cannot be allocated.
 */
static int
test_without_resampling_case(
    int srcw,
    int srch,
    int dstw,
    int dsth,
    int depth)
{
    size_t const input_bytes = (size_t)srcw * srch * depth;
    size_t const output_bytes = (size_t)dstw * dsth * depth;
    unsigned char *input = (unsigned char *)malloc(input_bytes);
    unsigned char *expected = (unsigned char *)malloc(output_bytes);
    unsigned char *actual = (unsigned char *)malloc(output_bytes);
    int verdict = EXIT_FAILURE;
    size_t n;

    if (input != NULL && expected != NULL && actual != NULL) {
        /* fill the source with a repeating 0..255 byte ramp */
        for (n = 0; n < input_bytes; ++n) {
            input[n] = (unsigned char)(n & 0xff);
        }

        reference_scale(expected, input, srcw, srch, dstw, dsth, depth);
        scale_without_resampling(actual, input, srcw, srch, dstw, dsth,
                                 depth);

        if (memcmp(expected, actual, output_bytes) == 0) {
            verdict = EXIT_SUCCESS;
        }
    }

    /* free() accepts NULL, so a partially failed allocation is fine here */
    free(input);
    free(expected);
    free(actual);
    return verdict;
}
2357

2358
/*
 * Entry point of the scaler self-tests: runs every geometry in the table
 * through test_without_resampling_case() and stops at the first failure.
 * Returns EXIT_SUCCESS when all cases pass, otherwise the failing case's
 * result.
 */
SIXELAPI int
sixel_scale_tests_main(void)
{
    struct scale_case {
        int srcw;
        int srch;
        int dstw;
        int dsth;
        int depth;
    };
    static struct scale_case const table[] = {
        {8, 4, 3, 7, 3},
        {13, 9, 17, 6, 4}
    };
    size_t const count = sizeof(table) / sizeof(table[0]);
    size_t idx;
    int result;

    for (idx = 0; idx < count; ++idx) {
        result = test_without_resampling_case(table[idx].srcw,
                                              table[idx].srch,
                                              table[idx].dstw,
                                              table[idx].dsth,
                                              table[idx].depth);
        if (result != EXIT_SUCCESS) {
            return result;
        }
    }

    return EXIT_SUCCESS;
}
2390

2391
#endif /* HAVE_TESTS */
2392

2393
#if defined(__GNUC__) && !defined(__clang__)
2394
# pragma GCC diagnostic pop
2395
#endif
2396

2397
/* emacs Local Variables:      */
2398
/* emacs mode: c               */
2399
/* emacs tab-width: 4          */
2400
/* emacs indent-tabs-mode: nil */
2401
/* emacs c-basic-offset: 4     */
2402
/* emacs End:                  */
2403
/* vim: set expandtab ts=4 sts=4 sw=4 : */
2404
/* EOF */
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2026 Coveralls, Inc