• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

saitoha / libsixel / 19765269522

28 Nov 2025 01:30PM UTC coverage: 39.983% (-1.6%) from 41.616%
19765269522

push

github

web-flow
Merge pull request #214 from saitoha/codex/add-logging-to-resize-processing

Limit scale logging and add timeline window controls

9788 of 35562 branches covered (27.52%)

9 of 63 new or added lines in 1 file covered. (14.29%)

281 existing lines in 19 files now uncovered.

12991 of 32491 relevant lines covered (39.98%)

619662.16 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

49.26
/src/scale.c
1
/*
2
 * SPDX-License-Identifier: MIT
3
 *
4
 * Copyright (c) 2021-2025 libsixel developers. See `AUTHORS`.
5
 * Copyright (c) 2014-2016 Hayaki Saito
6
 *
7
 * Permission is hereby granted, free of charge, to any person obtaining a copy of
8
 * this software and associated documentation files (the "Software"), to deal in
9
 * the Software without restriction, including without limitation the rights to
10
 * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
11
 * the Software, and to permit persons to whom the Software is furnished to do so,
12
 * subject to the following conditions:
13
 *
14
 * The above copyright notice and this permission notice shall be included in all
15
 * copies or substantial portions of the Software.
16
 *
17
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
19
 * FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
20
 * COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
21
 * IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
22
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
23
 */
24

25
#include "config.h"
26

27
/* STDC_HEADERS */
28
#include <stdlib.h>
29

30
#if HAVE_ERRNO_H
31
# include <errno.h>
32
#endif  /* HAVE_ERRNO_H */
33
#if HAVE_LIMITS_H
34
# include <limits.h>
35
#endif  /* HAVE_LIMITS_H */
36
#if HAVE_STRING_H
37
# include <string.h>
38
#endif  /* HAVE_STRING_H */
39
#if HAVE_STDINT_H
40
# include <stdint.h>
41
#endif  /* HAVE_STDINT_H */
42

43
#if HAVE_MATH_H
44
# define _USE_MATH_DEFINES  /* for MSVC */
45
# include <math.h>
46
#endif  /* HAVE_MATH_H */
47
#ifndef M_PI
48
# define M_PI 3.14159265358979323846
49
#endif
50

51
#include <sixel.h>
52

53
#include "cpu.h"
54
#include "logger.h"
55

56
#if SIXEL_ENABLE_THREADS
57
# include "sixel_threads_config.h"
58
# include "threadpool.h"
59
#endif
60

61
#if defined(HAVE_IMMINTRIN_H) && \
62
    (defined(__x86_64__) || defined(_M_X64) || defined(__i386) || \
63
     defined(_M_IX86))
64
# define SIXEL_HAS_X86_INTRIN 1
65
# include <immintrin.h>
66
#endif
67

68
#if defined(HAVE_SSE2)
69
# if defined(__SSE2__)
70
#  if defined(HAVE_EMMINTRIN_H)
71
#   include <emmintrin.h>
72
#   define SIXEL_USE_SSE2 1
73
#  endif
74
# endif
75
#endif
76

77
#if defined(SIXEL_HAS_X86_INTRIN)
78
# if defined(__GNUC__)
79
#  if !defined(__clang__)
80
#   define SIXEL_TARGET_AVX __attribute__((target("avx")))
81
#   define SIXEL_TARGET_AVX2 __attribute__((target("avx2")))
82
#   define SIXEL_TARGET_AVX512 __attribute__((target("avx512f")))
83
#   define SIXEL_USE_AVX 1
84
#   define SIXEL_USE_AVX2 1
85
#   define SIXEL_USE_AVX512 1
86
#  else
87
/*
88
 * clang rejects returning AVX vectors when the translation unit target
89
 * does not already include the corresponding ISA.  Guard runtime AVX
90
 * helpers with compile-time ISA availability to keep non-AVX builds
91
 * warning-free while still using AVX when the compiler enables it.
92
 */
93
#   define SIXEL_TARGET_AVX
94
#   define SIXEL_TARGET_AVX2
95
#   define SIXEL_TARGET_AVX512
96
#   if defined(__AVX__)
97
#    define SIXEL_USE_AVX 1
98
#   endif
99
#   if defined(__AVX2__)
100
#    define SIXEL_USE_AVX2 1
101
#   endif
102
#   if defined(__AVX512F__)
103
#    define SIXEL_USE_AVX512 1
104
#   endif
105
#  endif
106
# else
107
#  define SIXEL_TARGET_AVX
108
#  define SIXEL_TARGET_AVX2
109
#  define SIXEL_TARGET_AVX512
110
#  if defined(__AVX__)
111
#   define SIXEL_USE_AVX 1
112
#  endif
113
#  if defined(__AVX2__)
114
#   define SIXEL_USE_AVX2 1
115
#  endif
116
#  if defined(__AVX512F__)
117
#   define SIXEL_USE_AVX512 1
118
#  endif
119
# endif
120
#endif
121

122
#if defined(__GNUC__) && !defined(__clang__)
123
# pragma GCC diagnostic push
124
# pragma GCC diagnostic ignored "-Wpsabi"
125
#endif
126

127
#if defined(HAVE_NEON)
128
# if (defined(__ARM_NEON) || defined(__ARM_NEON__))
129
#  if defined(HAVE_ARM_NEON_H)
130
#   include <arm_neon.h>
131
#   define SIXEL_USE_NEON 1
132
#  endif
133
# endif
134
#endif
135

136
#if !defined(MAX)
137
# define MAX(l, r) ((l) > (r) ? (l) : (r))
138
#endif
139
#if !defined(MIN)
140
#define MIN(l, r) ((l) < (r) ? (l) : (r))
141
#endif
142

143

144
#if 0
145
/*
 * Nearest-neighbour kernel: full weight for distances up to half a
 * pixel, no weight beyond that.
 */
static double
nearest_neighbor(double const d)
{
    return (d <= 0.5) ? 1.0 : 0.0;
}
154
#endif
155

156

157
/*
 * Bi-linear (triangle) kernel.
 * Returns 1 - d for distances below one pixel and 0 otherwise.
 */
static double
bilinear(double const d)
{
    return (d < 1.0) ? 1.0 - d : 0.0;
}
166

167

168
/*
 * Welsh kernel.
 * Returns 1 - d^2 for distances below one pixel and 0 otherwise.
 */
static double
welsh(double const d)
{
    if (!(d < 1.0)) {
        return 0.0;
    }
    return 1.0 - d * d;
}
177

178

179
/*
 * Bi-cubic kernel, defined piecewise on the distance d:
 *   d <= 1      : 1 + (d - 2) * d^2
 *   1 < d <= 2  : 4 - 8d + 5d^2 - d^3   (evaluated in Horner form)
 *   otherwise   : 0
 */
static double
bicubic(double const d)
{
    double weight = 0.0;

    if (d <= 1.0) {
        weight = 1.0 + (d - 2.0) * d * d;
    } else if (d <= 2.0) {
        weight = 4.0 + d * (-8.0 + d * (5.0 - d));
    }
    return weight;
}
191

192

193
/* function sinc
194
 * sinc(x) = sin(PI * x) / (PI * x)
195
 */
196
static double
197
sinc(double const x)
45,685,716✔
198
{
199
    return sin(M_PI * x) / (M_PI * x);
45,685,716✔
200
}
201

202

203
/* function Lanczos-2
 * Lanczos(d) = 1                      , d == 0
 *            = sinc(d) * sinc(d / 2)  , d < 2
 *            = 0                      , otherwise
 */
static double
lanczos2(double const d)
{
    if (d == 0.0) {
        return 1.0;
    }
    return (d < 2.0) ? sinc(d) * sinc(d / 2.0) : 0.0;
}
218

219

220
/* function Lanczos-3
 * Lanczos(d) = 1                      , d == 0
 *            = sinc(d) * sinc(d / 3)  , d < 3
 *            = 0                      , otherwise
 */
static double
lanczos3(double const d)
{
    if (d == 0.0) {
        return 1.0;
    }
    return (d < 3.0) ? sinc(d) * sinc(d / 3.0) : 0.0;
}
235

236
/* function Lanczos-4
 * Lanczos(d) = 1                      , d == 0
 *            = sinc(d) * sinc(d / 4)  , d < 4
 *            = 0                      , otherwise
 */
static double
lanczos4(double const d)
{
    if (d == 0.0) {
        return 1.0;
    }
    return (d < 4.0) ? sinc(d) * sinc(d / 4.0) : 0.0;
}
251

252

253
static double
254
gaussian(double const d)
2,035,950✔
255
{
256
    return exp(-2.0 * d * d) * sqrt(2.0 / M_PI);
2,035,950✔
257
}
258

259

260
static double
261
hanning(double const d)
2,188,056✔
262
{
263
    return 0.5 + 0.5 * cos(d * M_PI);
2,188,056✔
264
}
265

266

267
static double
268
hamming(const double d)
2,512,200✔
269
{
270
    return 0.54 + 0.46 * cos(d * M_PI);
2,512,200✔
271
}
272

273

274
/*
 * Divide an accumulated channel value by the accumulated weight and
 * convert the quotient to an 8-bit sample.
 *
 * x      weighted sum of samples
 * total  sum of the weights
 *
 * Returns floor(x / total) saturated to 0..255.  The range check is done
 * on the floating-point quotient: converting an out-of-range, infinite or
 * NaN double to an integer type is undefined behaviour, so such values
 * must be caught before the cast.  A NaN quotient maps to 0.
 */
static unsigned char
normalize(double x, double total)
{
    double const quotient = x / total;

    /* written as a negated ">=" so that NaN is caught here as well */
    if (!(quotient >= 0.0)) {
        return 0x00;
    }
    if (quotient >= 255.0) {
        return 0xff;
    }
    /* truncation equals floor() for the non-negative values left here */
    return (unsigned char)quotient;
}
288

289
/*
 * Return the SIMD level reported by sixel_cpu_simd_level(), remembering
 * the answer in a function-local static so later calls reuse it.
 * The value -2 marks the cache as not yet filled.
 */
static int
sixel_scale_simd_level(void)
{
    static int cached_level = -2;

    if (cached_level != -2) {
        return cached_level;
    }
    cached_level = sixel_cpu_simd_level();
    return cached_level;
}
300

301
/*
 * Clamp a float sample to the unit interval [0, 1].
 *
 * Resampling kernels with negative lobes can push linear RGB values
 * outside that interval; clamping here keeps downstream conversions
 * from collapsing to black.
 */
static float
sixel_clamp_unit_f32(float value)
{
    if (value < 0.0f) {
        return 0.0f;
    }
    return (value > 1.0f) ? 1.0f : value;
}
318

319
#if defined(HAVE_IMMINTRIN_H)
320
#if defined(SIXEL_USE_AVX)
321
static SIXEL_TARGET_AVX __m256
322
sixel_avx_load_rgb_ps(unsigned char const *psrc)
323
{
324
    __m128i pixi128;
325
    __m128 pixf128;
326
    __m256 pixf256;
327

328
    /*
329
     * Build the byte vector explicitly so the AVX path never accumulates
330
     * garbage data when widening to 32-bit lanes.
331
     */
332
    pixi128 = _mm_setr_epi8((char)psrc[0],
333
                            (char)psrc[1],
334
                            (char)psrc[2],
335
                            0,
336
                            0, 0, 0, 0,
337
                            0, 0, 0, 0,
338
                            0, 0, 0, 0);
339
    pixf128 = _mm_cvtepi32_ps(pixi128);
340
    pixf256 = _mm256_castps128_ps256(pixf128);
341
    pixf256 = _mm256_insertf128_ps(pixf256, _mm_setzero_ps(), 1);
342
    return pixf256;
343
}
344

345
/*
 * Scale the accumulator by 1 / total, clamp every lane to [0, 255],
 * convert to 32-bit integers with _mm256_cvtps_epi32 and write lanes
 * 0..2 to dst[0..2] as bytes.
 */
static SIXEL_TARGET_AVX void
sixel_avx_store_rgb_u8(__m256 acc, double total, unsigned char *dst)
{
    __m256 lower;
    __m256 upper;
    __m256 scaled;
    int lanes[8];
    int ch;

    scaled = _mm256_mul_ps(acc, _mm256_set1_ps((float)(1.0 / total)));
    lower = _mm256_set1_ps(0.0f);
    upper = _mm256_set1_ps(255.0f);
    scaled = _mm256_max_ps(lower, _mm256_min_ps(scaled, upper));
    _mm256_storeu_si256((__m256i *)lanes, _mm256_cvtps_epi32(scaled));
    for (ch = 0; ch < 3; ch++) {
        dst[ch] = (unsigned char)lanes[ch];
    }
}
365

366
static SIXEL_TARGET_AVX __m256
367
sixel_avx_zero_ps(void)
368
{
369
    return _mm256_setzero_ps();
370
}
371

372
static SIXEL_TARGET_AVX __m256
373
sixel_avx_muladd_ps(__m256 acc, __m256 pix, float weight)
374
{
375
    __m256 wv;
376

377
    wv = _mm256_set1_ps(weight);
378
    return _mm256_add_ps(acc, _mm256_mul_ps(pix, wv));
379
}
380

381
static SIXEL_TARGET_AVX __m256
382
sixel_avx_load_rgb_f32(float const *psrc)
383
{
384
    __m256 pixf;
385

386
    pixf = _mm256_set_ps(0.0f, 0.0f, 0.0f, 0.0f,
387
                         psrc[2], psrc[1], psrc[0], 0.0f);
388
    return pixf;
389
}
390

391
/*
 * Scale the accumulator by 1 / total, clamp every lane to [0, 1] and
 * write lanes 0..2 to dst[0..2].
 */
static SIXEL_TARGET_AVX void
sixel_avx_store_rgb_f32(__m256 acc, double total, float *dst)
{
    __m256 lower;
    __m256 upper;
    __m256 scaled;
    float lanes[8];
    int ch;

    scaled = _mm256_mul_ps(acc, _mm256_set1_ps((float)(1.0 / total)));
    lower = _mm256_set1_ps(0.0f);
    upper = _mm256_set1_ps(1.0f);
    scaled = _mm256_max_ps(lower, _mm256_min_ps(scaled, upper));
    _mm256_storeu_ps(lanes, scaled);
    for (ch = 0; ch < 3; ch++) {
        dst[ch] = lanes[ch];
    }
}
409
#endif  /* SIXEL_USE_AVX */
410

411
#if defined(SIXEL_USE_AVX2)
412
static SIXEL_TARGET_AVX2 __m256
413
sixel_avx2_load_rgb_ps(unsigned char const *psrc)
50,737,556✔
414
{
415
    __m128i pixi128;
416
    __m256i pixi256;
417

418
    /*
419
     * Keep the unused bytes zeroed so widening to epi32 does not pull in
420
     * stack junk and bias every output channel toward white.
421
     */
422
    pixi128 = _mm_setr_epi8((char)psrc[0],
50,737,556✔
423
                            (char)psrc[1],
50,737,556✔
424
                            (char)psrc[2],
50,737,556✔
425
                            0,
426
                            0, 0, 0, 0,
427
                            0, 0, 0, 0,
428
                            0, 0, 0, 0);
429
    pixi256 = _mm256_cvtepu8_epi32(pixi128);
50,737,556✔
430
    return _mm256_cvtepi32_ps(pixi256);
50,737,556✔
431
}
432

433
/*
 * Scale the accumulator by 1 / total, clamp every lane to [0, 255],
 * convert to 32-bit integers with _mm256_cvtps_epi32 and write lanes
 * 0..2 to dst[0..2] as bytes.
 */
static SIXEL_TARGET_AVX2 void
sixel_avx2_store_rgb_u8(__m256 acc, double total, unsigned char *dst)
{
    __m256 floor_v;
    __m256 ceil_v;
    __m256 result;
    int lanes[8];
    int ch;

    result = _mm256_mul_ps(acc, _mm256_set1_ps((float)(1.0 / total)));
    floor_v = _mm256_set1_ps(0.0f);
    ceil_v = _mm256_set1_ps(255.0f);
    result = _mm256_max_ps(floor_v, _mm256_min_ps(result, ceil_v));
    _mm256_storeu_si256((__m256i *)lanes, _mm256_cvtps_epi32(result));
    for (ch = 0; ch < 3; ch++) {
        dst[ch] = (unsigned char)lanes[ch];
    }
}
453

454
static SIXEL_TARGET_AVX2 __m256
455
sixel_avx2_zero_ps(void)
5,882,512✔
456
{
457
    return _mm256_setzero_ps();
5,882,512✔
458
}
459

460
static SIXEL_TARGET_AVX2 __m256
461
sixel_avx2_muladd_ps(__m256 acc, __m256 pix, float weight)
50,737,556✔
462
{
463
    __m256 wv;
464

465
    wv = _mm256_set1_ps(weight);
50,737,556✔
466
    return _mm256_add_ps(acc, _mm256_mul_ps(pix, wv));
101,475,112✔
467
}
468

469
static SIXEL_TARGET_AVX2 __m256
470
sixel_avx2_load_rgb_f32(float const *psrc)
471
{
472
    __m256 pixf;
473

474
    pixf = _mm256_set_ps(0.0f, 0.0f, 0.0f, 0.0f,
475
                         psrc[2], psrc[1], psrc[0], 0.0f);
476
    return pixf;
477
}
478

479
/*
 * Scale the accumulator by 1 / total, clamp every lane to [0, 1] and
 * write lanes 0..2 to dst[0..2].
 */
static SIXEL_TARGET_AVX2 void
sixel_avx2_store_rgb_f32(__m256 acc, double total, float *dst)
{
    __m256 floor_v;
    __m256 ceil_v;
    __m256 result;
    float lanes[8];
    int ch;

    result = _mm256_mul_ps(acc, _mm256_set1_ps((float)(1.0 / total)));
    floor_v = _mm256_set1_ps(0.0f);
    ceil_v = _mm256_set1_ps(1.0f);
    result = _mm256_max_ps(floor_v, _mm256_min_ps(result, ceil_v));
    _mm256_storeu_ps(lanes, result);
    for (ch = 0; ch < 3; ch++) {
        dst[ch] = lanes[ch];
    }
}
497
#endif  /* SIXEL_USE_AVX2 */
498

499
#if defined(SIXEL_USE_AVX512)
500
static SIXEL_TARGET_AVX512 __m512
501
sixel_avx512_load_rgb_ps(unsigned char const *psrc)
502
{
503
    __m128i pixi128;
504
    __m512i pixi512;
505

506
    pixi128 = _mm_setr_epi8((char)psrc[0],
507
                            (char)psrc[1],
508
                            (char)psrc[2],
509
                            0,
510
                            0, 0, 0, 0,
511
                            0, 0, 0, 0,
512
                            0, 0, 0, 0);
513
    pixi512 = _mm512_cvtepu8_epi32(pixi128);
514
    return _mm512_cvtepi32_ps(pixi512);
515
}
516

517
/*
 * Scale the accumulator by 1 / total, clamp every lane to [0, 255],
 * convert to 32-bit integers with _mm512_cvtps_epi32 and write lanes
 * 0..2 to dst[0..2] as bytes.
 */
static SIXEL_TARGET_AVX512 void
sixel_avx512_store_rgb_u8(__m512 acc, double total, unsigned char *dst)
{
    __m512 lower;
    __m512 upper;
    __m512 scaled;
    int lanes[16];
    int ch;

    scaled = _mm512_mul_ps(acc, _mm512_set1_ps((float)(1.0 / total)));
    lower = _mm512_set1_ps(0.0f);
    upper = _mm512_set1_ps(255.0f);
    scaled = _mm512_max_ps(lower, _mm512_min_ps(scaled, upper));
    _mm512_storeu_si512((void *)lanes, _mm512_cvtps_epi32(scaled));
    for (ch = 0; ch < 3; ch++) {
        dst[ch] = (unsigned char)lanes[ch];
    }
}
537

538
static SIXEL_TARGET_AVX512 __m512
539
sixel_avx512_zero_ps(void)
540
{
541
    return _mm512_setzero_ps();
542
}
543

544
static SIXEL_TARGET_AVX512 __m512
545
sixel_avx512_muladd_ps(__m512 acc, __m512 pix, float weight)
546
{
547
    __m512 wv;
548

549
    wv = _mm512_set1_ps(weight);
550
    return _mm512_add_ps(acc, _mm512_mul_ps(pix, wv));
551
}
552

553
static SIXEL_TARGET_AVX512 __m512
554
sixel_avx512_load_rgb_f32(float const *psrc)
555
{
556
    __m512 pixf;
557

558
    pixf = _mm512_set_ps(0.0f, 0.0f, 0.0f, 0.0f,
559
                         0.0f, 0.0f, 0.0f, 0.0f,
560
                         0.0f, 0.0f, 0.0f, 0.0f,
561
                         psrc[2], psrc[1], psrc[0], 0.0f);
562
    return pixf;
563
}
564

565
/*
 * Scale the accumulator by 1 / total, clamp every lane to [0, 1] and
 * write lanes 0..2 to dst[0..2].
 */
static SIXEL_TARGET_AVX512 void
sixel_avx512_store_rgb_f32(__m512 acc, double total, float *dst)
{
    __m512 lower;
    __m512 upper;
    __m512 scaled;
    float lanes[16];
    int ch;

    scaled = _mm512_mul_ps(acc, _mm512_set1_ps((float)(1.0 / total)));
    lower = _mm512_set1_ps(0.0f);
    upper = _mm512_set1_ps(1.0f);
    scaled = _mm512_max_ps(lower, _mm512_min_ps(scaled, upper));
    _mm512_storeu_ps(lanes, scaled);
    for (ch = 0; ch < 3; ch++) {
        dst[ch] = lanes[ch];
    }
}
583
#endif  /* SIXEL_USE_AVX512 */
584
#endif /* HAVE_IMMINTRIN_H */
585

586

587
/*
 * Nearest-sample scaling of an interleaved 8-bit image.
 *
 * dst    output buffer, dstw * dsth pixels of `depth` bytes each
 * src    input buffer, srcw * srch pixels of `depth` bytes each
 * depth  bytes per pixel
 *
 * Destination pixel (w, h) is copied from source pixel
 * (w * srcw / dstw, h * srch / dsth), using integer division.
 *
 * The coordinate products are evaluated in `long long` and the buffer
 * offsets in `size_t`: `long` is only 32 bits wide on LLP64 targets and
 * an `int` byte offset overflows once an image exceeds INT_MAX bytes.
 * Nothing is written when any dimension or the depth is not positive.
 */
static void
scale_without_resampling(
    unsigned char *dst,
    unsigned char const *src,
    int const srcw,
    int const srch,
    int const dstw,
    int const dsth,
    int const depth)
{
    int w;
    int h;
    int i;
    size_t sx;
    size_t sy;
    unsigned char const *src_px;
    unsigned char *dst_px;

    if (srcw <= 0 || srch <= 0 || dstw <= 0 || dsth <= 0 || depth <= 0) {
        return;
    }

    /* destination pixels are produced in storage order */
    dst_px = dst;
    for (h = 0; h < dsth; h++) {
        /* the source row only depends on h, so compute it once per row */
        sy = (size_t)((long long)h * srch / dsth);
        for (w = 0; w < dstw; w++) {
            sx = (size_t)((long long)w * srcw / dstw);
            src_px = src + (sy * (size_t)srcw + sx) * (size_t)depth;
            for (i = 0; i < depth; i++) {
                *dst_px++ = src_px[i];
            }
        }
    }
}
615

616
/*
 * Nearest-sample scaling of an interleaved float image.
 *
 * dst    output buffer, dstw * dsth pixels of `depth` floats each
 * src    input buffer, srcw * srch pixels of `depth` floats each
 * depth  floats per pixel
 *
 * Destination pixel (w, h) is copied from source pixel
 * (w * srcw / dstw, h * srch / dsth), using integer division.
 *
 * The coordinate products are evaluated in `long long` and the element
 * offsets in `size_t`: `long` is only 32 bits wide on LLP64 targets and
 * an `int` element offset overflows on very large images.
 * Nothing is written when any dimension or the depth is not positive.
 */
static void
scale_without_resampling_float32(
    float *dst,
    float const *src,
    int const srcw,
    int const srch,
    int const dstw,
    int const dsth,
    int const depth)
{
    int w;
    int h;
    int i;
    size_t sx;
    size_t sy;
    float const *src_px;
    float *dst_px;

    if (srcw <= 0 || srch <= 0 || dstw <= 0 || dsth <= 0 || depth <= 0) {
        return;
    }

    /* destination pixels are produced in storage order */
    dst_px = dst;
    for (h = 0; h < dsth; h++) {
        /* the source row only depends on h, so compute it once per row */
        sy = (size_t)((long long)h * srch / dsth);
        for (w = 0; w < dstw; w++) {
            sx = (size_t)((long long)w * srcw / dstw);
            src_px = src + (sy * (size_t)srcw + sx) * (size_t)depth;
            for (i = 0; i < depth; i++) {
                *dst_px++ = src_px[i];
            }
        }
    }
}
644

645

646
typedef double (*resample_fn_t)(double const d);
647

648
/*
649
 * Two-pass separable filter helpers. Each function processes a single row so
650
 * the caller may invoke them serially or from a threadpool worker.
651
 */
652
static void
653
scale_horizontal_row(
31,950✔
654
    unsigned char *tmp,
655
    unsigned char const *src,
656
    int const srcw,
657
    int const dstw,
658
    int const depth,
659
    int const y,
660
    resample_fn_t const f_resample,
661
    double const n,
662
    int const simd_level)
663
{
664
    int w;
665
    int x;
666
    int i;
667
    int pos;
668
    int x_first;
669
    int x_last;
670
    double center_x;
671
    double diff_x;
672
    double weight;
673
    double total;
674
    double offsets[8];
675
#if defined(SIXEL_USE_AVX512)
676
    __m512 acc512;
677
#endif
678
#if defined(SIXEL_USE_AVX2) || defined(SIXEL_USE_AVX)
679
    __m256 acc256;
680
#endif
681
#if defined(SIXEL_USE_SSE2)
682
    __m128 acc128;
683
    __m128 minv128;
684
    __m128 maxv128;
685
    __m128 scalev128;
686
    __m128 wv128;
687
    __m128 pixf128;
688
    __m128i pixi128;
689
    __m128i acci128;
690
    __m128i acc16_128;
691
    unsigned int pixel128;
692
#endif
693
#if defined(SIXEL_USE_NEON)
694
    float32x4_t acc_neon;
695
    float32x4_t minv_neon;
696
    float32x4_t maxv_neon;
697
    float32x4_t scalev_neon;
698
    float32x4_t wv_neon;
699
    float32x4_t pixf_neon;
700
    uint32x4_t pix32_neon;
701
    uint32x4_t acci_neon;
702
    uint16x4_t acc16_neon;
703
    uint8x8_t acc8_neon;
704
    uint8_t outb_neon[8];
705
#endif
706

707
    for (w = 0; w < dstw; w++) {
5,662,350✔
708
        total = 0.0;
5,630,400✔
709
        for (i = 0; i < depth; i++) {
22,521,600✔
710
            offsets[i] = 0;
16,891,200✔
711
        }
5,630,400✔
712

713
        if (dstw >= srcw) {
5,630,400✔
714
            center_x = (w + 0.5) * srcw / dstw;
576,000✔
715
            x_first = MAX(center_x - n, 0);
576,000✔
716
            x_last = MIN(center_x + n, srcw - 1);
576,000✔
717
        } else {
192,000✔
718
            center_x = w + 0.5;
5,054,400✔
719
            x_first = MAX(floor((center_x - n) * srcw / dstw), 0);
5,054,400✔
720
            x_last = MIN(floor((center_x + n) * srcw / dstw), srcw - 1);
5,054,400✔
721
        }
722

723
#if defined(SIXEL_USE_AVX512)
724
        if (depth == 3 && simd_level >= SIXEL_SIMD_LEVEL_AVX512) {
3,753,600!
725
            acc512 = sixel_avx512_zero_ps();
726

727
            for (x = x_first; x <= x_last; x++) {
×
728
                diff_x = (dstw >= srcw)
729
                             ? (x + 0.5) - center_x
730
                             : (x + 0.5) * dstw / srcw - center_x;
×
731
                weight = f_resample(fabs(diff_x));
732
                pos = (y * srcw + x) * depth;
733
                acc512 = sixel_avx512_muladd_ps(
734
                    acc512,
735
                    sixel_avx512_load_rgb_ps(src + pos),
736
                    (float)weight);
737
                total += weight;
738
            }
739
            if (total > 0.0) {
×
740
                pos = (y * dstw + w) * depth;
741
                sixel_avx512_store_rgb_u8(acc512, total, tmp + pos);
742
            }
743
            continue;
744
        }
745
#endif
746
#if defined(SIXEL_USE_AVX2)
747
        if (depth == 3 && simd_level >= SIXEL_SIMD_LEVEL_AVX2) {
3,753,600!
748
            acc256 = sixel_avx2_zero_ps();
3,753,600✔
749

750
            for (x = x_first; x <= x_last; x++) {
41,525,100✔
751
                diff_x = (dstw >= srcw)
37,771,500✔
752
                             ? (x + 0.5) - center_x
1,149,600✔
753
                             : (x + 0.5) * dstw / srcw - center_x;
37,771,500✔
754
                weight = f_resample(fabs(diff_x));
37,771,500✔
755
                pos = (y * srcw + x) * depth;
37,771,500✔
756
                acc256 = sixel_avx2_muladd_ps(
37,771,500✔
757
                    acc256,
758
                    sixel_avx2_load_rgb_ps(src + pos),
759
                    (float)weight);
760
                total += weight;
37,771,500✔
761
            }
762
            if (total > 0.0) {
3,753,600!
763
                pos = (y * dstw + w) * depth;
3,753,600✔
764
                sixel_avx2_store_rgb_u8(acc256, total, tmp + pos);
3,753,600✔
765
            }
766
            continue;
3,753,600✔
767
        }
768
#endif
769
#if defined(SIXEL_USE_AVX)
770
        if (depth == 3 && simd_level >= SIXEL_SIMD_LEVEL_AVX) {
×
771
            acc256 = sixel_avx_zero_ps();
772

773
            for (x = x_first; x <= x_last; x++) {
×
774
                diff_x = (dstw >= srcw)
775
                             ? (x + 0.5) - center_x
776
                             : (x + 0.5) * dstw / srcw - center_x;
×
777
                weight = f_resample(fabs(diff_x));
778
                pos = (y * srcw + x) * depth;
779
                acc256 = sixel_avx_muladd_ps(
780
                    acc256,
781
                    sixel_avx_load_rgb_ps(src + pos),
782
                    (float)weight);
783
                total += weight;
784
            }
785
            if (total > 0.0) {
×
786
                pos = (y * dstw + w) * depth;
787
                sixel_avx_store_rgb_u8(acc256, total, tmp + pos);
788
            }
789
            continue;
790
        }
791
#endif
792
#if defined(SIXEL_USE_SSE2) || defined(SIXEL_USE_NEON)
793
        if (depth == 3
1,876,800!
794
# if defined(SIXEL_USE_SSE2)
795
            && simd_level >= SIXEL_SIMD_LEVEL_SSE2
×
796
# elif defined(SIXEL_USE_NEON)
797
            && simd_level >= SIXEL_SIMD_LEVEL_NEON
1,876,800!
798
# endif
799
            ) {
800
#if defined(SIXEL_USE_SSE2)
801
            acc128 = _mm_setzero_ps();
802
#elif defined(SIXEL_USE_NEON)
803
            acc_neon = vdupq_n_f32(0.0f);
1,876,800✔
804
#endif
805
            for (x = x_first; x <= x_last; x++) {
20,762,550!
806
                diff_x = (dstw >= srcw)
18,885,750✔
807
                             ? (x + 0.5) - center_x
574,800✔
808
                             : (x + 0.5) * dstw / srcw - center_x;
18,310,950!
809
                weight = f_resample(fabs(diff_x));
18,885,750✔
810
                pos = (y * srcw + x) * depth;
18,885,750✔
811
                const unsigned char *psrc = src + pos;
18,885,750✔
812
#if defined(SIXEL_USE_SSE2)
813
                pixel128 = psrc[0] | (psrc[1] << 8) | (psrc[2] << 16);
814
                pixi128 = _mm_cvtsi32_si128((int)pixel128);
815
                pixi128 = _mm_unpacklo_epi8(pixi128, _mm_setzero_si128());
816
                pixi128 = _mm_unpacklo_epi16(pixi128, _mm_setzero_si128());
817
                pixf128 = _mm_cvtepi32_ps(pixi128);
818
                wv128 = _mm_set1_ps((float)weight);
819
                acc128 = _mm_add_ps(acc128, _mm_mul_ps(pixf128, wv128));
820
#else /* NEON */
821
                pix32_neon = (uint32x4_t){psrc[0], psrc[1], psrc[2], 0};
18,885,750✔
822
                pixf_neon = vcvtq_f32_u32(pix32_neon);
18,885,750✔
823
                wv_neon = vdupq_n_f32((float)weight);
18,885,750✔
824
                acc_neon = vmlaq_f32(acc_neon, pixf_neon, wv_neon);
18,885,750✔
825
#endif
826
                total += weight;
18,885,750✔
827
            }
18,885,750✔
828
            if (total > 0.0) {
1,876,800!
829
#if defined(SIXEL_USE_SSE2)
830
                scalev128 = _mm_set1_ps((float)(1.0 / total));
831
                acc128 = _mm_mul_ps(acc128, scalev128);
832
                minv128 = _mm_set1_ps(0.0f);
833
                maxv128 = _mm_set1_ps(255.0f);
834
                acc128 = _mm_max_ps(minv128, _mm_min_ps(acc128, maxv128));
835
                acci128 = _mm_cvtps_epi32(acc128);
836
                acc16_128 = _mm_packs_epi32(acci128, _mm_setzero_si128());
837
                acc16_128 = _mm_packus_epi16(acc16_128, _mm_setzero_si128());
838
                pos = (y * dstw + w) * depth;
839
                pixel128 = (unsigned int)_mm_cvtsi128_si32(acc16_128);
840
                tmp[pos + 0] = (unsigned char)pixel128;
841
                tmp[pos + 1] = (unsigned char)(pixel128 >> 8);
842
                tmp[pos + 2] = (unsigned char)(pixel128 >> 16);
843
#else /* NEON */
844
                scalev_neon = vdupq_n_f32((float)(1.0 / total));
1,876,800✔
845
                acc_neon = vmulq_f32(acc_neon, scalev_neon);
1,876,800✔
846
                minv_neon = vdupq_n_f32(0.0f);
1,876,800✔
847
                maxv_neon = vdupq_n_f32(255.0f);
1,876,800✔
848
                acc_neon = vmaxq_f32(minv_neon,
3,753,600✔
849
                                     vminq_f32(acc_neon, maxv_neon));
1,876,800✔
850
                acci_neon = vcvtq_u32_f32(acc_neon);
1,876,800✔
851
                acc16_neon = vmovn_u32(acci_neon);
1,876,800✔
852
                acc8_neon = vmovn_u16(vcombine_u16(acc16_neon, acc16_neon));
1,876,800✔
853

854
                vst1_u8(outb_neon, acc8_neon);
1,876,800✔
855
                pos = (y * dstw + w) * depth;
1,876,800✔
856
                tmp[pos + 0] = outb_neon[0];
1,876,800✔
857
                tmp[pos + 1] = outb_neon[1];
1,876,800✔
858
                tmp[pos + 2] = outb_neon[2];
1,876,800✔
859
#endif
860
            }
1,876,800✔
861
            continue;
1,876,800✔
862
        }
863
#endif /* SIMD paths */
864

865
        for (x = x_first; x <= x_last; x++) {
×
866
            diff_x = (dstw >= srcw)
×
867
                         ? (x + 0.5) - center_x
×
868
                         : (x + 0.5) * dstw / srcw - center_x;
×
869
            weight = f_resample(fabs(diff_x));
×
870
            for (i = 0; i < depth; i++) {
×
871
                pos = (y * srcw + x) * depth + i;
×
872
                offsets[i] += src[pos] * weight;
×
873
            }
874
            total += weight;
×
875
        }
876

877
        if (total > 0.0) {
×
878
            for (i = 0; i < depth; i++) {
×
879
                pos = (y * dstw + w) * depth + i;
×
880
                tmp[pos] = normalize(offsets[i], total);
×
881
            }
882
        }
883
    }
884
}
31,950✔
885

886
static void
887
scale_vertical_row(
11,967✔
888
    unsigned char *dst,
889
    unsigned char const *tmp,
890
    int const dstw,
891
    int const dsth,
892
    int const depth,
893
    int const srch,
894
    int const h,
895
    resample_fn_t const f_resample,
896
    double const n,
897
    int const simd_level)
898
{
899
    int w;
900
    int y;
901
    int i;
902
    int pos;
903
    int y_first;
904
    int y_last;
905
    double center_y;
906
    double diff_y;
907
    double weight;
908
    double total;
909
    double offsets[8];
910
#if defined(SIXEL_USE_AVX512)
911
    __m512 acc512;
912
#endif
913
#if defined(SIXEL_USE_AVX2) || defined(SIXEL_USE_AVX)
914
    __m256 acc256;
915
#endif
916
#if defined(SIXEL_USE_SSE2)
917
    __m128 acc128;
918
    __m128 minv128;
919
    __m128 maxv128;
920
    __m128 scalev128;
921
    __m128 wv128;
922
    __m128 pixf128;
923
    __m128i pixi128;
924
    __m128i acci128;
925
    __m128i acc16_128;
926
    unsigned int pixel128;
927
#endif
928
#if defined(SIXEL_USE_NEON)
929
    float32x4_t acc_neon;
930
    float32x4_t minv_neon;
931
    float32x4_t maxv_neon;
932
    float32x4_t scalev_neon;
933
    float32x4_t wv_neon;
934
    float32x4_t pixf_neon;
935
    uint32x4_t pix32_neon;
936
    uint32x4_t acci_neon;
937
    uint16x4_t acc16_neon;
938
    uint8x8_t acc8_neon;
939
    uint8_t outb_neon[8];
940
#endif
941

942
    for (w = 0; w < dstw; w++) {
3,205,335✔
943
        total = 0.0;
3,193,368✔
944
        for (i = 0; i < depth; i++) {
12,773,472✔
945
            offsets[i] = 0;
9,580,104✔
946
        }
3,193,368✔
947

948
        if (dsth >= srch) {
3,193,368✔
949
            center_y = (h + 0.5) * srch / dsth;
1,759,500✔
950
            y_first = MAX(center_y - n, 0);
1,759,500✔
951
            y_last = MIN(center_y + n, srch - 1);
1,759,500✔
952
        } else {
586,500✔
953
            center_y = h + 0.5;
1,433,868✔
954
            y_first = MAX(floor((center_y - n) * srch / dsth), 0);
1,433,868✔
955
            y_last = MIN(floor((center_y + n) * srch / dsth), srch - 1);
1,433,868✔
956
        }
957

958
#if defined(SIXEL_USE_AVX512)
959
        if (depth == 3 && simd_level >= SIXEL_SIMD_LEVEL_AVX512) {
2,128,912!
960
            acc512 = sixel_avx512_zero_ps();
961

962
            for (y = y_first; y <= y_last; y++) {
×
963
                diff_y = (dsth >= srch)
964
                             ? (y + 0.5) - center_y
965
                             : (y + 0.5) * dsth / srch - center_y;
×
966
                weight = f_resample(fabs(diff_y));
967
                pos = (y * dstw + w) * depth;
968
                acc512 = sixel_avx512_muladd_ps(
969
                    acc512,
970
                    sixel_avx512_load_rgb_ps(tmp + pos),
971
                    (float)weight);
972
                total += weight;
973
            }
974
            if (total > 0.0) {
×
975
                pos = (h * dstw + w) * depth;
976
                sixel_avx512_store_rgb_u8(acc512, total, dst + pos);
977
            }
978
            continue;
979
        }
980
#endif
981
#if defined(SIXEL_USE_AVX2)
982
        if (depth == 3 && simd_level >= SIXEL_SIMD_LEVEL_AVX2) {
2,128,912!
983
            acc256 = sixel_avx2_zero_ps();
2,128,912✔
984

985
            for (y = y_first; y <= y_last; y++) {
15,094,968✔
986
                diff_y = (dsth >= srch)
12,966,056✔
987
                             ? (y + 0.5) - center_y
3,505,040✔
988
                             : (y + 0.5) * dsth / srch - center_y;
12,966,056✔
989
                weight = f_resample(fabs(diff_y));
12,966,056✔
990
                pos = (y * dstw + w) * depth;
12,966,056✔
991
                acc256 = sixel_avx2_muladd_ps(
12,966,056✔
992
                    acc256,
993
                    sixel_avx2_load_rgb_ps(tmp + pos),
994
                    (float)weight);
995
                total += weight;
12,966,056✔
996
            }
997
            if (total > 0.0) {
2,128,912!
998
                pos = (h * dstw + w) * depth;
2,128,912✔
999
                sixel_avx2_store_rgb_u8(acc256, total, dst + pos);
2,128,912✔
1000
            }
1001
            continue;
2,128,912✔
1002
        }
1003
#endif
1004
#if defined(SIXEL_USE_AVX)
1005
        if (depth == 3 && simd_level >= SIXEL_SIMD_LEVEL_AVX) {
×
1006
            acc256 = sixel_avx_zero_ps();
1007

1008
            for (y = y_first; y <= y_last; y++) {
×
1009
                diff_y = (dsth >= srch)
1010
                             ? (y + 0.5) - center_y
1011
                             : (y + 0.5) * dsth / srch - center_y;
×
1012
                weight = f_resample(fabs(diff_y));
1013
                pos = (y * dstw + w) * depth;
1014
                acc256 = sixel_avx_muladd_ps(
1015
                    acc256,
1016
                    sixel_avx_load_rgb_ps(tmp + pos),
1017
                    (float)weight);
1018
                total += weight;
1019
            }
1020
            if (total > 0.0) {
×
1021
                pos = (h * dstw + w) * depth;
1022
                sixel_avx_store_rgb_u8(acc256, total, dst + pos);
1023
            }
1024
            continue;
1025
        }
1026
#endif
1027
#if defined(SIXEL_USE_SSE2) || defined(SIXEL_USE_NEON)
1028
        if (depth == 3
1,064,456!
1029
# if defined(SIXEL_USE_SSE2)
1030
            && simd_level >= SIXEL_SIMD_LEVEL_SSE2
×
1031
# elif defined(SIXEL_USE_NEON)
1032
            && simd_level >= SIXEL_SIMD_LEVEL_NEON
1,064,456!
1033
# endif
1034
            ) {
1035
#if defined(SIXEL_USE_SSE2)
1036
            acc128 = _mm_setzero_ps();
1037
#elif defined(SIXEL_USE_NEON)
1038
            acc_neon = vdupq_n_f32(0.0f);
1,064,456✔
1039
#endif
1040
            for (y = y_first; y <= y_last; y++) {
7,547,484!
1041
                diff_y = (dsth >= srch)
6,483,028✔
1042
                             ? (y + 0.5) - center_y
1,752,520✔
1043
                             : (y + 0.5) * dsth / srch - center_y;
4,730,508!
1044
                weight = f_resample(fabs(diff_y));
6,483,028✔
1045
                pos = (y * dstw + w) * depth;
6,483,028✔
1046
                const unsigned char *psrc = tmp + pos;
6,483,028✔
1047
#if defined(SIXEL_USE_SSE2)
1048
                pixel128 = psrc[0] | (psrc[1] << 8) | (psrc[2] << 16);
1049
                pixi128 = _mm_cvtsi32_si128((int)pixel128);
1050
                pixi128 = _mm_unpacklo_epi8(pixi128, _mm_setzero_si128());
1051
                pixi128 = _mm_unpacklo_epi16(pixi128, _mm_setzero_si128());
1052
                pixf128 = _mm_cvtepi32_ps(pixi128);
1053
                wv128 = _mm_set1_ps((float)weight);
1054
                acc128 = _mm_add_ps(acc128, _mm_mul_ps(pixf128, wv128));
1055
#else /* NEON */
1056
                pix32_neon = (uint32x4_t){psrc[0], psrc[1], psrc[2], 0};
6,483,028✔
1057
                pixf_neon = vcvtq_f32_u32(pix32_neon);
6,483,028✔
1058
                wv_neon = vdupq_n_f32((float)weight);
6,483,028✔
1059
                acc_neon = vmlaq_f32(acc_neon, pixf_neon, wv_neon);
6,483,028✔
1060
#endif
1061
                total += weight;
6,483,028✔
1062
            }
6,483,028✔
1063
            if (total > 0.0) {
1,064,456!
1064
#if defined(SIXEL_USE_SSE2)
1065
                scalev128 = _mm_set1_ps((float)(1.0 / total));
1066
                acc128 = _mm_mul_ps(acc128, scalev128);
1067
                minv128 = _mm_set1_ps(0.0f);
1068
                maxv128 = _mm_set1_ps(255.0f);
1069
                acc128 = _mm_max_ps(minv128, _mm_min_ps(acc128, maxv128));
1070
                acci128 = _mm_cvtps_epi32(acc128);
1071
                acc16_128 = _mm_packs_epi32(acci128, _mm_setzero_si128());
1072
                acc16_128 = _mm_packus_epi16(acc16_128, _mm_setzero_si128());
1073
                pos = (h * dstw + w) * depth;
1074
                pixel128 = (unsigned int)_mm_cvtsi128_si32(acc16_128);
1075
                dst[pos + 0] = (unsigned char)pixel128;
1076
                dst[pos + 1] = (unsigned char)(pixel128 >> 8);
1077
                dst[pos + 2] = (unsigned char)(pixel128 >> 16);
1078
#else /* NEON */
1079
                scalev_neon = vdupq_n_f32((float)(1.0 / total));
1,064,456✔
1080
                acc_neon = vmulq_f32(acc_neon, scalev_neon);
1,064,456✔
1081
                minv_neon = vdupq_n_f32(0.0f);
1,064,456✔
1082
                maxv_neon = vdupq_n_f32(255.0f);
1,064,456✔
1083
                acc_neon = vmaxq_f32(minv_neon,
2,128,912✔
1084
                                     vminq_f32(acc_neon, maxv_neon));
1,064,456✔
1085
                acci_neon = vcvtq_u32_f32(acc_neon);
1,064,456✔
1086
                acc16_neon = vmovn_u32(acci_neon);
1,064,456✔
1087
                acc8_neon = vmovn_u16(vcombine_u16(acc16_neon, acc16_neon));
1,064,456✔
1088

1089
                vst1_u8(outb_neon, acc8_neon);
1,064,456✔
1090
                pos = (h * dstw + w) * depth;
1,064,456✔
1091
                dst[pos + 0] = outb_neon[0];
1,064,456✔
1092
                dst[pos + 1] = outb_neon[1];
1,064,456✔
1093
                dst[pos + 2] = outb_neon[2];
1,064,456✔
1094
#endif
1095
            }
1,064,456✔
1096
            continue;
1,064,456✔
1097
        }
1098
#endif /* SIMD paths */
1099
        for (y = y_first; y <= y_last; y++) {
×
1100
            diff_y = (dsth >= srch)
×
1101
                         ? (y + 0.5) - center_y
×
1102
                         : (y + 0.5) * dsth / srch - center_y;
×
1103
            weight = f_resample(fabs(diff_y));
×
1104
            for (i = 0; i < depth; i++) {
×
1105
                pos = (y * dstw + w) * depth + i;
×
1106
                offsets[i] += tmp[pos] * weight;
×
1107
            }
1108
            total += weight;
×
1109
        }
1110

1111
        if (total > 0.0) {
×
1112
            for (i = 0; i < depth; i++) {
×
1113
                pos = (h * dstw + w) * depth + i;
×
1114
                dst[pos] = normalize(offsets[i], total);
×
1115
            }
1116
        }
1117
    }
1118
}
11,967✔
1119

1120
static void
1121
scale_with_resampling_serial(
75✔
1122
    unsigned char *dst,
1123
    unsigned char const *src,
1124
    int const srcw,
1125
    int const srch,
1126
    int const dstw,
1127
    int const dsth,
1128
    int const depth,
1129
    resample_fn_t const f_resample,
1130
    double const n,
1131
    unsigned char *tmp)
1132
{
1133
    int y;
1134
    int h;
1135
    int simd_level;
1136

1137
    simd_level = sixel_scale_simd_level();
75✔
1138

1139
    for (y = 0; y < srch; y++) {
32,025✔
1140
        scale_horizontal_row(tmp,
42,600✔
1141
                             src,
10,650✔
1142
                             srcw,
10,650✔
1143
                             dstw,
10,650✔
1144
                             depth,
10,650✔
1145
                             y,
10,650✔
1146
                             f_resample,
10,650✔
1147
                             n,
10,650✔
1148
                             simd_level);
10,650✔
1149
    }
10,650✔
1150

1151
    for (h = 0; h < dsth; h++) {
12,042✔
1152
        scale_vertical_row(dst,
15,956✔
1153
                           tmp,
3,989✔
1154
                           dstw,
3,989✔
1155
                           dsth,
3,989✔
1156
                           depth,
3,989✔
1157
                           srch,
3,989✔
1158
                           h,
3,989✔
1159
                           f_resample,
3,989✔
1160
                           n,
3,989✔
1161
                           simd_level);
3,989✔
1162
    }
3,989✔
1163
}
75✔
1164

1165
#if SIXEL_ENABLE_THREADS
1166
/*
 * Selects which row filter a worker job runs. The controller stores the
 * current pass in the shared context before it creates each pool.
 */
typedef enum scale_parallel_pass {
    SCALE_PASS_HORIZONTAL,  /* == 0: job index is a source row */
    SCALE_PASS_VERTICAL     /* == 1: job index is a destination row */
} scale_parallel_pass_t;
1170

1171
typedef struct scale_parallel_context {
1172
    unsigned char *dst;
1173
    unsigned char const *src;
1174
    unsigned char *tmp;
1175
    int srcw;
1176
    int srch;
1177
    int dstw;
1178
    int dsth;
1179
    int depth;
1180
    resample_fn_t f_resample;
1181
    double n;
1182
    scale_parallel_pass_t pass;
1183
    int simd_level;
1184
    sixel_logger_t *logger;
1185
} scale_parallel_context_t;
1186

1187
/*
1188
 * Emit worker-level timeline entries only for the first and last rows so the
1189
 * visualization remains readable even when there are many jobs.
1190
 */
1191
static int
NEW
1192
scale_parallel_should_log(scale_parallel_context_t const *ctx, int index)
×
1193
{
1194
    int last;
1195

NEW
1196
    if (ctx == NULL || ctx->logger == NULL || !ctx->logger->active) {
×
NEW
1197
        return 0;
×
1198
    }
1199

NEW
1200
    if (index < 0) {
×
NEW
1201
        return 0;
×
1202
    }
1203

NEW
1204
    last = 0;
×
NEW
1205
    if (ctx->pass == SCALE_PASS_HORIZONTAL) {
×
NEW
1206
        if (ctx->srch <= 0) {
×
NEW
1207
            return 0;
×
1208
        }
NEW
1209
        last = ctx->srch - 1;
×
1210
    } else {
NEW
1211
        if (ctx->dsth <= 0) {
×
NEW
1212
            return 0;
×
1213
        }
NEW
1214
        last = ctx->dsth - 1;
×
1215
    }
1216

NEW
1217
    return index == 0 || index == last;
×
1218
}
1219

1220
/*
 * Return the minimum source size, in bytes, required before the parallel
 * scaler is attempted.
 *
 * The value comes from the SIXEL_SCALE_PARALLEL_MIN_BYTES environment
 * variable, which is read and parsed on the first call only; later calls
 * return the cached result. The default is zero, which never blocks the
 * parallel path. Unset, empty, malformed, out-of-range or negative values
 * all leave the default in place. Values larger than SIZE_MAX saturate to
 * SIZE_MAX.
 *
 * NOTE: the cache is a pair of plain statics without synchronization. The
 * only caller visible here is the controller in
 * scale_with_resampling_parallel().
 */
static size_t
scale_parallel_min_bytes(void)
{
    static int initialized = 0;
    static size_t threshold = 0;
    char const *text;
    char const *cursor;
    char *endptr;
    unsigned long long parsed;

    if (initialized) {
        return threshold;
    }

    initialized = 1;
    text = getenv("SIXEL_SCALE_PARALLEL_MIN_BYTES");
    if (text == NULL || text[0] == '\0') {
        return threshold;
    }

    /*
     * strtoull() accepts a leading minus sign and returns the negated value
     * converted to unsigned, so "-1" would become the largest possible
     * threshold and silently disable the parallel path. A fully parsed
     * decimal string can only contain '-' as that sign, so reject it here.
     */
    for (cursor = text; *cursor != '\0'; cursor++) {
        if (*cursor == '-') {
            return threshold;
        }
    }

    errno = 0;
    endptr = NULL;
    parsed = strtoull(text, &endptr, 10);
    if (endptr == NULL || endptr == text || *endptr != '\0'
        || errno == ERANGE) {
        return threshold;
    }

    if (parsed > (unsigned long long)SIZE_MAX) {
        threshold = SIZE_MAX;
    } else {
        threshold = (size_t)parsed;
    }

    return threshold;
}
25✔
1259

1260
static int
1261
scale_parallel_worker(tp_job_t job, void *userdata, void *workspace)
×
1262
{
1263
    scale_parallel_context_t *ctx;
1264
    int index;
1265
    char const *role;
1266
    int y0;
1267
    int y1;
1268
    int in0;
1269
    int in1;
1270

1271
    (void)workspace;
1272
    ctx = (scale_parallel_context_t *)userdata;
×
1273
    if (ctx == NULL) {
×
1274
        return SIXEL_BAD_ARGUMENT;
×
1275
    }
1276

NEW
1277
    role = "horizontal";
×
NEW
1278
    y0 = 0;
×
NEW
1279
    y1 = 0;
×
NEW
1280
    in0 = 0;
×
NEW
1281
    in1 = 0;
×
1282
    index = job.band_index;
×
1283
    if (ctx->pass == SCALE_PASS_HORIZONTAL) {
×
1284
        if (index < 0 || index >= ctx->srch) {
×
1285
            return SIXEL_BAD_ARGUMENT;
×
1286
        }
NEW
1287
        y0 = index;
×
NEW
1288
        y1 = index + 1;
×
NEW
1289
        in1 = ctx->dstw;
×
NEW
1290
        if (scale_parallel_should_log(ctx, index)) {
×
NEW
1291
            sixel_logger_logf(ctx->logger,
×
1292
                              role,
1293
                              "scale",
1294
                              "start",
1295
                              index,
1296
                              index,
1297
                              y0,
1298
                              y1,
1299
                              in0,
1300
                              in1,
1301
                              "horizontal pass");
1302
        }
UNCOV
1303
        scale_horizontal_row(ctx->tmp,
×
1304
                             ctx->src,
1305
                             ctx->srcw,
1306
                             ctx->dstw,
1307
                             ctx->depth,
1308
                             index,
1309
                             ctx->f_resample,
1310
                             ctx->n,
1311
                             ctx->simd_level);
1312
    } else {
1313
        if (index < 0 || index >= ctx->dsth) {
×
1314
            return SIXEL_BAD_ARGUMENT;
×
1315
        }
NEW
1316
        role = "vertical";
×
NEW
1317
        y0 = index;
×
NEW
1318
        y1 = index + 1;
×
NEW
1319
        in1 = ctx->srch;
×
NEW
1320
        if (scale_parallel_should_log(ctx, index)) {
×
NEW
1321
            sixel_logger_logf(ctx->logger,
×
1322
                              role,
1323
                              "scale",
1324
                              "start",
1325
                              index,
1326
                              index,
1327
                              y0,
1328
                              y1,
1329
                              in0,
1330
                              in1,
1331
                              "vertical pass");
1332
        }
1333
        scale_vertical_row(ctx->dst,
×
1334
                           ctx->tmp,
×
1335
                           ctx->dstw,
1336
                           ctx->dsth,
1337
                           ctx->depth,
1338
                           ctx->srch,
1339
                           index,
1340
                           ctx->f_resample,
1341
                           ctx->n,
1342
                           ctx->simd_level);
1343
    }
1344

NEW
1345
    if (scale_parallel_should_log(ctx, index)) {
×
NEW
1346
        sixel_logger_logf(ctx->logger,
×
1347
                          role,
1348
                          "scale",
1349
                          "finish",
1350
                          index,
1351
                          index,
1352
                          y0,
1353
                          y1,
1354
                          in0,
1355
                          in1,
1356
                          "pass complete");
1357
    }
1358

UNCOV
1359
    return SIXEL_OK;
×
1360
}
1361

1362
/*
1363
 * Parallel path mirrors the encoder and dither thread selection through
1364
 * sixel_threads_resolve(). Rows become individual jobs for both passes so the
1365
 * caller can saturate the threadpool without altering the filtering math.
1366
 */
1367
static int
1368
scale_with_resampling_parallel(
75✔
1369
    unsigned char *dst,
1370
    unsigned char const *src,
1371
    int const srcw,
1372
    int const srch,
1373
    int const dstw,
1374
    int const dsth,
1375
    int const depth,
1376
    resample_fn_t const f_resample,
1377
    double const n,
1378
    unsigned char *tmp,
1379
    sixel_logger_t *logger)
1380
{
1381
    scale_parallel_context_t ctx;
1382
    threadpool_t *pool;
1383
    tp_job_t job;
1384
    size_t image_bytes;
1385
    int threads;
1386
    int queue_depth;
1387
    int y;
1388
    int rc;
1389
    int logger_ready;
1390

1391
    image_bytes = (size_t)srcw * (size_t)srch * (size_t)depth;
75✔
1392
    if (image_bytes < scale_parallel_min_bytes()) {
75!
NEW
1393
        if (logger != NULL) {
×
NEW
1394
            sixel_logger_logf(logger,
×
1395
                              "controller",
1396
                              "scale",
1397
                              "skip",
1398
                              -1,
1399
                              -1,
1400
                              0,
1401
                              0,
1402
                              0,
1403
                              0,
1404
                              "below threshold bytes=%zu",
1405
                              image_bytes);
1406
        }
UNCOV
1407
        return SIXEL_BAD_ARGUMENT;
×
1408
    }
1409

1410
    threads = sixel_threads_resolve();
75✔
1411
    if (threads < 2) {
75!
1412
        if (logger != NULL) {
75!
NEW
1413
            sixel_logger_logf(logger,
×
1414
                              "controller",
1415
                              "scale",
1416
                              "skip",
1417
                              -1,
1418
                              -1,
1419
                              0,
1420
                              0,
1421
                              0,
1422
                              0,
1423
                              "threads=%d",
1424
                              threads);
1425
        }
1426
        return SIXEL_BAD_ARGUMENT;
75✔
1427
    }
1428

NEW
1429
    logger_ready = logger != NULL && logger->active;
×
NEW
1430
    if (logger_ready) {
×
NEW
1431
        sixel_logger_logf(logger,
×
1432
                          "controller",
1433
                          "scale",
1434
                          "start",
1435
                          -1,
1436
                          -1,
1437
                          0,
1438
                          srch,
1439
                          0,
1440
                          dsth,
1441
                          "parallel scale src=%dx%d dst=%dx%d",
1442
                          srcw,
1443
                          srch,
1444
                          dstw,
1445
                          dsth);
1446
    }
1447

1448
    ctx.dst = dst;
×
1449
    ctx.src = src;
×
1450
    ctx.tmp = tmp;
×
1451
    ctx.srcw = srcw;
×
1452
    ctx.srch = srch;
×
1453
    ctx.dstw = dstw;
×
1454
    ctx.dsth = dsth;
×
1455
    ctx.depth = depth;
×
1456
    ctx.f_resample = f_resample;
×
1457
    ctx.n = n;
×
1458
    ctx.simd_level = sixel_scale_simd_level();
×
NEW
1459
    ctx.logger = logger_ready ? logger : NULL;
×
1460

1461
    queue_depth = threads * 3;
×
1462
    if (queue_depth > srch) {
×
1463
        queue_depth = srch;
×
1464
    }
1465
    if (queue_depth < 1) {
×
1466
        queue_depth = 1;
×
1467
    }
1468

1469
    ctx.pass = SCALE_PASS_HORIZONTAL;
×
NEW
1470
    if (logger_ready) {
×
NEW
1471
        sixel_logger_logf(logger,
×
1472
                          "controller",
1473
                          "scale",
1474
                          "pass_start",
1475
                          -1,
1476
                          0,
1477
                          0,
1478
                          srch,
1479
                          0,
1480
                          ctx.dstw,
1481
                          "horizontal queue=%d threads=%d",
1482
                          queue_depth,
1483
                          threads);
1484
    }
UNCOV
1485
    pool = threadpool_create(threads,
×
1486
                             queue_depth,
1487
                             0,
1488
                             scale_parallel_worker,
1489
                             &ctx);
1490
    if (pool == NULL) {
×
1491
        return SIXEL_BAD_ALLOCATION;
×
1492
    }
1493

1494
    for (y = 0; y < srch; y++) {
×
1495
        job.band_index = y;
×
1496
        threadpool_push(pool, job);
×
1497
    }
1498
    threadpool_finish(pool);
×
1499
    rc = threadpool_get_error(pool);
×
1500
    threadpool_destroy(pool);
×
1501
    if (rc != SIXEL_OK) {
×
1502
        return rc;
×
1503
    }
1504

NEW
1505
    if (logger_ready) {
×
NEW
1506
        sixel_logger_logf(logger,
×
1507
                          "controller",
1508
                          "scale",
1509
                          "pass_finish",
1510
                          -1,
1511
                          srch - 1,
1512
                          0,
1513
                          srch,
1514
                          0,
1515
                          ctx.dstw,
1516
                          "horizontal complete");
1517
    }
1518

1519
    queue_depth = threads * 3;
×
1520
    if (queue_depth > dsth) {
×
1521
        queue_depth = dsth;
×
1522
    }
1523
    if (queue_depth < 1) {
×
1524
        queue_depth = 1;
×
1525
    }
1526

1527
    ctx.pass = SCALE_PASS_VERTICAL;
×
NEW
1528
    if (logger_ready) {
×
NEW
1529
        sixel_logger_logf(logger,
×
1530
                          "controller",
1531
                          "scale",
1532
                          "pass_start",
1533
                          -1,
1534
                          0,
1535
                          0,
1536
                          dsth,
1537
                          0,
1538
                          ctx.srch,
1539
                          "vertical queue=%d threads=%d",
1540
                          queue_depth,
1541
                          threads);
1542
    }
UNCOV
1543
    pool = threadpool_create(threads,
×
1544
                             queue_depth,
1545
                             0,
1546
                             scale_parallel_worker,
1547
                             &ctx);
1548
    if (pool == NULL) {
×
1549
        return SIXEL_BAD_ALLOCATION;
×
1550
    }
1551

1552
    for (y = 0; y < dsth; y++) {
×
1553
        job.band_index = y;
×
1554
        threadpool_push(pool, job);
×
1555
    }
1556
    threadpool_finish(pool);
×
1557
    rc = threadpool_get_error(pool);
×
1558
    threadpool_destroy(pool);
×
1559

NEW
1560
    if (logger_ready) {
×
NEW
1561
        sixel_logger_logf(logger,
×
1562
                          "controller",
1563
                          "scale",
1564
                          "pass_finish",
1565
                          -1,
1566
                          dsth - 1,
1567
                          0,
1568
                          dsth,
1569
                          0,
1570
                          ctx.srch,
1571
                          "vertical complete rc=%d",
1572
                          rc);
NEW
1573
        sixel_logger_logf(logger,
×
1574
                          "controller",
1575
                          "scale",
1576
                          "finish",
1577
                          -1,
1578
                          dsth - 1,
1579
                          0,
1580
                          dsth,
1581
                          0,
1582
                          ctx.srch,
1583
                          "parallel scale status=%d",
1584
                          rc);
1585
    }
1586

UNCOV
1587
    return rc;
×
1588
}
25✔
1589
#endif /* SIXEL_ENABLE_THREADS */
1590

1591
/*
1592
 * Allocate shared scratch storage and attempt the parallel pipeline first so
1593
 * larger inputs benefit from threading while smaller ones retain the serial
1594
 * behavior.
1595
 */
1596
static void
1597
scale_with_resampling(
75✔
1598
    unsigned char *dst,
1599
    unsigned char const *src,
1600
    int const srcw,
1601
    int const srch,
1602
    int const dstw,
1603
    int const dsth,
1604
    int const depth,
1605
    resample_fn_t const f_resample,
1606
    double n,
1607
    sixel_allocator_t *allocator)
1608
{
1609
    unsigned char *tmp;
1610
    size_t tmp_size;
1611
#if SIXEL_ENABLE_THREADS
1612
    int rc;
1613
    sixel_logger_t logger;
1614
    int logger_prepared;
1615
#endif
1616

1617
#if SIXEL_ENABLE_THREADS
1618
    sixel_logger_init(&logger);
75✔
1619
    logger_prepared = 0;
75✔
1620
    (void)sixel_logger_prepare_env(&logger);
75✔
1621
    logger_prepared = logger.active;
75✔
1622
#endif
1623

1624
    tmp_size = (size_t)dstw * (size_t)srch * (size_t)depth;
75✔
1625
    tmp = (unsigned char *)sixel_allocator_malloc(allocator, tmp_size);
75✔
1626
    if (tmp == NULL) {
75!
1627
#if SIXEL_ENABLE_THREADS
NEW
1628
        if (logger_prepared) {
×
NEW
1629
            sixel_logger_close(&logger);
×
1630
        }
1631
#endif
UNCOV
1632
        return;
×
1633
    }
1634

1635
#if SIXEL_ENABLE_THREADS
1636
    rc = scale_with_resampling_parallel(dst,
100!
1637
                                        src,
25✔
1638
                                        srcw,
25✔
1639
                                        srch,
25✔
1640
                                        dstw,
25✔
1641
                                        dsth,
25✔
1642
                                        depth,
25✔
1643
                                        f_resample,
25✔
1644
                                        n,
25✔
1645
                                        tmp,
25✔
1646
                                        logger_prepared
25!
1647
                                            ? &logger
1648
                                            : NULL);
1649
    if (rc == SIXEL_OK) {
75!
1650
        sixel_allocator_free(allocator, tmp);
×
NEW
1651
        if (logger_prepared) {
×
NEW
1652
            sixel_logger_close(&logger);
×
1653
        }
UNCOV
1654
        return;
×
1655
    }
1656

1657
    if (logger_prepared) {
75!
NEW
1658
        sixel_logger_logf(&logger,
×
1659
                          "controller",
1660
                          "scale",
1661
                          "fallback",
1662
                          -1,
1663
                          -1,
1664
                          0,
1665
                          dsth,
1666
                          0,
1667
                          srch,
1668
                          "parallel rc=%d",
1669
                          rc);
1670
    }
1671
#endif
1672

1673
    scale_with_resampling_serial(dst,
100✔
1674
                                 src,
25✔
1675
                                 srcw,
25✔
1676
                                 srch,
25✔
1677
                                 dstw,
25✔
1678
                                 dsth,
25✔
1679
                                 depth,
25✔
1680
                                 f_resample,
25✔
1681
                                 n,
25✔
1682
                                 tmp);
25✔
1683

1684
    sixel_allocator_free(allocator, tmp);
75✔
1685
#if SIXEL_ENABLE_THREADS
1686
    if (logger_prepared) {
75!
NEW
1687
        sixel_logger_close(&logger);
×
1688
    }
1689
#endif
1690
}
25✔
1691

1692
static void
1693
scale_with_resampling_float32(
×
1694
    float *dst,
1695
    float const *src,
1696
    int const srcw,
1697
    int const srch,
1698
    int const dstw,
1699
    int const dsth,
1700
    int const depth,
1701
    resample_fn_t const f_resample,
1702
    double n,
1703
    sixel_allocator_t *allocator)
1704
{
1705
    int w;
1706
    int h;
1707
    int x;
1708
    int y;
1709
    int i;
1710
    int pos;
1711
    int x_first;
1712
    int x_last;
1713
    int y_first;
1714
    int y_last;
1715
    double center_x;
1716
    double center_y;
1717
    double diff_x;
1718
    double diff_y;
1719
    double weight;
1720
    double total;
1721
    double offsets[8];
1722
    float *tmp;
1723
#if defined(SIXEL_USE_SSE2) || defined(SIXEL_USE_NEON)
1724
    float vecbuf[4];
1725
#endif
1726
    int simd_level;
1727
#if defined(SIXEL_USE_AVX512)
1728
    __m512 acc512;
1729
#endif
1730
#if defined(SIXEL_USE_AVX2) || defined(SIXEL_USE_AVX)
1731
    __m256 acc256;
1732
#endif
1733
#if defined(SIXEL_USE_SSE2)
1734
    __m128 acc128;
1735
    __m128 pixf128;
1736
    __m128 wv128;
1737
    __m128 scalev128;
1738
    __m128 minv128;
1739
    __m128 maxv128;
1740
#elif defined(SIXEL_USE_NEON)
1741
    float32x4_t acc_neon;
1742
    float32x4_t pixf_neon;
1743
    float32x4_t wv_neon;
1744
    float32x4_t scalev_neon;
1745
    float32x4_t minv_neon;
1746
    float32x4_t maxv_neon;
1747
#endif
1748

1749
    tmp = (float *)sixel_allocator_malloc(
×
1750
        allocator,
1751
        (size_t)(dstw * srch * depth * (int)sizeof(float)));
×
1752
    if (tmp == NULL) {
×
1753
        return;
×
1754
    }
1755

1756
    simd_level = sixel_scale_simd_level();
×
1757

1758
    for (y = 0; y < srch; y++) {
×
1759
        for (w = 0; w < dstw; w++) {
×
1760
            total = 0.0;
×
1761
            for (i = 0; i < depth; i++) {
×
1762
                offsets[i] = 0.0;
×
1763
            }
1764

1765
            if (dstw >= srcw) {
×
1766
                center_x = (w + 0.5) * srcw / dstw;
×
1767
                x_first = MAX(center_x - n, 0);
×
1768
                x_last = MIN(center_x + n, srcw - 1);
×
1769
            } else {
1770
                center_x = w + 0.5;
×
1771
                x_first = MAX(floor((center_x - n) * srcw / dstw), 0);
×
1772
                x_last = MIN(floor((center_x + n) * srcw / dstw),
×
1773
                             srcw - 1);
1774
            }
1775

1776
#if defined(SIXEL_USE_AVX512)
1777
            if (depth == 3 &&
×
1778
                simd_level >= SIXEL_SIMD_LEVEL_AVX512) {
1779
                acc512 = sixel_avx512_zero_ps();
1780

1781
                for (x = x_first; x <= x_last; x++) {
×
1782
                    diff_x = (dstw >= srcw)
1783
                                 ? (x + 0.5) - center_x
1784
                                 : (x + 0.5) * srcw / dstw - center_x;
×
1785
                    weight = f_resample(fabs(diff_x));
1786
                    pos = (y * srcw + x) * depth;
1787
                    acc512 = sixel_avx512_muladd_ps(
1788
                        acc512,
1789
                        sixel_avx512_load_rgb_f32(src + pos),
1790
                        (float)weight);
1791
                    total += weight;
1792
                }
1793
                if (total > 0.0) {
×
1794
                    pos = (y * dstw + w) * depth;
1795
                    sixel_avx512_store_rgb_f32(acc512, total, tmp + pos);
1796
                }
1797
            } else
1798
#endif
1799
#if defined(SIXEL_USE_AVX2)
1800
            if (depth == 3 && simd_level >= SIXEL_SIMD_LEVEL_AVX2) {
×
1801
                acc256 = sixel_avx2_zero_ps();
1802

1803
                for (x = x_first; x <= x_last; x++) {
×
1804
                    diff_x = (dstw >= srcw)
1805
                                 ? (x + 0.5) - center_x
1806
                                 : (x + 0.5) * srcw / dstw - center_x;
×
1807
                    weight = f_resample(fabs(diff_x));
1808
                    pos = (y * srcw + x) * depth;
1809
                    acc256 = sixel_avx2_muladd_ps(
1810
                        acc256,
1811
                        sixel_avx2_load_rgb_f32(src + pos),
1812
                        (float)weight);
1813
                    total += weight;
1814
                }
1815
                if (total > 0.0) {
×
1816
                    pos = (y * dstw + w) * depth;
1817
                    sixel_avx2_store_rgb_f32(acc256, total, tmp + pos);
1818
                }
1819
            } else
1820
#endif
1821
#if defined(SIXEL_USE_AVX)
1822
            if (depth == 3 && simd_level >= SIXEL_SIMD_LEVEL_AVX) {
×
1823
                acc256 = sixel_avx_zero_ps();
1824

1825
                for (x = x_first; x <= x_last; x++) {
×
1826
                    diff_x = (dstw >= srcw)
1827
                                 ? (x + 0.5) - center_x
1828
                                 : (x + 0.5) * srcw / dstw - center_x;
×
1829
                    weight = f_resample(fabs(diff_x));
1830
                    pos = (y * srcw + x) * depth;
1831
                    acc256 = sixel_avx_muladd_ps(
1832
                        acc256,
1833
                        sixel_avx_load_rgb_f32(src + pos),
1834
                        (float)weight);
1835
                    total += weight;
1836
                }
1837
                if (total > 0.0) {
×
1838
                    pos = (y * dstw + w) * depth;
1839
                    sixel_avx_store_rgb_f32(acc256, total, tmp + pos);
1840
                }
1841
            } else
1842
#endif
1843
#if defined(SIXEL_USE_SSE2) || defined(SIXEL_USE_NEON)
1844
            if (depth == 3
×
1845
# if defined(SIXEL_USE_SSE2)
1846
                && simd_level >= SIXEL_SIMD_LEVEL_SSE2
×
1847
# elif defined(SIXEL_USE_NEON)
1848
                && simd_level >= SIXEL_SIMD_LEVEL_NEON
×
1849
# endif
1850
                ) {
1851
#if defined(SIXEL_USE_SSE2)
1852
                acc128 = _mm_setzero_ps();
1853
                minv128 = _mm_set1_ps(0.0f);
1854
                maxv128 = _mm_set1_ps(1.0f);
1855
#elif defined(SIXEL_USE_NEON)
1856
                acc_neon = vdupq_n_f32(0.0f);
1857
                minv_neon = vdupq_n_f32(0.0f);
1858
                maxv_neon = vdupq_n_f32(1.0f);
1859
#endif
1860
                for (x = x_first; x <= x_last; x++) {
×
1861
                    diff_x = (dstw >= srcw)
×
1862
                                 ? (x + 0.5) - center_x
×
1863
                                 : (x + 0.5) * srcw / dstw - center_x;
×
1864
                    weight = f_resample(fabs(diff_x));
×
1865
                    pos = (y * srcw + x) * depth;
×
1866
                    const float *psrc = src + pos;
×
1867
#if defined(SIXEL_USE_SSE2)
1868
                    pixf128 = _mm_set_ps(
1869
                        0.0f, psrc[2], psrc[1], psrc[0]);
1870
                    wv128 = _mm_set1_ps((float)weight);
1871
                    acc128 = _mm_add_ps(acc128,
1872
                                        _mm_mul_ps(pixf128, wv128));
1873
#else /* NEON */
1874
                    /*
1875
                     * Expand the RGB triple into a NEON vector without
1876
                     * brace initialization to keep older toolchains
1877
                     * happy.
1878
                     */
1879
                    pixf_neon = vdupq_n_f32(0.0f);
1880
                    pixf_neon = vsetq_lane_f32(psrc[0], pixf_neon, 0);
1881
                    pixf_neon = vsetq_lane_f32(psrc[1], pixf_neon, 1);
1882
                    pixf_neon = vsetq_lane_f32(psrc[2], pixf_neon, 2);
1883
                    wv_neon = vdupq_n_f32((float)weight);
1884
                    acc_neon = vmlaq_f32(acc_neon, pixf_neon, wv_neon);
1885
#endif
1886
                    total += weight;
×
1887
                }
1888
                if (total > 0.0) {
×
1889
#if defined(SIXEL_USE_SSE2)
1890
                    scalev128 = _mm_set1_ps((float)(1.0 / total));
1891
                    acc128 = _mm_mul_ps(acc128, scalev128);
1892
                    acc128 = _mm_max_ps(minv128,
1893
                                        _mm_min_ps(acc128, maxv128));
1894
                    _mm_storeu_ps(vecbuf, acc128);
1895
#else /* NEON */
1896
                    scalev_neon = vdupq_n_f32(
1897
                        (float)(1.0 / total));
1898
                    acc_neon = vmulq_f32(acc_neon, scalev_neon);
1899
                    acc_neon = vmaxq_f32(minv_neon,
1900
                                         vminq_f32(acc_neon, maxv_neon));
1901
                    vst1q_f32(vecbuf, acc_neon);
1902
#endif
1903
                    pos = (y * dstw + w) * depth;
×
1904
                    tmp[pos + 0] = vecbuf[0];
×
1905
                    tmp[pos + 1] = vecbuf[1];
×
1906
                    tmp[pos + 2] = vecbuf[2];
×
1907
                }
1908
            } else
1909
#endif
1910
            {
1911
                for (x = x_first; x <= x_last; x++) {
×
1912
                    diff_x = (dstw >= srcw)
×
1913
                                 ? (x + 0.5) - center_x
×
1914
                                 : (x + 0.5) * srcw / dstw - center_x;
×
1915
                    weight = f_resample(fabs(diff_x));
×
1916
                    for (i = 0; i < depth; i++) {
×
1917
                        pos = (y * srcw + x) * depth + i;
×
1918
                        offsets[i] += src[pos] * weight;
×
1919
                    }
1920
                    total += weight;
×
1921
                }
1922

1923
                if (total > 0.0) {
×
1924
                    for (i = 0; i < depth; i++) {
×
1925
                        pos = (y * dstw + w) * depth + i;
×
1926
                        tmp[pos] = sixel_clamp_unit_f32(
×
1927
                            (float)(offsets[i] / total));
×
1928
                    }
1929
                }
1930
            }
1931
        }
1932
    }
1933

1934
    for (h = 0; h < dsth; h++) {
×
1935
        for (w = 0; w < dstw; w++) {
×
1936
            total = 0.0;
×
1937
            for (i = 0; i < depth; i++) {
×
1938
                offsets[i] = 0.0;
×
1939
            }
1940

1941
            if (dsth >= srch) {
×
1942
                center_y = (h + 0.5) * srch / dsth;
×
1943
                y_first = MAX(center_y - n, 0);
×
1944
                y_last = MIN(center_y + n, srch - 1);
×
1945
            } else {
1946
                center_y = h + 0.5;
×
1947
                y_first = MAX(floor((center_y - n) * srch / dsth), 0);
×
1948
                y_last = MIN(floor((center_y + n) * srch / dsth),
×
1949
                             srch - 1);
1950
            }
1951

1952
#if defined(SIXEL_USE_AVX512)
1953
            if (depth == 3 &&
×
1954
                simd_level >= SIXEL_SIMD_LEVEL_AVX512) {
1955
                acc512 = sixel_avx512_zero_ps();
1956

1957
                for (y = y_first; y <= y_last; y++) {
×
1958
                    diff_y = (dsth >= srch)
1959
                                 ? (y + 0.5) - center_y
1960
                                 : (y + 0.5) * dsth / srch - center_y;
×
1961
                    weight = f_resample(fabs(diff_y));
1962
                    pos = (y * dstw + w) * depth;
1963
                    acc512 = sixel_avx512_muladd_ps(
1964
                        acc512,
1965
                        sixel_avx512_load_rgb_f32(tmp + pos),
1966
                        (float)weight);
1967
                    total += weight;
1968
                }
1969
                if (total > 0.0) {
×
1970
                    pos = (h * dstw + w) * depth;
1971
                    sixel_avx512_store_rgb_f32(acc512, total, dst + pos);
1972
                }
1973
            } else
1974
#endif
1975
#if defined(SIXEL_USE_AVX2)
1976
            if (depth == 3 && simd_level >= SIXEL_SIMD_LEVEL_AVX2) {
×
1977
                acc256 = sixel_avx2_zero_ps();
1978

1979
                for (y = y_first; y <= y_last; y++) {
×
1980
                    diff_y = (dsth >= srch)
1981
                                 ? (y + 0.5) - center_y
1982
                                 : (y + 0.5) * dsth / srch - center_y;
×
1983
                    weight = f_resample(fabs(diff_y));
1984
                    pos = (y * dstw + w) * depth;
1985
                    acc256 = sixel_avx2_muladd_ps(
1986
                        acc256,
1987
                        sixel_avx2_load_rgb_f32(tmp + pos),
1988
                        (float)weight);
1989
                    total += weight;
1990
                }
1991
                if (total > 0.0) {
×
1992
                    pos = (h * dstw + w) * depth;
1993
                    sixel_avx2_store_rgb_f32(acc256, total, dst + pos);
1994
                }
1995
            } else
1996
#endif
1997
#if defined(SIXEL_USE_AVX)
1998
            if (depth == 3 && simd_level >= SIXEL_SIMD_LEVEL_AVX) {
×
1999
                acc256 = sixel_avx_zero_ps();
2000

2001
                for (y = y_first; y <= y_last; y++) {
×
2002
                    diff_y = (dsth >= srch)
2003
                                 ? (y + 0.5) - center_y
2004
                                 : (y + 0.5) * dsth / srch - center_y;
×
2005
                    weight = f_resample(fabs(diff_y));
2006
                    pos = (y * dstw + w) * depth;
2007
                    acc256 = sixel_avx_muladd_ps(
2008
                        acc256,
2009
                        sixel_avx_load_rgb_f32(tmp + pos),
2010
                        (float)weight);
2011
                    total += weight;
2012
                }
2013
                if (total > 0.0) {
×
2014
                    pos = (h * dstw + w) * depth;
2015
                    sixel_avx_store_rgb_f32(acc256, total, dst + pos);
2016
                }
2017
            } else
2018
#endif
2019
#if defined(SIXEL_USE_SSE2) || defined(SIXEL_USE_NEON)
2020
            if (depth == 3
×
2021
# if defined(SIXEL_USE_SSE2)
2022
                && simd_level >= SIXEL_SIMD_LEVEL_SSE2
×
2023
# elif defined(SIXEL_USE_NEON)
2024
                && simd_level >= SIXEL_SIMD_LEVEL_NEON
×
2025
# endif
2026
                ) {
2027
#if defined(SIXEL_USE_SSE2)
2028
                acc128 = _mm_setzero_ps();
2029
                minv128 = _mm_set1_ps(0.0f);
2030
                maxv128 = _mm_set1_ps(1.0f);
2031
#elif defined(SIXEL_USE_NEON)
2032
                acc_neon = vdupq_n_f32(0.0f);
2033
                minv_neon = vdupq_n_f32(0.0f);
2034
                maxv_neon = vdupq_n_f32(1.0f);
2035
#endif
2036
                for (y = y_first; y <= y_last; y++) {
×
2037
                    diff_y = (dsth >= srch)
×
2038
                                 ? (y + 0.5) - center_y
×
2039
                                 : (y + 0.5) * dsth / srch - center_y;
×
2040
                    weight = f_resample(fabs(diff_y));
×
2041
                    pos = (y * dstw + w) * depth;
×
2042
                    const float *psrc = tmp + pos;
×
2043
#if defined(SIXEL_USE_SSE2)
2044
                    pixf128 = _mm_set_ps(
2045
                        0.0f, psrc[2], psrc[1], psrc[0]);
2046
                    wv128 = _mm_set1_ps((float)weight);
2047
                    acc128 = _mm_add_ps(acc128,
2048
                                        _mm_mul_ps(pixf128, wv128));
2049
#else /* NEON */
2050
                    /*
2051
                     * Expand the RGB triple into a NEON vector without
2052
                     * brace initialization to keep older toolchains
2053
                     * happy.
2054
                     */
2055
                    pixf_neon = vdupq_n_f32(0.0f);
2056
                    pixf_neon = vsetq_lane_f32(psrc[0], pixf_neon, 0);
2057
                    pixf_neon = vsetq_lane_f32(psrc[1], pixf_neon, 1);
2058
                    pixf_neon = vsetq_lane_f32(psrc[2], pixf_neon, 2);
2059
                    wv_neon = vdupq_n_f32((float)weight);
2060
                    acc_neon = vmlaq_f32(acc_neon, pixf_neon, wv_neon);
2061
#endif
2062
                    total += weight;
×
2063
                }
2064
                if (total > 0.0) {
×
2065
#if defined(SIXEL_USE_SSE2)
2066
                    scalev128 = _mm_set1_ps((float)(1.0 / total));
2067
                    acc128 = _mm_mul_ps(acc128, scalev128);
2068
                    acc128 = _mm_max_ps(minv128,
2069
                                        _mm_min_ps(acc128, maxv128));
2070
                    _mm_storeu_ps(vecbuf, acc128);
2071
#else /* NEON */
2072
                    scalev_neon = vdupq_n_f32(
2073
                        (float)(1.0 / total));
2074
                    acc_neon = vmulq_f32(acc_neon, scalev_neon);
2075
                    acc_neon = vmaxq_f32(minv_neon,
2076
                                         vminq_f32(acc_neon, maxv_neon));
2077
                    vst1q_f32(vecbuf, acc_neon);
2078
#endif
2079
                    pos = (h * dstw + w) * depth;
×
2080
                    dst[pos + 0] = vecbuf[0];
×
2081
                    dst[pos + 1] = vecbuf[1];
×
2082
                    dst[pos + 2] = vecbuf[2];
×
2083
                }
2084
            } else
2085
#endif
2086
            {
2087
                for (y = y_first; y <= y_last; y++) {
×
2088
                    diff_y = (dsth >= srch)
×
2089
                                 ? (y + 0.5) - center_y
×
2090
                                 : (y + 0.5) * dsth / srch - center_y;
×
2091
                    weight = f_resample(fabs(diff_y));
×
2092
                    for (i = 0; i < depth; i++) {
×
2093
                        pos = (y * dstw + w) * depth + i;
×
2094
                        offsets[i] += tmp[pos] * weight;
×
2095
                    }
2096
                    total += weight;
×
2097
                }
2098

2099
                if (total > 0.0) {
×
2100
                    for (i = 0; i < depth; i++) {
×
2101
                        pos = (h * dstw + w) * depth + i;
×
2102
                        dst[pos] = sixel_clamp_unit_f32(
×
2103
                            (float)(offsets[i] / total));
×
2104
                    }
2105
                }
2106
            }
2107
        }
2108
    }
2109

2110
    sixel_allocator_free(allocator, tmp);
×
2111
}
2112

2113

2114
SIXELAPI int
2115
sixel_helper_scale_image(
93✔
2116
    unsigned char       /* out */ *dst,
2117
    unsigned char const /* in */  *src,                   /* source image data */
2118
    int                 /* in */  srcw,                   /* source image width */
2119
    int                 /* in */  srch,                   /* source image height */
2120
    int                 /* in */  pixelformat,            /* one of enum pixelFormat */
2121
    int                 /* in */  dstw,                   /* destination image width */
2122
    int                 /* in */  dsth,                   /* destination image height */
2123
    int                 /* in */  method_for_resampling,  /* one of methodForResampling */
2124
    sixel_allocator_t   /* in */  *allocator)             /* allocator object */
2125
{
2126
    /*
2127
     * Convert the source image to RGB24 if necessary and scale it to the
2128
     * requested destination size.  The caller supplies an allocator used
2129
     * for any temporary buffers required during conversion or filtering.
2130
     */
2131
    int const depth = sixel_helper_compute_depth(pixelformat);
93✔
2132
    unsigned char *new_src = NULL;  /* optional converted source buffer */
93✔
2133
    int nret;
2134
    int new_pixelformat;
2135

2136
    /* ensure the scaler operates on RGB triples */
2137
    if (depth != 3) {
93!
2138
        new_src = (unsigned char *)sixel_allocator_malloc(allocator,
×
2139
                                                          (size_t)(srcw * srch * 3));
×
2140
        if (new_src == NULL) {
×
2141
            return (-1);
×
2142
        }
2143
        nret = sixel_helper_normalize_pixelformat(new_src,
×
2144
                                                  &new_pixelformat,
2145
                                                  src, pixelformat,
2146
                                                  srcw, srch);
2147
        if (nret != 0) {
×
2148
            sixel_allocator_free(allocator, new_src);
×
2149
            return (-1);
×
2150
        }
2151

2152
        src = new_src;  /* use converted buffer from here on */
×
2153
    } else {
2154
        new_pixelformat = pixelformat;
93✔
2155
    }
2156

2157
    /* choose re-sampling strategy */
2158
    switch (method_for_resampling) {
93!
2159
    case SIXEL_RES_NEAREST:
12✔
2160
        scale_without_resampling(dst, src, srcw, srch, dstw, dsth, depth);
18✔
2161
        break;
18✔
2162
    case SIXEL_RES_GAUSSIAN:
2✔
2163
        scale_with_resampling(dst, src, srcw, srch, dstw, dsth, depth,
4✔
2164
                              gaussian, 1.0, allocator);
1✔
2165
        break;
3✔
2166
    case SIXEL_RES_HANNING:
2✔
2167
        scale_with_resampling(dst, src, srcw, srch, dstw, dsth, depth,
4✔
2168
                              hanning, 1.0, allocator);
1✔
2169
        break;
3✔
2170
    case SIXEL_RES_HAMMING:
2✔
2171
        scale_with_resampling(dst, src, srcw, srch, dstw, dsth, depth,
4✔
2172
                              hamming, 1.0, allocator);
1✔
2173
        break;
3✔
2174
    case SIXEL_RES_WELSH:
2✔
2175
        scale_with_resampling(dst, src, srcw, srch, dstw, dsth, depth,
4✔
2176
                              welsh, 1.0, allocator);
1✔
2177
        break;
3✔
2178
    case SIXEL_RES_BICUBIC:
2✔
2179
        scale_with_resampling(dst, src, srcw, srch, dstw, dsth, depth,
4✔
2180
                              bicubic, 2.0, allocator);
1✔
2181
        break;
3✔
2182
    case SIXEL_RES_LANCZOS2:
4✔
2183
        scale_with_resampling(dst, src, srcw, srch, dstw, dsth, depth,
8✔
2184
                              lanczos2, 2.0, allocator);
2✔
2185
        break;
6✔
2186
    case SIXEL_RES_LANCZOS3:
2✔
2187
        scale_with_resampling(dst, src, srcw, srch, dstw, dsth, depth,
4✔
2188
                              lanczos3, 3.0, allocator);
1✔
2189
        break;
3✔
2190
    case SIXEL_RES_LANCZOS4:
2✔
2191
        scale_with_resampling(dst, src, srcw, srch, dstw, dsth, depth,
4✔
2192
                              lanczos4, 4.0, allocator);
1✔
2193
        break;
3✔
2194
    case SIXEL_RES_BILINEAR:
48✔
2195
    default:
2196
        scale_with_resampling(dst, src, srcw, srch, dstw, dsth, depth,
64✔
2197
                              bilinear, 1.0, allocator);
16✔
2198
        break;
48✔
2199
    }
2200

2201
    /* release temporary copy created for pixel-format normalization */
2202
    sixel_allocator_free(allocator, new_src);
93✔
2203
    return 0;
93✔
2204
}
31✔
2205

2206
SIXELAPI int
2207
sixel_helper_scale_image_float32(
×
2208
    float             /* out */ *dst,
2209
    float const       /* in */  *src,
2210
    int               /* in */  srcw,
2211
    int               /* in */  srch,
2212
    int               /* in */  pixelformat,
2213
    int               /* in */  dstw,
2214
    int               /* in */  dsth,
2215
    int               /* in */  method_for_resampling,
2216
    sixel_allocator_t /* in */  *allocator)
2217
{
2218
    int depth;
2219
    int depth_bytes;
2220

2221
    depth_bytes = sixel_helper_compute_depth(pixelformat);
×
2222
    if (depth_bytes <= 0) {
×
2223
        return (-1);
×
2224
    }
2225

2226
    depth = depth_bytes / (int)sizeof(float);
×
2227
    if (depth * (int)sizeof(float) != depth_bytes) {
×
2228
        return (-1);
×
2229
    }
2230

2231
    switch (method_for_resampling) {
×
2232
    case SIXEL_RES_NEAREST:
2233
        scale_without_resampling_float32(
×
2234
            dst, src, srcw, srch, dstw, dsth, depth);
2235
        break;
×
2236
    case SIXEL_RES_GAUSSIAN:
2237
        scale_with_resampling_float32(
×
2238
            dst, src, srcw, srch, dstw, dsth, depth,
2239
            gaussian, 1.0, allocator);
2240
        break;
×
2241
    case SIXEL_RES_HANNING:
2242
        scale_with_resampling_float32(
×
2243
            dst, src, srcw, srch, dstw, dsth, depth,
2244
            hanning, 1.0, allocator);
2245
        break;
×
2246
    case SIXEL_RES_HAMMING:
2247
        scale_with_resampling_float32(
×
2248
            dst, src, srcw, srch, dstw, dsth, depth,
2249
            hamming, 1.0, allocator);
2250
        break;
×
2251
    case SIXEL_RES_WELSH:
2252
        scale_with_resampling_float32(
×
2253
            dst, src, srcw, srch, dstw, dsth, depth,
2254
            welsh, 1.0, allocator);
2255
        break;
×
2256
    case SIXEL_RES_BICUBIC:
2257
        scale_with_resampling_float32(
×
2258
            dst, src, srcw, srch, dstw, dsth, depth,
2259
            bicubic, 2.0, allocator);
2260
        break;
×
2261
    case SIXEL_RES_LANCZOS2:
2262
        scale_with_resampling_float32(
×
2263
            dst, src, srcw, srch, dstw, dsth, depth,
2264
            lanczos2, 2.0, allocator);
2265
        break;
×
2266
    case SIXEL_RES_LANCZOS3:
2267
        scale_with_resampling_float32(
×
2268
            dst, src, srcw, srch, dstw, dsth, depth,
2269
            lanczos3, 3.0, allocator);
2270
        break;
×
2271
    case SIXEL_RES_LANCZOS4:
2272
        scale_with_resampling_float32(
×
2273
            dst, src, srcw, srch, dstw, dsth, depth,
2274
            lanczos4, 4.0, allocator);
2275
        break;
×
2276
    case SIXEL_RES_BILINEAR:
×
2277
    default:
2278
        scale_with_resampling_float32(
×
2279
            dst, src, srcw, srch, dstw, dsth, depth,
2280
            bilinear, 1.0, allocator);
2281
        break;
×
2282
    }
2283

2284
    return 0;
×
2285
}
2286

2287
#if HAVE_TESTS
2288

2289
static void
reference_scale(
    unsigned char *dst,
    unsigned char const *src,
    int const srcw,
    int const srch,
    int const dstw,
    int const dsth,
    int const depth)
{
    /*
     * Straightforward nearest-neighbour scaler used as the expected
     * result in the unit tests: destination pixel (col, row) copies the
     * source pixel at (col * srcw / dstw, row * srch / dsth), one byte
     * per channel.
     */
    int row;
    int col;
    int src_row;
    int src_col;
    int ch;
    int src_base;
    int dst_base;

    for (row = 0; row < dsth; row++) {
        /* the source row depends only on the destination row */
        src_row = (long)row * srch / dsth;
        for (col = 0; col < dstw; col++) {
            src_col = (long)col * srcw / dstw;
            src_base = (src_row * srcw + src_col) * depth;
            dst_base = (row * dstw + col) * depth;
            for (ch = 0; ch < depth; ch++) {
                dst[dst_base + ch] = src[src_base + ch];
            }
        }
    }
}
×
2317

2318
static int
test_without_resampling_case(
    int srcw,
    int srch,
    int dstw,
    int dsth,
    int depth)
{
    /*
     * Scale a synthetic srcw x srch image to dstw x dsth with both
     * reference_scale() and scale_without_resampling() and compare the
     * two results byte for byte.  Returns EXIT_SUCCESS when they match,
     * EXIT_FAILURE on a mismatch or when a buffer cannot be allocated.
     */
    size_t const src_bytes = (size_t)srcw * srch * depth;
    size_t const dst_bytes = (size_t)dstw * dsth * depth;
    unsigned char *input = (unsigned char *)malloc(src_bytes);
    unsigned char *expected = (unsigned char *)malloc(dst_bytes);
    unsigned char *actual = (unsigned char *)malloc(dst_bytes);
    int status = EXIT_FAILURE;
    size_t k;

    if (input != NULL && expected != NULL && actual != NULL) {
        /* fill the source with a repeating 0..255 byte ramp */
        for (k = 0; k < src_bytes; ++k) {
            input[k] = (unsigned char)(k & 0xff);
        }

        reference_scale(expected, input, srcw, srch, dstw, dsth, depth);
        scale_without_resampling(actual, input, srcw, srch, dstw, dsth, depth);

        if (memcmp(expected, actual, dst_bytes) == 0) {
            status = EXIT_SUCCESS;
        }
    }

    /* free(NULL) is a no-op, so partial allocations are handled too */
    free(input);
    free(expected);
    free(actual);
    return status;
}
2360

2361
SIXELAPI int
sixel_scale_tests_main(void)
{
    /*
     * Run every nearest-neighbour scaling case in the table below.
     * Returns EXIT_SUCCESS when all cases pass; otherwise returns the
     * status of the first failing case.
     */
    static const struct {
        int srcw;
        int srch;
        int dstw;
        int dsth;
        int depth;
    } table[] = {
        {8, 4, 3, 7, 3},    /* narrower and taller, 3 bytes per pixel */
        {13, 9, 17, 6, 4}   /* wider and shorter, 4 bytes per pixel */
    };
    size_t const count = sizeof(table) / sizeof(table[0]);
    size_t idx;
    int status;

    for (idx = 0; idx < count; ++idx) {
        status = test_without_resampling_case(table[idx].srcw,
                                              table[idx].srch,
                                              table[idx].dstw,
                                              table[idx].dsth,
                                              table[idx].depth);
        if (status != EXIT_SUCCESS) {
            return status;
        }
    }

    return EXIT_SUCCESS;
}
2393

2394
#endif /* HAVE_TESTS */
2395

2396
#if defined(__GNUC__) && !defined(__clang__)
2397
# pragma GCC diagnostic pop
2398
#endif
2399

2400
/* emacs Local Variables:      */
2401
/* emacs mode: c               */
2402
/* emacs tab-width: 4          */
2403
/* emacs indent-tabs-mode: nil */
2404
/* emacs c-basic-offset: 4     */
2405
/* emacs End:                  */
2406
/* vim: set expandtab ts=4 sts=4 sw=4 : */
2407
/* EOF */
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2026 Coveralls, Inc