• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

OSGeo / gdal / 15899162844

26 Jun 2025 10:14AM UTC coverage: 71.088% (+0.004%) from 71.084%
15899162844

Pull #12623

github

web-flow
Merge c704a8392 into f5cb024d4
Pull Request #12623: gdal raster overview add: add a --overview-src option

209 of 244 new or added lines in 5 files covered. (85.66%)

96 existing lines in 44 files now uncovered.

574014 of 807474 relevant lines covered (71.09%)

250815.03 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

87.45
/gcore/overview.cpp
1

2
/******************************************************************************
3
 *
4
 * Project:  GDAL Core
5
 * Purpose:  Helper code to implement overview support in different drivers.
6
 * Author:   Frank Warmerdam, warmerdam@pobox.com
7
 *
8
 ******************************************************************************
9
 * Copyright (c) 2000, Frank Warmerdam
10
 * Copyright (c) 2007-2010, Even Rouault <even dot rouault at spatialys.com>
11
 *
12
 * SPDX-License-Identifier: MIT
13
 ****************************************************************************/
14

15
#include "cpl_port.h"
16
#include "gdal_priv.h"
17

18
#include <cmath>
19
#include <cstddef>
20
#include <cstdlib>
21

22
#include <algorithm>
23
#include <complex>
24
#include <condition_variable>
25
#include <limits>
26
#include <list>
27
#include <memory>
28
#include <mutex>
29
#include <vector>
30

31
#include "cpl_conv.h"
32
#include "cpl_error.h"
33
#include "cpl_float.h"
34
#include "cpl_progress.h"
35
#include "cpl_vsi.h"
36
#include "gdal.h"
37
#include "gdal_thread_pool.h"
38
#include "gdalwarper.h"
39
#include "gdal_vrt.h"
40
#include "vrtdataset.h"
41

42
#ifdef USE_NEON_OPTIMIZATIONS
43
#include "include_sse2neon.h"
44
#define USE_SSE2
45

46
#include "gdalsse_priv.h"
47

48
// Restrict to 64bit processors because they are guaranteed to have SSE2,
49
// or if __AVX2__ is defined.
50
#elif defined(__x86_64) || defined(_M_X64) || defined(__AVX2__)
51
#define USE_SSE2
52

53
#include "gdalsse_priv.h"
54

55
#ifdef __SSE3__
56
#include <pmmintrin.h>
57
#endif
58
#ifdef __SSSE3__
59
#include <tmmintrin.h>
60
#endif
61
#ifdef __SSE4_1__
62
#include <smmintrin.h>
63
#endif
64
#ifdef __AVX2__
65
#include <immintrin.h>
66
#endif
67

68
#endif
69

70
// To be included after above USE_SSE2 and include gdalsse_priv.h
71
// to avoid build issue on Windows x86
72
#include "gdal_priv_templates.hpp"
73

74
/************************************************************************/
75
/*                      GDALResampleChunk_Near()                        */
76
/************************************************************************/
77

78
template <class T>
79
static CPLErr GDALResampleChunk_NearT(const GDALOverviewResampleArgs &args,
1,233✔
80
                                      const T *pChunk, T **ppDstBuffer)
81

82
{
83
    const double dfXRatioDstToSrc = args.dfXRatioDstToSrc;
1,233✔
84
    const double dfYRatioDstToSrc = args.dfYRatioDstToSrc;
1,233✔
85
    const GDALDataType eWrkDataType = args.eWrkDataType;
1,233✔
86
    const int nChunkXOff = args.nChunkXOff;
1,233✔
87
    const int nChunkXSize = args.nChunkXSize;
1,233✔
88
    const int nChunkYOff = args.nChunkYOff;
1,233✔
89
    const int nDstXOff = args.nDstXOff;
1,233✔
90
    const int nDstXOff2 = args.nDstXOff2;
1,233✔
91
    const int nDstYOff = args.nDstYOff;
1,233✔
92
    const int nDstYOff2 = args.nDstYOff2;
1,233✔
93
    const int nDstXWidth = nDstXOff2 - nDstXOff;
1,233✔
94

95
    /* -------------------------------------------------------------------- */
96
    /*      Allocate buffers.                                               */
97
    /* -------------------------------------------------------------------- */
98
    *ppDstBuffer = static_cast<T *>(
1,233✔
99
        VSI_MALLOC3_VERBOSE(nDstXWidth, nDstYOff2 - nDstYOff,
1,233✔
100
                            GDALGetDataTypeSizeBytes(eWrkDataType)));
101
    if (*ppDstBuffer == nullptr)
1,233✔
102
    {
103
        return CE_Failure;
×
104
    }
105
    T *const pDstBuffer = *ppDstBuffer;
1,233✔
106

107
    int *panSrcXOff =
108
        static_cast<int *>(VSI_MALLOC2_VERBOSE(nDstXWidth, sizeof(int)));
1,233✔
109

110
    if (panSrcXOff == nullptr)
1,233✔
111
    {
112
        return CE_Failure;
×
113
    }
114

115
    /* ==================================================================== */
116
    /*      Precompute inner loop constants.                                */
117
    /* ==================================================================== */
118
    for (int iDstPixel = nDstXOff; iDstPixel < nDstXOff2; ++iDstPixel)
842,009✔
119
    {
120
        int nSrcXOff = static_cast<int>(0.5 + iDstPixel * dfXRatioDstToSrc);
840,776✔
121
        if (nSrcXOff < nChunkXOff)
840,776✔
122
            nSrcXOff = nChunkXOff;
×
123

124
        panSrcXOff[iDstPixel - nDstXOff] = nSrcXOff;
840,776✔
125
    }
126

127
    /* ==================================================================== */
128
    /*      Loop over destination scanlines.                                */
129
    /* ==================================================================== */
130
    for (int iDstLine = nDstYOff; iDstLine < nDstYOff2; ++iDstLine)
141,825✔
131
    {
132
        int nSrcYOff = static_cast<int>(0.5 + iDstLine * dfYRatioDstToSrc);
140,592✔
133
        if (nSrcYOff < nChunkYOff)
140,592✔
134
            nSrcYOff = nChunkYOff;
×
135

136
        const T *const pSrcScanline =
140,592✔
137
            pChunk +
138
            (static_cast<size_t>(nSrcYOff - nChunkYOff) * nChunkXSize) -
140,592✔
139
            nChunkXOff;
138,074✔
140

141
        /* --------------------------------------------------------------------
142
         */
143
        /*      Loop over destination pixels */
144
        /* --------------------------------------------------------------------
145
         */
146
        T *pDstScanline =
140,592✔
147
            pDstBuffer + static_cast<size_t>(iDstLine - nDstYOff) * nDstXWidth;
140,592✔
148
        for (int iDstPixel = 0; iDstPixel < nDstXWidth; ++iDstPixel)
119,627,130✔
149
        {
150
            pDstScanline[iDstPixel] = pSrcScanline[panSrcXOff[iDstPixel]];
119,486,612✔
151
        }
152
    }
153

154
    CPLFree(panSrcXOff);
1,233✔
155

156
    return CE_None;
1,233✔
157
}
158

159
/**
 * Type-erased entry point for nearest-neighbour resampling.
 *
 * Reports args.eWrkDataType as the destination buffer data type, then
 * forwards to GDALResampleChunk_NearT() instantiated on an element type of
 * the same byte width as the working data type. Samples are only copied,
 * never computed on, so the width is the only property that matters.
 *
 * @param args                Resampling arguments.
 * @param pChunk              Source chunk, reinterpreted per data type width.
 * @param ppDstBuffer         Output: destination buffer pointer.
 * @param peDstBufferDataType Output: set to args.eWrkDataType.
 * @return the result of the typed implementation, or CE_Failure for
 *         GDT_Unknown / GDT_TypeCount.
 */
static CPLErr GDALResampleChunk_Near(const GDALOverviewResampleArgs &args,
                                     const void *pChunk, void **ppDstBuffer,
                                     GDALDataType *peDstBufferDataType)
{
    *peDstBufferDataType = args.eWrkDataType;
    switch (args.eWrkDataType)
    {
        // 1-byte samples
        case GDT_Byte:
        case GDT_Int8:
            CPLAssert(GDALGetDataTypeSizeBytes(args.eWrkDataType) == 1);
            return GDALResampleChunk_NearT(
                args, static_cast<const uint8_t *>(pChunk),
                reinterpret_cast<uint8_t **>(ppDstBuffer));

        // 2-byte samples
        case GDT_Int16:
        case GDT_UInt16:
        case GDT_Float16:
            CPLAssert(GDALGetDataTypeSizeBytes(args.eWrkDataType) == 2);
            return GDALResampleChunk_NearT(
                args, static_cast<const uint16_t *>(pChunk),
                reinterpret_cast<uint16_t **>(ppDstBuffer));

        // 4-byte samples
        case GDT_CInt16:
        case GDT_CFloat16:
        case GDT_Int32:
        case GDT_UInt32:
        case GDT_Float32:
            CPLAssert(GDALGetDataTypeSizeBytes(args.eWrkDataType) == 4);
            return GDALResampleChunk_NearT(
                args, static_cast<const uint32_t *>(pChunk),
                reinterpret_cast<uint32_t **>(ppDstBuffer));

        // 8-byte samples
        case GDT_CInt32:
        case GDT_CFloat32:
        case GDT_Int64:
        case GDT_UInt64:
        case GDT_Float64:
            CPLAssert(GDALGetDataTypeSizeBytes(args.eWrkDataType) == 8);
            return GDALResampleChunk_NearT(
                args, static_cast<const uint64_t *>(pChunk),
                reinterpret_cast<uint64_t **>(ppDstBuffer));

        // Complex double samples, copied as std::complex<double>
        case GDT_CFloat64:
            return GDALResampleChunk_NearT(
                args, static_cast<const std::complex<double> *>(pChunk),
                reinterpret_cast<std::complex<double> **>(ppDstBuffer));

        // Not valid working data types: fall through to the failure path.
        case GDT_Unknown:
        case GDT_TypeCount:
            break;
    }
    CPLAssert(false);
    return CE_Failure;
}
225

226
namespace
227
{
228

229
// Find in the color table the entry whose RGB value is the closest
230
// (using quadratic distance) to the test color, ignoring transparent entries.
231
int BestColorEntry(const std::vector<GDALColorEntry> &entries,
3,837✔
232
                   const GDALColorEntry &test)
233
{
234
    int nMinDist = std::numeric_limits<int>::max();
3,837✔
235
    size_t bestEntry = 0;
3,837✔
236
    for (size_t i = 0; i < entries.size(); ++i)
986,109✔
237
    {
238
        const GDALColorEntry &entry = entries[i];
982,272✔
239
        // Ignore transparent entries
240
        if (entry.c4 == 0)
982,272✔
241
            continue;
3,237✔
242

243
        int nDist = ((test.c1 - entry.c1) * (test.c1 - entry.c1)) +
979,035✔
244
                    ((test.c2 - entry.c2) * (test.c2 - entry.c2)) +
979,035✔
245
                    ((test.c3 - entry.c3) * (test.c3 - entry.c3));
979,035✔
246
        if (nDist < nMinDist)
979,035✔
247
        {
248
            nMinDist = nDist;
15,847✔
249
            bestEntry = i;
15,847✔
250
        }
251
    }
252
    return static_cast<int>(bestEntry);
3,837✔
253
}
254

255
std::vector<GDALColorEntry> ReadColorTable(const GDALColorTable &table,
7✔
256
                                           int &transparentIdx)
257
{
258
    std::vector<GDALColorEntry> entries(table.GetColorEntryCount());
7✔
259

260
    transparentIdx = -1;
7✔
261
    int i = 0;
7✔
262
    for (auto &entry : entries)
1,799✔
263
    {
264
        table.GetColorEntryAsRGB(i, &entry);
1,792✔
265
        if (transparentIdx < 0 && entry.c4 == 0)
1,792✔
266
            transparentIdx = i;
1✔
267
        ++i;
1,792✔
268
    }
269
    return entries;
7✔
270
}
271

272
}  // unnamed  namespace
273

274
/************************************************************************/
275
/*                             SQUARE()                                 */
276
/************************************************************************/
277

278
// Return val * val. The first operand is converted to Tsquare before the
// multiplication so that the product can be evaluated in a wider type than T.
template <class T, class Tsquare = T> inline Tsquare SQUARE(T val)
{
    const Tsquare widened = static_cast<Tsquare>(val);
    return widened * val;
}
282

283
/************************************************************************/
284
/*                          ComputeIntegerRMS()                         */
285
/************************************************************************/
286
// Integer root-mean-square: returns the integer r such that r**2 is the
// closest to sumSquares / weight.
//
// T is the integer result type. Twork is the integer type in which
// 2 * r * (r + 1) + 1 is evaluated, so it must be wide enough for that value.
template <class T, class Twork>
inline T ComputeIntegerRMS(double sumSquares, double weight)
{
    const double meanOfSquares = sumSquares / weight;

    // Truncated square root: the answer is either this value or the next one.
    const T truncatedRms = static_cast<T>(sqrt(meanOfSquares));

    // (r+1)**2 is closer to the mean than r**2 when
    //   (r+1)**2 - mean < mean - r**2
    // i.e. 2 * r * (r + 1) + 1 < 2 * mean
    const double roundUpThreshold = static_cast<double>(
        static_cast<Twork>(2) * truncatedRms * (truncatedRms + 1) + 1);

    return roundUpThreshold < 2 * meanOfSquares
               ? static_cast<T>(truncatedRms + 1)
               : truncatedRms;
}
302

303
template <class T, class Tsum> inline T ComputeIntegerRMS_4values(Tsum)
×
304
{
305
    CPLAssert(false);
×
306
    return 0;
307
}
308

309
// Integer RMS for a weight of 4, GByte result: sumSquares is the sum of the
// squares and the returned value r is the integer for which r**2 is the
// closest to sumSquares / 4.
template <> inline GByte ComputeIntegerRMS_4values<GByte, int>(int sumSquares)
{
    // Given the rounding correction applied below, it has been verified that
    // sqrt((float)((sumSquares + 1) / 4)) and sqrt((float)sumSquares * 0.25f)
    // give the same result. The former is preferred because the integer
    // quotient is needed a second time.
    const int quarterOfSumPlusOne = (sumSquares + 1) / 4;
    const GByte truncatedRms = static_cast<GByte>(
        std::sqrt(static_cast<float>(quarterOfSumPlusOne)));

    // Choose between r and r+1. The generic criterion
    //   weight * (r+1)**2 - sumSquares < sumSquares - weight * r**2
    // reduces, for integers and weight == 4, to
    //   r * (r + 1) < (sumSquares + 1) / 4
    const bool bRoundUp =
        static_cast<int>(truncatedRms) * (truncatedRms + 1) <
        quarterOfSumPlusOne;
    return bRoundUp ? static_cast<GByte>(truncatedRms + 1) : truncatedRms;
}
326

327
template <>
328
inline GUInt16 ComputeIntegerRMS_4values<GUInt16, double>(double sumSquares)
20✔
329
{
330
    const double sumDivWeight = sumSquares * 0.25;
20✔
331
    GUInt16 rms = static_cast<GUInt16>(std::sqrt(sumDivWeight));
20✔
332

333
    // Is rms**2 or (rms+1)**2 closest to sumSquares / weight ?
334
    // Naive version:
335
    // if( weight * (rms+1)**2 - sumSquares < sumSquares - weight * rms**2 )
336
    // Optimized version for integer case and weight == 4
337
    if (static_cast<GUInt32>(rms) * (rms + 1) <
20✔
338
        static_cast<GUInt32>(sumDivWeight + 0.25))
20✔
339
        rms += 1;
4✔
340
    return rms;
20✔
341
}
342

343
#ifdef USE_SSE2
344

345
/************************************************************************/
346
/*                   QuadraticMeanByteSSE2OrAVX2()                      */
347
/************************************************************************/
348

349
#if defined(__SSE4_1__) || defined(__AVX__) || defined(USE_NEON_OPTIMIZATIONS)
350
#define sse2_packus_epi32 _mm_packus_epi32
351
#else
352
inline __m128i sse2_packus_epi32(__m128i a, __m128i b)
516,119✔
353
{
354
    const auto minus32768_32 = _mm_set1_epi32(-32768);
516,119✔
355
    const auto minus32768_16 = _mm_set1_epi16(-32768);
516,119✔
356
    a = _mm_add_epi32(a, minus32768_32);
516,119✔
357
    b = _mm_add_epi32(b, minus32768_32);
516,119✔
358
    a = _mm_packs_epi32(a, b);
516,119✔
359
    a = _mm_sub_epi16(a, minus32768_16);
516,119✔
360
    return a;
516,119✔
361
}
362
#endif
363

364
#if defined(__SSSE3__) || defined(USE_NEON_OPTIMIZATIONS)
365
#define sse2_hadd_epi16 _mm_hadd_epi16
366
#else
367
inline __m128i sse2_hadd_epi16(__m128i a, __m128i b)
4,667,530✔
368
{
369
    // Horizontal addition of adjacent pairs
370
    const auto mask = _mm_set1_epi32(0xFFFF);
4,667,530✔
371
    const auto horizLo =
372
        _mm_add_epi32(_mm_and_si128(a, mask), _mm_srli_epi32(a, 16));
14,002,600✔
373
    const auto horizHi =
374
        _mm_add_epi32(_mm_and_si128(b, mask), _mm_srli_epi32(b, 16));
14,002,600✔
375

376
    // Recombine low and high parts
377
    return _mm_packs_epi32(horizLo, horizHi);
4,667,530✔
378
}
379
#endif
380

381
// Width-agnostic aliases used by the Byte kernels below: each name maps to
// the 256-bit intrinsic when AVX2 is enabled and to the 128-bit one
// otherwise. DEST_ELTS is the number of 16-bit lanes in one vector register.
#ifdef __AVX2__

#define DEST_ELTS 16

// Constants
#define setzero _mm256_setzero_si256
#define set1_epi16 _mm256_set1_epi16
#define set1_epi32 _mm256_set1_epi32
#define set1_ps _mm256_set1_ps

// Unaligned load
#define loadu_int(x) _mm256_loadu_si256(reinterpret_cast<__m256i const *>(x))

// Widening / narrowing
#define unpacklo_epi8 _mm256_unpacklo_epi8
#define unpackhi_epi8 _mm256_unpackhi_epi8
#define packs_epi32 _mm256_packs_epi32
#define packus_epi32 _mm256_packus_epi32
#define packus_epi16 _mm256_packus_epi16

// 16-bit integer arithmetic
#define add_epi16 _mm256_add_epi16
#define sub_epi16 _mm256_sub_epi16
#define mullo_epi16 _mm256_mullo_epi16
#define madd_epi16 _mm256_madd_epi16
#define hadd_epi16 _mm256_hadd_epi16
#define srli_epi16 _mm256_srli_epi16
#define cmpgt_epi16 _mm256_cmpgt_epi16

// 32-bit integer arithmetic
#define add_epi32 _mm256_add_epi32
#define srli_epi32 _mm256_srli_epi32

// Single precision floating point
#define cvtepi32_ps _mm256_cvtepi32_ps
#define cvttps_epi32 _mm256_cvttps_epi32
#define mul_ps _mm256_mul_ps
#define sqrt_ps _mm256_sqrt_ps

/* AVX2 operates on 2 separate 128-bit lanes, so the 64-bit quarters of a */
/* result have to be reordered (0, 2, 1, 3) to obtain what a true 256-bit */
/* wide operation would have produced. */
inline __m256i FIXUP_LANES(__m256i x)
{
    return _mm256_permute4x64_epi64(x, _MM_SHUFFLE(3, 1, 2, 0));
}

// Stores: store_lo writes the low 128 bits of the lane-fixed vector,
// storeu_int writes the whole lane-fixed vector.
#define store_lo(x, y)                                                         \
    _mm_storeu_si128(reinterpret_cast<__m128i *>(x),                           \
                     _mm256_extracti128_si256(FIXUP_LANES(y), 0))
#define storeu_int(x, y)                                                       \
    _mm256_storeu_si256(reinterpret_cast<__m256i *>(x), FIXUP_LANES(y))

#else

#define DEST_ELTS 8

// Constants
#define setzero _mm_setzero_si128
#define set1_epi16 _mm_set1_epi16
#define set1_epi32 _mm_set1_epi32
#define set1_ps _mm_set1_ps

// Unaligned load
#define loadu_int(x) _mm_loadu_si128(reinterpret_cast<__m128i const *>(x))

// Widening / narrowing
#define unpacklo_epi8 _mm_unpacklo_epi8
#define unpackhi_epi8 _mm_unpackhi_epi8
#define packs_epi32 _mm_packs_epi32
#define packus_epi32 sse2_packus_epi32
#define packus_epi16 _mm_packus_epi16

// 16-bit integer arithmetic
#define add_epi16 _mm_add_epi16
#define sub_epi16 _mm_sub_epi16
#define mullo_epi16 _mm_mullo_epi16
#define madd_epi16 _mm_madd_epi16
#define hadd_epi16 sse2_hadd_epi16
#define srli_epi16 _mm_srli_epi16
#define cmpgt_epi16 _mm_cmpgt_epi16

// 32-bit integer arithmetic
#define add_epi32 _mm_add_epi32
#define srli_epi32 _mm_srli_epi32

// Single precision floating point
#define cvtepi32_ps _mm_cvtepi32_ps
#define cvttps_epi32 _mm_cvttps_epi32
#define mul_ps _mm_mul_ps
#define sqrt_ps _mm_sqrt_ps

// Stores: store_lo writes the low 64 bits, storeu_int the whole vector.
#define store_lo(x, y) _mm_storel_epi64(reinterpret_cast<__m128i *>(x), (y))
#define storeu_int(x, y) _mm_storeu_si128(reinterpret_cast<__m128i *>(x), (y))

#endif
450

451
template <class T>
452
static int
453
#if defined(__GNUC__)
454
    __attribute__((noinline))
455
#endif
456
    QuadraticMeanByteSSE2OrAVX2(int nDstXWidth, int nChunkXSize,
5,385✔
457
                                const T *&CPL_RESTRICT pSrcScanlineShiftedInOut,
458
                                T *CPL_RESTRICT pDstScanline)
459
{
460
    // Optimized implementation for RMS on Byte by
461
    // processing by group of 8 output pixels, so as to use
462
    // a single _mm_sqrt_ps() call for 4 output pixels
463
    const T *CPL_RESTRICT pSrcScanlineShifted = pSrcScanlineShiftedInOut;
5,385✔
464

465
    int iDstPixel = 0;
5,385✔
466
    const auto one16 = set1_epi16(1);
5,385✔
467
    const auto one32 = set1_epi32(1);
5,385✔
468
    const auto zero = setzero();
5,385✔
469
    const auto minus32768 = set1_epi16(-32768);
5,385✔
470

471
    for (; iDstPixel < nDstXWidth - (DEST_ELTS - 1); iDstPixel += DEST_ELTS)
521,496✔
472
    {
473
        // Load 2 * DEST_ELTS bytes from each line
474
        auto firstLine = loadu_int(pSrcScanlineShifted);
516,111✔
475
        auto secondLine = loadu_int(pSrcScanlineShifted + nChunkXSize);
1,032,220✔
476
        // Extend those Bytes as UInt16s
477
        auto firstLineLo = unpacklo_epi8(firstLine, zero);
516,111✔
478
        auto firstLineHi = unpackhi_epi8(firstLine, zero);
516,111✔
479
        auto secondLineLo = unpacklo_epi8(secondLine, zero);
516,111✔
480
        auto secondLineHi = unpackhi_epi8(secondLine, zero);
516,111✔
481

482
        // Multiplication of 16 bit values and horizontal
483
        // addition of 32 bit results
484
        // [ src[2*i+0]^2 + src[2*i+1]^2 for i in range(4) ]
485
        firstLineLo = madd_epi16(firstLineLo, firstLineLo);
516,111✔
486
        firstLineHi = madd_epi16(firstLineHi, firstLineHi);
516,111✔
487
        secondLineLo = madd_epi16(secondLineLo, secondLineLo);
516,111✔
488
        secondLineHi = madd_epi16(secondLineHi, secondLineHi);
516,111✔
489

490
        // Vertical addition
491
        const auto sumSquaresLo = add_epi32(firstLineLo, secondLineLo);
516,111✔
492
        const auto sumSquaresHi = add_epi32(firstLineHi, secondLineHi);
516,111✔
493

494
        const auto sumSquaresPlusOneDiv4Lo =
495
            srli_epi32(add_epi32(sumSquaresLo, one32), 2);
1,032,220✔
496
        const auto sumSquaresPlusOneDiv4Hi =
497
            srli_epi32(add_epi32(sumSquaresHi, one32), 2);
1,032,220✔
498

499
        // Take square root and truncate/floor to int32
500
        const auto rmsLo =
501
            cvttps_epi32(sqrt_ps(cvtepi32_ps(sumSquaresPlusOneDiv4Lo)));
1,548,330✔
502
        const auto rmsHi =
503
            cvttps_epi32(sqrt_ps(cvtepi32_ps(sumSquaresPlusOneDiv4Hi)));
1,548,330✔
504

505
        // Merge back low and high registers with each RMS value
506
        // as a 16 bit value.
507
        auto rms = packs_epi32(rmsLo, rmsHi);
516,111✔
508

509
        // Round to upper value if it minimizes the
510
        // error |rms^2 - sumSquares/4|
511
        // if( 2 * (2 * rms * (rms + 1) + 1) < sumSquares )
512
        //    rms += 1;
513
        // which is equivalent to:
514
        // if( rms * (rms + 1) < (sumSquares+1) / 4 )
515
        //    rms += 1;
516
        // And both left and right parts fit on 16 (unsigned) bits
517
        const auto sumSquaresPlusOneDiv4 =
518
            packus_epi32(sumSquaresPlusOneDiv4Lo, sumSquaresPlusOneDiv4Hi);
516,111✔
519
        // cmpgt_epi16 operates on signed int16, but here
520
        // we have unsigned values, so shift them by -32768 before
521
        auto mask = cmpgt_epi16(
2,580,560✔
522
            add_epi16(sumSquaresPlusOneDiv4, minus32768),
523
            add_epi16(mullo_epi16(rms, add_epi16(rms, one16)), minus32768));
524
        // The value of the mask will be -1 when the correction needs to be
525
        // applied
526
        rms = sub_epi16(rms, mask);
516,111✔
527

528
        // Pack each 16 bit RMS value to 8 bits
529
        rms = packus_epi16(rms, rms /* could be anything */);
516,111✔
530
        store_lo(&pDstScanline[iDstPixel], rms);
516,111✔
531
        pSrcScanlineShifted += 2 * DEST_ELTS;
516,111✔
532
    }
533

534
    pSrcScanlineShiftedInOut = pSrcScanlineShifted;
5,385✔
535
    return iDstPixel;
5,385✔
536
}
537

538
/************************************************************************/
539
/*                      AverageByteSSE2OrAVX2()                         */
540
/************************************************************************/
541

542
template <class T>
543
static int
544
AverageByteSSE2OrAVX2(int nDstXWidth, int nChunkXSize,
111,280✔
545
                      const T *&CPL_RESTRICT pSrcScanlineShiftedInOut,
546
                      T *CPL_RESTRICT pDstScanline)
547
{
548
    // Optimized implementation for average on Byte by
549
    // processing by group of 16 output pixels for SSE2, or 32 for AVX2
550

551
    const auto zero = setzero();
111,280✔
552
    const auto two16 = set1_epi16(2);
111,280✔
553
    const T *CPL_RESTRICT pSrcScanlineShifted = pSrcScanlineShiftedInOut;
111,280✔
554

555
    int iDstPixel = 0;
111,280✔
556
    for (; iDstPixel < nDstXWidth - (2 * DEST_ELTS - 1);
2,445,050✔
557
         iDstPixel += 2 * DEST_ELTS)
558
    {
559
        decltype(setzero()) average0;
560
        {
561
            // Load 2 * DEST_ELTS bytes from each line
562
            const auto firstLine = loadu_int(pSrcScanlineShifted);
2,333,770✔
563
            const auto secondLine =
564
                loadu_int(pSrcScanlineShifted + nChunkXSize);
4,667,530✔
565
            // Extend those Bytes as UInt16s
566
            const auto firstLineLo = unpacklo_epi8(firstLine, zero);
2,333,770✔
567
            const auto firstLineHi = unpackhi_epi8(firstLine, zero);
2,333,770✔
568
            const auto secondLineLo = unpacklo_epi8(secondLine, zero);
2,333,770✔
569
            const auto secondLineHi = unpackhi_epi8(secondLine, zero);
2,333,770✔
570

571
            // Vertical addition
572
            const auto sumLo = add_epi16(firstLineLo, secondLineLo);
2,333,770✔
573
            const auto sumHi = add_epi16(firstLineHi, secondLineHi);
2,333,770✔
574

575
            // Horizontal addition of adjacent pairs, and recombine low and high
576
            // parts
577
            const auto sum = hadd_epi16(sumLo, sumHi);
2,333,770✔
578

579
            // average = (sum + 2) / 4
580
            average0 = srli_epi16(add_epi16(sum, two16), 2);
2,333,770✔
581

582
            pSrcScanlineShifted += 2 * DEST_ELTS;
2,333,770✔
583
        }
584

585
        decltype(setzero()) average1;
586
        {
587
            // Load 2 * DEST_ELTS bytes from each line
588
            const auto firstLine = loadu_int(pSrcScanlineShifted);
2,333,770✔
589
            const auto secondLine =
590
                loadu_int(pSrcScanlineShifted + nChunkXSize);
4,667,530✔
591
            // Extend those Bytes as UInt16s
592
            const auto firstLineLo = unpacklo_epi8(firstLine, zero);
2,333,770✔
593
            const auto firstLineHi = unpackhi_epi8(firstLine, zero);
2,333,770✔
594
            const auto secondLineLo = unpacklo_epi8(secondLine, zero);
2,333,770✔
595
            const auto secondLineHi = unpackhi_epi8(secondLine, zero);
2,333,770✔
596

597
            // Vertical addition
598
            const auto sumLo = add_epi16(firstLineLo, secondLineLo);
2,333,770✔
599
            const auto sumHi = add_epi16(firstLineHi, secondLineHi);
2,333,770✔
600

601
            // Horizontal addition of adjacent pairs, and recombine low and high
602
            // parts
603
            const auto sum = hadd_epi16(sumLo, sumHi);
2,333,770✔
604

605
            // average = (sum + 2) / 4
606
            average1 = srli_epi16(add_epi16(sum, two16), 2);
2,333,770✔
607

608
            pSrcScanlineShifted += 2 * DEST_ELTS;
2,333,770✔
609
        }
610

611
        // Pack each 16 bit average value to 8 bits
612
        const auto average = packus_epi16(average0, average1);
2,333,770✔
613
        storeu_int(&pDstScanline[iDstPixel], average);
2,333,770✔
614
    }
615

616
    pSrcScanlineShiftedInOut = pSrcScanlineShifted;
111,280✔
617
    return iDstPixel;
111,280✔
618
}
619

620
/************************************************************************/
621
/*                     QuadraticMeanUInt16SSE2()                        */
622
/************************************************************************/
623

624
#ifdef __SSE3__
625
#define sse2_hadd_pd _mm_hadd_pd
626
#else
627
inline __m128d sse2_hadd_pd(__m128d a, __m128d b)
8✔
628
{
629
    auto aLo_bLo =
630
        _mm_castps_pd(_mm_movelh_ps(_mm_castpd_ps(a), _mm_castpd_ps(b)));
32✔
631
    auto aHi_bHi =
632
        _mm_castps_pd(_mm_movehl_ps(_mm_castpd_ps(b), _mm_castpd_ps(a)));
32✔
633
    return _mm_add_pd(aLo_bLo, aHi_bHi);  // (aLo + aHi, bLo + bHi)
8✔
634
}
635
#endif
636

637
// Square each of the two double precision lanes of x.
inline __m128d SQUARE_PD(__m128d x)
{
    const __m128d squared = _mm_mul_pd(x, x);
    return squared;
}
641

642
#ifdef __AVX2__

// Square each of the four double precision lanes of x.
inline __m256d SQUARE_PD(__m256d x)
{
    const __m256d squared = _mm256_mul_pd(x, x);
    return squared;
}

// Reorder the four 64-bit elements of x as (0, 2, 1, 3).
inline __m256d FIXUP_LANES(__m256d x)
{
    return _mm256_permute4x64_pd(x, _MM_SHUFFLE(3, 1, 2, 0));
}

// Same reordering for a single precision vector, applied on its 64-bit
// elements through the double precision overload.
inline __m256 FIXUP_LANES(__m256 x)
{
    const __m256d asDoubles = _mm256_castps_pd(x);
    return _mm256_castpd_ps(FIXUP_LANES(asDoubles));
}

#endif
660

661
template <class T>
662
static int
663
QuadraticMeanUInt16SSE2(int nDstXWidth, int nChunkXSize,
10✔
664
                        const T *&CPL_RESTRICT pSrcScanlineShiftedInOut,
665
                        T *CPL_RESTRICT pDstScanline)
666
{
667
    // Optimized implementation for RMS on UInt16 by
668
    // processing by group of 4 output pixels.
669
    const T *CPL_RESTRICT pSrcScanlineShifted = pSrcScanlineShiftedInOut;
10✔
670

671
    int iDstPixel = 0;
10✔
672
    const auto zero = _mm_setzero_si128();
10✔
673

674
#ifdef __AVX2__
675
    const auto zeroDot25 = _mm256_set1_pd(0.25);
676
    const auto zeroDot5 = _mm256_set1_pd(0.5);
677

678
    // The first four 0's could be anything, as we only take the bottom
679
    // 128 bits.
680
    const auto permutation = _mm256_set_epi32(0, 0, 0, 0, 6, 4, 2, 0);
681
#else
682
    const auto zeroDot25 = _mm_set1_pd(0.25);
10✔
683
    const auto zeroDot5 = _mm_set1_pd(0.5);
10✔
684
#endif
685

686
    for (; iDstPixel < nDstXWidth - 3; iDstPixel += 4)
40✔
687
    {
688
        // Load 8 UInt16 from each line
689
        const auto firstLine = _mm_loadu_si128(
30✔
690
            reinterpret_cast<__m128i const *>(pSrcScanlineShifted));
691
        const auto secondLine =
692
            _mm_loadu_si128(reinterpret_cast<__m128i const *>(
30✔
693
                pSrcScanlineShifted + nChunkXSize));
30✔
694

695
        // Detect if all of the source values fit in 14 bits.
696
        // because if x < 2^14, then 4 * x^2 < 2^30 which fits in a signed int32
697
        // and we can do a much faster implementation.
698
        const auto maskTmp =
699
            _mm_srli_epi16(_mm_or_si128(firstLine, secondLine), 14);
60✔
700
#if defined(__i386__) || defined(_M_IX86)
701
        uint64_t nMaskFitsIn14Bits = 0;
702
        _mm_storel_epi64(
703
            reinterpret_cast<__m128i *>(&nMaskFitsIn14Bits),
704
            _mm_packus_epi16(maskTmp, maskTmp /* could be anything */));
705
#else
706
        const auto nMaskFitsIn14Bits = _mm_cvtsi128_si64(
30✔
707
            _mm_packus_epi16(maskTmp, maskTmp /* could be anything */));
708
#endif
709
        if (nMaskFitsIn14Bits == 0)
30✔
710
        {
711
            // Multiplication of 16 bit values and horizontal
712
            // addition of 32 bit results
713
            const auto firstLineHSumSquare =
714
                _mm_madd_epi16(firstLine, firstLine);
26✔
715
            const auto secondLineHSumSquare =
716
                _mm_madd_epi16(secondLine, secondLine);
26✔
717
            // Vertical addition
718
            const auto sumSquares =
719
                _mm_add_epi32(firstLineHSumSquare, secondLineHSumSquare);
26✔
720
            // In theory we should take sqrt(sumSquares * 0.25f)
721
            // but given the rounding we do, this is equivalent to
722
            // sqrt((sumSquares + 1)/4). This has been verified exhaustively for
723
            // sumSquares <= 4 * 16383^2
724
            const auto one32 = _mm_set1_epi32(1);
26✔
725
            const auto sumSquaresPlusOneDiv4 =
726
                _mm_srli_epi32(_mm_add_epi32(sumSquares, one32), 2);
52✔
727
            // Take square root and truncate/floor to int32
728
            auto rms = _mm_cvttps_epi32(
78✔
729
                _mm_sqrt_ps(_mm_cvtepi32_ps(sumSquaresPlusOneDiv4)));
730

731
            // Round to upper value if it minimizes the
732
            // error |rms^2 - sumSquares/4|
733
            // if( 2 * (2 * rms * (rms + 1) + 1) < sumSquares )
734
            //    rms += 1;
735
            // which is equivalent to:
736
            // if( rms * rms + rms < (sumSquares+1) / 4 )
737
            //    rms += 1;
738
            auto mask =
739
                _mm_cmpgt_epi32(sumSquaresPlusOneDiv4,
78✔
740
                                _mm_add_epi32(_mm_madd_epi16(rms, rms), rms));
741
            rms = _mm_sub_epi32(rms, mask);
26✔
742
            // Pack each 32 bit RMS value to 16 bits
743
            rms = _mm_packs_epi32(rms, rms /* could be anything */);
26✔
744
            _mm_storel_epi64(
745
                reinterpret_cast<__m128i *>(&pDstScanline[iDstPixel]), rms);
26✔
746
            pSrcScanlineShifted += 8;
26✔
747
            continue;
26✔
748
        }
749

750
        // An approach using _mm_mullo_epi16, _mm_mulhi_epu16 before extending
751
        // to 32 bit would result in 4 multiplications instead of 8, but
752
        // mullo/mulhi have a worse throughput than mul_pd.
753

754
        // Extend those UInt16s as UInt32s
755
        const auto firstLineLo = _mm_unpacklo_epi16(firstLine, zero);
4✔
756
        const auto firstLineHi = _mm_unpackhi_epi16(firstLine, zero);
4✔
757
        const auto secondLineLo = _mm_unpacklo_epi16(secondLine, zero);
4✔
758
        const auto secondLineHi = _mm_unpackhi_epi16(secondLine, zero);
4✔
759

760
#ifdef __AVX2__
761
        // Multiplication of 32 bit values previously converted to 64 bit double
762
        const auto firstLineLoDbl = SQUARE_PD(_mm256_cvtepi32_pd(firstLineLo));
763
        const auto firstLineHiDbl = SQUARE_PD(_mm256_cvtepi32_pd(firstLineHi));
764
        const auto secondLineLoDbl =
765
            SQUARE_PD(_mm256_cvtepi32_pd(secondLineLo));
766
        const auto secondLineHiDbl =
767
            SQUARE_PD(_mm256_cvtepi32_pd(secondLineHi));
768

769
        // Vertical addition of squares
770
        const auto sumSquaresLo =
771
            _mm256_add_pd(firstLineLoDbl, secondLineLoDbl);
772
        const auto sumSquaresHi =
773
            _mm256_add_pd(firstLineHiDbl, secondLineHiDbl);
774

775
        // Horizontal addition of squares
776
        const auto sumSquares =
777
            FIXUP_LANES(_mm256_hadd_pd(sumSquaresLo, sumSquaresHi));
778

779
        const auto sumDivWeight = _mm256_mul_pd(sumSquares, zeroDot25);
780

781
        // Take square root and truncate/floor to int32
782
        auto rms = _mm256_cvttpd_epi32(_mm256_sqrt_pd(sumDivWeight));
783
        const auto rmsDouble = _mm256_cvtepi32_pd(rms);
784
        const auto right = _mm256_sub_pd(
785
            sumDivWeight, _mm256_add_pd(SQUARE_PD(rmsDouble), rmsDouble));
786

787
        auto mask =
788
            _mm256_castpd_ps(_mm256_cmp_pd(zeroDot5, right, _CMP_LT_OS));
789
        // Extract 32-bit from each of the 4 64-bit masks
790
        // mask = FIXUP_LANES(_mm256_shuffle_ps(mask, mask,
791
        // _MM_SHUFFLE(2,0,2,0)));
792
        mask = _mm256_permutevar8x32_ps(mask, permutation);
793
        const auto maskI = _mm_castps_si128(_mm256_extractf128_ps(mask, 0));
794

795
        // Apply the correction
796
        rms = _mm_sub_epi32(rms, maskI);
797

798
        // Pack each 32 bit RMS value to 16 bits
799
        rms = _mm_packus_epi32(rms, rms /* could be anything */);
800
#else
801
        // Multiplication of 32 bit values previously converted to 64 bit double
802
        const auto firstLineLoLo = SQUARE_PD(_mm_cvtepi32_pd(firstLineLo));
4✔
803
        const auto firstLineLoHi =
804
            SQUARE_PD(_mm_cvtepi32_pd(_mm_srli_si128(firstLineLo, 8)));
8✔
805
        const auto firstLineHiLo = SQUARE_PD(_mm_cvtepi32_pd(firstLineHi));
4✔
806
        const auto firstLineHiHi =
807
            SQUARE_PD(_mm_cvtepi32_pd(_mm_srli_si128(firstLineHi, 8)));
8✔
808

809
        const auto secondLineLoLo = SQUARE_PD(_mm_cvtepi32_pd(secondLineLo));
4✔
810
        const auto secondLineLoHi =
811
            SQUARE_PD(_mm_cvtepi32_pd(_mm_srli_si128(secondLineLo, 8)));
8✔
812
        const auto secondLineHiLo = SQUARE_PD(_mm_cvtepi32_pd(secondLineHi));
4✔
813
        const auto secondLineHiHi =
814
            SQUARE_PD(_mm_cvtepi32_pd(_mm_srli_si128(secondLineHi, 8)));
8✔
815

816
        // Vertical addition of squares
817
        const auto sumSquaresLoLo = _mm_add_pd(firstLineLoLo, secondLineLoLo);
4✔
818
        const auto sumSquaresLoHi = _mm_add_pd(firstLineLoHi, secondLineLoHi);
4✔
819
        const auto sumSquaresHiLo = _mm_add_pd(firstLineHiLo, secondLineHiLo);
4✔
820
        const auto sumSquaresHiHi = _mm_add_pd(firstLineHiHi, secondLineHiHi);
4✔
821

822
        // Horizontal addition of squares
823
        const auto sumSquaresLo = sse2_hadd_pd(sumSquaresLoLo, sumSquaresLoHi);
4✔
824
        const auto sumSquaresHi = sse2_hadd_pd(sumSquaresHiLo, sumSquaresHiHi);
4✔
825

826
        const auto sumDivWeightLo = _mm_mul_pd(sumSquaresLo, zeroDot25);
4✔
827
        const auto sumDivWeightHi = _mm_mul_pd(sumSquaresHi, zeroDot25);
4✔
828
        // Take square root and truncate/floor to int32
829
        const auto rmsLo = _mm_cvttpd_epi32(_mm_sqrt_pd(sumDivWeightLo));
8✔
830
        const auto rmsHi = _mm_cvttpd_epi32(_mm_sqrt_pd(sumDivWeightHi));
8✔
831

832
        // Correctly round rms to minimize | rms^2 - sumSquares / 4 |
833
        // if( 0.5 < sumDivWeight - (rms * rms + rms) )
834
        //     rms += 1;
835
        const auto rmsLoDouble = _mm_cvtepi32_pd(rmsLo);
4✔
836
        const auto rmsHiDouble = _mm_cvtepi32_pd(rmsHi);
4✔
837
        const auto rightLo = _mm_sub_pd(
8✔
838
            sumDivWeightLo, _mm_add_pd(SQUARE_PD(rmsLoDouble), rmsLoDouble));
839
        const auto rightHi = _mm_sub_pd(
12✔
840
            sumDivWeightHi, _mm_add_pd(SQUARE_PD(rmsHiDouble), rmsHiDouble));
841

842
        const auto maskLo = _mm_castpd_ps(_mm_cmplt_pd(zeroDot5, rightLo));
8✔
843
        const auto maskHi = _mm_castpd_ps(_mm_cmplt_pd(zeroDot5, rightHi));
4✔
844
        // The value of the mask will be -1 when the correction needs to be
845
        // applied
846
        const auto mask = _mm_castps_si128(_mm_shuffle_ps(
8✔
847
            maskLo, maskHi, (0 << 0) | (2 << 2) | (0 << 4) | (2 << 6)));
848

849
        auto rms = _mm_castps_si128(
16✔
850
            _mm_movelh_ps(_mm_castsi128_ps(rmsLo), _mm_castsi128_ps(rmsHi)));
851
        // Apply the correction
852
        rms = _mm_sub_epi32(rms, mask);
4✔
853

854
        // Pack each 32 bit RMS value to 16 bits
855
        rms = sse2_packus_epi32(rms, rms /* could be anything */);
4✔
856
#endif
857

858
        _mm_storel_epi64(reinterpret_cast<__m128i *>(&pDstScanline[iDstPixel]),
4✔
859
                         rms);
860
        pSrcScanlineShifted += 8;
4✔
861
    }
862

863
    pSrcScanlineShiftedInOut = pSrcScanlineShifted;
10✔
864
    return iDstPixel;
10✔
865
}
866

867
/************************************************************************/
868
/*                         AverageUInt16SSE2()                          */
869
/************************************************************************/
870

871
template <class T>
872
static int AverageUInt16SSE2(int nDstXWidth, int nChunkXSize,
9✔
873
                             const T *&CPL_RESTRICT pSrcScanlineShiftedInOut,
874
                             T *CPL_RESTRICT pDstScanline)
875
{
876
    // Optimized implementation for average on UInt16 by
877
    // processing by group of 8 output pixels.
878

879
    const auto mask = _mm_set1_epi32(0xFFFF);
9✔
880
    const auto two = _mm_set1_epi32(2);
9✔
881
    const T *CPL_RESTRICT pSrcScanlineShifted = pSrcScanlineShiftedInOut;
9✔
882

883
    int iDstPixel = 0;
9✔
884
    for (; iDstPixel < nDstXWidth - 7; iDstPixel += 8)
13✔
885
    {
886
        __m128i averageLow;
887
        // Load 8 UInt16 from each line
888
        {
889
            const auto firstLine = _mm_loadu_si128(
4✔
890
                reinterpret_cast<__m128i const *>(pSrcScanlineShifted));
891
            const auto secondLine =
892
                _mm_loadu_si128(reinterpret_cast<__m128i const *>(
4✔
893
                    pSrcScanlineShifted + nChunkXSize));
4✔
894

895
            // Horizontal addition and extension to 32 bit
896
            const auto horizAddFirstLine = _mm_add_epi32(
12✔
897
                _mm_and_si128(firstLine, mask), _mm_srli_epi32(firstLine, 16));
898
            const auto horizAddSecondLine =
899
                _mm_add_epi32(_mm_and_si128(secondLine, mask),
12✔
900
                              _mm_srli_epi32(secondLine, 16));
901

902
            // Vertical addition and average computation
903
            // average = (sum + 2) >> 2
904
            const auto sum = _mm_add_epi32(
8✔
905
                _mm_add_epi32(horizAddFirstLine, horizAddSecondLine), two);
906
            averageLow = _mm_srli_epi32(sum, 2);
4✔
907
        }
908
        // Load 8 UInt16 from each line
909
        __m128i averageHigh;
910
        {
911
            const auto firstLine = _mm_loadu_si128(
4✔
912
                reinterpret_cast<__m128i const *>(pSrcScanlineShifted + 8));
4✔
913
            const auto secondLine =
914
                _mm_loadu_si128(reinterpret_cast<__m128i const *>(
4✔
915
                    pSrcScanlineShifted + 8 + nChunkXSize));
4✔
916

917
            // Horizontal addition and extension to 32 bit
918
            const auto horizAddFirstLine = _mm_add_epi32(
12✔
919
                _mm_and_si128(firstLine, mask), _mm_srli_epi32(firstLine, 16));
920
            const auto horizAddSecondLine =
921
                _mm_add_epi32(_mm_and_si128(secondLine, mask),
12✔
922
                              _mm_srli_epi32(secondLine, 16));
923

924
            // Vertical addition and average computation
925
            // average = (sum + 2) >> 2
926
            const auto sum = _mm_add_epi32(
8✔
927
                _mm_add_epi32(horizAddFirstLine, horizAddSecondLine), two);
928
            averageHigh = _mm_srli_epi32(sum, 2);
4✔
929
        }
930

931
        // Pack each 32 bit average value to 16 bits
932
        auto average = sse2_packus_epi32(averageLow, averageHigh);
4✔
933
        _mm_storeu_si128(reinterpret_cast<__m128i *>(&pDstScanline[iDstPixel]),
4✔
934
                         average);
935
        pSrcScanlineShifted += 16;
4✔
936
    }
937

938
    pSrcScanlineShiftedInOut = pSrcScanlineShifted;
9✔
939
    return iDstPixel;
9✔
940
}
941

942
/************************************************************************/
943
/*                      QuadraticMeanFloatSSE2()                        */
944
/************************************************************************/
945

946
#ifdef __SSE3__
// SSE3 offers a native horizontal add: use it directly.
#define sse2_hadd_ps _mm_hadd_ps
#else
// SSE2-only substitute for _mm_hadd_ps().
// Returns (a0 + a1, a2 + a3, b0 + b1, b2 + b3).
inline __m128 sse2_hadd_ps(__m128 a, __m128 b)
{
    // Lanes (a0, a2, b0, b2)
    const __m128 evenLanes = _mm_shuffle_ps(a, b, _MM_SHUFFLE(2, 0, 2, 0));
    // Lanes (a1, a3, b1, b3)
    const __m128 oddLanes = _mm_shuffle_ps(a, b, _MM_SHUFFLE(3, 1, 3, 1));
    // Pairwise sum of neighbouring lanes
    return _mm_add_ps(evenLanes, oddLanes);
}
#endif
956

957
// Width-agnostic aliases for the single-precision intrinsics used by
// QuadraticMeanFloatSSE2(): they map to the 256-bit AVX variants when AVX2 is
// available, and to the 128-bit SSE variants otherwise.
#ifdef __AVX2__

// Number of Float32 lanes per vector register
#define RMS_FLOAT_ELTS 8

// Load / store / broadcast
#define loadu_ps _mm256_loadu_ps
#define storeu_ps _mm256_storeu_ps
#define set1_ps _mm256_set1_ps

// Arithmetic
#define add_ps _mm256_add_ps
#define hadd_ps _mm256_hadd_ps
#define mul_ps _mm256_mul_ps
#define div_ps _mm256_div_ps
#define sqrt_ps _mm256_sqrt_ps
#define max_ps _mm256_max_ps

// Comparison and bitwise logic
#define cmpeq_ps(x, y) _mm256_cmp_ps(x, y, _CMP_EQ_OQ)
#define and_ps _mm256_and_ps
#define andnot_ps _mm256_andnot_ps
#define or_ps _mm256_or_ps

// Lane rearrangement
#define shuffle_ps _mm256_shuffle_ps
#define unpacklo_ps _mm256_unpacklo_ps
#define unpackhi_ps _mm256_unpackhi_ps

// Lane-wise square of 8 floats
inline __m256 SQUARE_PS(__m256 x)
{
    return _mm256_mul_ps(x, x);
}

#else

// Number of Float32 lanes per vector register
#define RMS_FLOAT_ELTS 4

// Load / store / broadcast
#define loadu_ps _mm_loadu_ps
#define storeu_ps _mm_storeu_ps
#define set1_ps _mm_set1_ps

// Arithmetic
#define add_ps _mm_add_ps
#define hadd_ps sse2_hadd_ps
#define mul_ps _mm_mul_ps
#define div_ps _mm_div_ps
#define sqrt_ps _mm_sqrt_ps
#define max_ps _mm_max_ps

// Comparison and bitwise logic
#define cmpeq_ps _mm_cmpeq_ps
#define and_ps _mm_and_ps
#define andnot_ps _mm_andnot_ps
#define or_ps _mm_or_ps

// Lane rearrangement
#define shuffle_ps _mm_shuffle_ps
#define unpacklo_ps _mm_unpacklo_ps
#define unpackhi_ps _mm_unpackhi_ps

// Lane-wise square of 4 floats
inline __m128 SQUARE_PS(__m128 x)
{
    return _mm_mul_ps(x, x);
}

// With 128-bit registers shuffles do not cross lanes, so there is nothing to
// fix up: identity.
inline __m128 FIXUP_LANES(__m128 x)
{
    return x;
}

#endif
1012

1013
static int
1014
#if defined(__GNUC__)
1015
    __attribute__((noinline))
1016
#endif
1017
    QuadraticMeanFloatSSE2(int nDstXWidth, int nChunkXSize,
34✔
1018
                           const float *&CPL_RESTRICT pSrcScanlineShiftedInOut,
1019
                           float *CPL_RESTRICT pDstScanline)
1020
{
1021
    // Optimized implementation for RMS on Float32 by
1022
    // processing by group of RMS_FLOAT_ELTS output pixels.
1023
    const float *CPL_RESTRICT pSrcScanlineShifted = pSrcScanlineShiftedInOut;
34✔
1024

1025
    int iDstPixel = 0;
34✔
1026
    const auto minus_zero = set1_ps(-0.0f);
34✔
1027
    const auto zeroDot25 = set1_ps(0.25f);
34✔
1028
    const auto one = set1_ps(1.0f);
34✔
1029
    const auto infv = set1_ps(std::numeric_limits<float>::infinity());
68✔
1030

1031
    for (; iDstPixel < nDstXWidth - (RMS_FLOAT_ELTS - 1);
102✔
1032
         iDstPixel += RMS_FLOAT_ELTS)
68✔
1033
    {
1034
        // Load 2*RMS_FLOAT_ELTS Float32 from each line
1035
        auto firstLineLo = loadu_ps(pSrcScanlineShifted);
68✔
1036
        auto firstLineHi = loadu_ps(pSrcScanlineShifted + RMS_FLOAT_ELTS);
68✔
1037
        auto secondLineLo = loadu_ps(pSrcScanlineShifted + nChunkXSize);
68✔
1038
        auto secondLineHi =
1039
            loadu_ps(pSrcScanlineShifted + RMS_FLOAT_ELTS + nChunkXSize);
136✔
1040

1041
        // Take the absolute value
1042
        firstLineLo = andnot_ps(minus_zero, firstLineLo);
68✔
1043
        firstLineHi = andnot_ps(minus_zero, firstLineHi);
68✔
1044
        secondLineLo = andnot_ps(minus_zero, secondLineLo);
68✔
1045
        secondLineHi = andnot_ps(minus_zero, secondLineHi);
68✔
1046

1047
        auto firstLineEven =
1048
            shuffle_ps(firstLineLo, firstLineHi, _MM_SHUFFLE(2, 0, 2, 0));
68✔
1049
        auto firstLineOdd =
1050
            shuffle_ps(firstLineLo, firstLineHi, _MM_SHUFFLE(3, 1, 3, 1));
68✔
1051
        auto secondLineEven =
1052
            shuffle_ps(secondLineLo, secondLineHi, _MM_SHUFFLE(2, 0, 2, 0));
68✔
1053
        auto secondLineOdd =
1054
            shuffle_ps(secondLineLo, secondLineHi, _MM_SHUFFLE(3, 1, 3, 1));
68✔
1055

1056
        // Compute the maximum of each RMS_FLOAT_ELTS value to RMS-average
1057
        const auto maxV = max_ps(max_ps(firstLineEven, firstLineOdd),
204✔
1058
                                 max_ps(secondLineEven, secondLineEven));
1059

1060
        // Normalize each value by the maximum of the RMS_FLOAT_ELTS ones.
1061
        // This step is important to avoid that the square evaluates to infinity
1062
        // for sufficiently big input.
1063
        auto invMax = div_ps(one, maxV);
68✔
1064
        // Deal with 0 being the maximum to correct division by zero
1065
        // note: comparing to -0 leads to identical results as to comparing with
1066
        // 0
1067
        invMax = andnot_ps(cmpeq_ps(maxV, minus_zero), invMax);
136✔
1068

1069
        firstLineEven = mul_ps(firstLineEven, invMax);
68✔
1070
        firstLineOdd = mul_ps(firstLineOdd, invMax);
68✔
1071
        secondLineEven = mul_ps(secondLineEven, invMax);
68✔
1072
        secondLineOdd = mul_ps(secondLineOdd, invMax);
68✔
1073

1074
        // Compute squares
1075
        firstLineEven = SQUARE_PS(firstLineEven);
68✔
1076
        firstLineOdd = SQUARE_PS(firstLineOdd);
68✔
1077
        secondLineEven = SQUARE_PS(secondLineEven);
68✔
1078
        secondLineOdd = SQUARE_PS(secondLineOdd);
68✔
1079

1080
        const auto sumSquares = add_ps(add_ps(firstLineEven, firstLineOdd),
204✔
1081
                                       add_ps(secondLineEven, secondLineOdd));
1082

1083
        auto rms = mul_ps(maxV, sqrt_ps(mul_ps(sumSquares, zeroDot25)));
204✔
1084

1085
        // Deal with infinity being the maximum
1086
        const auto maskIsInf = cmpeq_ps(maxV, infv);
68✔
1087
        rms = or_ps(andnot_ps(maskIsInf, rms), and_ps(maskIsInf, infv));
136✔
1088

1089
        rms = FIXUP_LANES(rms);
68✔
1090

1091
        storeu_ps(&pDstScanline[iDstPixel], rms);
68✔
1092
        pSrcScanlineShifted += RMS_FLOAT_ELTS * 2;
68✔
1093
    }
1094

1095
    pSrcScanlineShiftedInOut = pSrcScanlineShifted;
34✔
1096
    return iDstPixel;
34✔
1097
}
1098

1099
/************************************************************************/
1100
/*                        AverageFloatSSE2()                            */
1101
/************************************************************************/
1102

1103
static int AverageFloatSSE2(int nDstXWidth, int nChunkXSize,
14✔
1104
                            const float *&CPL_RESTRICT pSrcScanlineShiftedInOut,
1105
                            float *CPL_RESTRICT pDstScanline)
1106
{
1107
    // Optimized implementation for average on Float32 by
1108
    // processing by group of 4 output pixels.
1109
    const float *CPL_RESTRICT pSrcScanlineShifted = pSrcScanlineShiftedInOut;
14✔
1110

1111
    int iDstPixel = 0;
14✔
1112
    const auto zeroDot25 = _mm_set1_ps(0.25f);
14✔
1113

1114
    for (; iDstPixel < nDstXWidth - 3; iDstPixel += 4)
32✔
1115
    {
1116
        // Load 8 Float32 from each line
1117
        const auto firstLineLo = _mm_loadu_ps(pSrcScanlineShifted);
18✔
1118
        const auto firstLineHi = _mm_loadu_ps(pSrcScanlineShifted + 4);
18✔
1119
        const auto secondLineLo =
1120
            _mm_loadu_ps(pSrcScanlineShifted + nChunkXSize);
18✔
1121
        const auto secondLineHi =
1122
            _mm_loadu_ps(pSrcScanlineShifted + 4 + nChunkXSize);
36✔
1123

1124
        // Vertical addition
1125
        const auto sumLo = _mm_add_ps(firstLineLo, secondLineLo);
18✔
1126
        const auto sumHi = _mm_add_ps(firstLineHi, secondLineHi);
18✔
1127

1128
        // Horizontal addition
1129
        const auto sum = sse2_hadd_ps(sumLo, sumHi);
18✔
1130

1131
        const auto average = _mm_mul_ps(sum, zeroDot25);
18✔
1132

1133
        _mm_storeu_ps(&pDstScanline[iDstPixel], average);
18✔
1134
        pSrcScanlineShifted += 8;
18✔
1135
    }
1136

1137
    pSrcScanlineShiftedInOut = pSrcScanlineShifted;
14✔
1138
    return iDstPixel;
14✔
1139
}
1140

1141
#endif
1142

1143
/************************************************************************/
1144
/*                    GDALResampleChunk_AverageOrRMS()                  */
1145
/************************************************************************/
1146

1147
template <class T, class Tsum, GDALDataType eWrkDataType>
1148
static CPLErr
1149
GDALResampleChunk_AverageOrRMS_T(const GDALOverviewResampleArgs &args,
2,319✔
1150
                                 const T *pChunk, void **ppDstBuffer)
1151
{
1152
    const double dfXRatioDstToSrc = args.dfXRatioDstToSrc;
2,319✔
1153
    const double dfYRatioDstToSrc = args.dfYRatioDstToSrc;
2,319✔
1154
    const double dfSrcXDelta = args.dfSrcXDelta;
2,319✔
1155
    const double dfSrcYDelta = args.dfSrcYDelta;
2,319✔
1156
    const GByte *pabyChunkNodataMask = args.pabyChunkNodataMask;
2,319✔
1157
    const int nChunkXOff = args.nChunkXOff;
2,319✔
1158
    const int nChunkYOff = args.nChunkYOff;
2,319✔
1159
    const int nChunkXSize = args.nChunkXSize;
2,319✔
1160
    const int nChunkYSize = args.nChunkYSize;
2,319✔
1161
    const int nDstXOff = args.nDstXOff;
2,319✔
1162
    const int nDstXOff2 = args.nDstXOff2;
2,319✔
1163
    const int nDstYOff = args.nDstYOff;
2,319✔
1164
    const int nDstYOff2 = args.nDstYOff2;
2,319✔
1165
    const char *pszResampling = args.pszResampling;
2,319✔
1166
    bool bHasNoData = args.bHasNoData;
2,319✔
1167
    const double dfNoDataValue = args.dfNoDataValue;
2,319✔
1168
    const GDALColorTable *poColorTable = args.poColorTable;
2,319✔
1169
    const bool bPropagateNoData = args.bPropagateNoData;
2,319✔
1170

1171
    // AVERAGE_BIT2GRAYSCALE
1172
    const bool bBit2Grayscale =
1173
        CPL_TO_BOOL(STARTS_WITH_CI(pszResampling, "AVERAGE_BIT2G"));
2,319✔
1174
    const bool bQuadraticMean = CPL_TO_BOOL(EQUAL(pszResampling, "RMS"));
2,319✔
1175
    if (bBit2Grayscale)
2,319✔
1176
        poColorTable = nullptr;
9✔
1177

1178
    T tNoDataValue;
1179
    if (!bHasNoData)
2,319✔
1180
        tNoDataValue = 0;
2,263✔
1181
    else
1182
        tNoDataValue = static_cast<T>(dfNoDataValue);
56✔
1183
    const T tReplacementVal =
2,319✔
1184
        bHasNoData ? static_cast<T>(GDALGetNoDataReplacementValue(
114✔
1185
                         args.eOvrDataType, dfNoDataValue))
56✔
1186
                   : 0;
1187

1188
    int nChunkRightXOff = nChunkXOff + nChunkXSize;
2,319✔
1189
    int nChunkBottomYOff = nChunkYOff + nChunkYSize;
2,319✔
1190
    int nDstXWidth = nDstXOff2 - nDstXOff;
2,319✔
1191

1192
    /* -------------------------------------------------------------------- */
1193
    /*      Allocate buffers.                                               */
1194
    /* -------------------------------------------------------------------- */
1195
    *ppDstBuffer = static_cast<T *>(
2,319✔
1196
        VSI_MALLOC3_VERBOSE(nDstXWidth, nDstYOff2 - nDstYOff,
2,319✔
1197
                            GDALGetDataTypeSizeBytes(eWrkDataType)));
1198
    if (*ppDstBuffer == nullptr)
2,319✔
1199
    {
1200
        return CE_Failure;
×
1201
    }
1202
    T *const pDstBuffer = static_cast<T *>(*ppDstBuffer);
2,319✔
1203

1204
    struct PrecomputedXValue
1205
    {
1206
        int nLeftXOffShifted;
1207
        int nRightXOffShifted;
1208
        double dfLeftWeight;
1209
        double dfRightWeight;
1210
        double dfTotalWeightFullLine;
1211
    };
1212

1213
    PrecomputedXValue *pasSrcX = static_cast<PrecomputedXValue *>(
1214
        VSI_MALLOC2_VERBOSE(nDstXWidth, sizeof(PrecomputedXValue)));
2,319✔
1215

1216
    if (pasSrcX == nullptr)
2,319✔
1217
    {
1218
        return CE_Failure;
×
1219
    }
1220

1221
    int nTransparentIdx = -1;
2,319✔
1222
    std::vector<GDALColorEntry> colorEntries;
2,319✔
1223
    if (poColorTable)
2,319✔
1224
        colorEntries = ReadColorTable(*poColorTable, nTransparentIdx);
5✔
1225

1226
    // Force c4 of nodata entry to 0 so that GDALFindBestEntry() identifies
1227
    // it as nodata value
1228
    if (bHasNoData && dfNoDataValue >= 0.0f &&
2,349✔
1229
        tNoDataValue < colorEntries.size())
30✔
1230
        colorEntries[static_cast<int>(tNoDataValue)].c4 = 0;
1✔
1231

1232
    // Or if we have no explicit nodata, but a color table entry that is
1233
    // transparent, consider it as the nodata value
1234
    else if (!bHasNoData && nTransparentIdx >= 0)
2,318✔
1235
    {
1236
        bHasNoData = true;
×
1237
        tNoDataValue = static_cast<T>(nTransparentIdx);
×
1238
    }
1239

1240
    /* ==================================================================== */
1241
    /*      Precompute inner loop constants.                                */
1242
    /* ==================================================================== */
1243
    bool bSrcXSpacingIsTwo = true;
2,319✔
1244
    int nLastSrcXOff2 = -1;
2,319✔
1245
    for (int iDstPixel = nDstXOff; iDstPixel < nDstXOff2; ++iDstPixel)
852,323✔
1246
    {
1247
        double dfSrcXOff = dfSrcXDelta + iDstPixel * dfXRatioDstToSrc;
850,004✔
1248
        // Apply some epsilon to avoid numerical precision issues
1249
        int nSrcXOff = static_cast<int>(dfSrcXOff + 1e-8);
850,004✔
1250
        double dfSrcXOff2 = dfSrcXDelta + (iDstPixel + 1) * dfXRatioDstToSrc;
850,004✔
1251
        int nSrcXOff2 = static_cast<int>(ceil(dfSrcXOff2 - 1e-8));
850,004✔
1252

1253
        if (nSrcXOff < nChunkXOff)
850,004✔
1254
            nSrcXOff = nChunkXOff;
×
1255
        if (nSrcXOff2 == nSrcXOff)
850,004✔
1256
            nSrcXOff2++;
×
1257
        if (nSrcXOff2 > nChunkRightXOff)
850,004✔
1258
            nSrcXOff2 = nChunkRightXOff;
1✔
1259

1260
        pasSrcX[iDstPixel - nDstXOff].nLeftXOffShifted = nSrcXOff - nChunkXOff;
850,004✔
1261
        pasSrcX[iDstPixel - nDstXOff].nRightXOffShifted =
850,004✔
1262
            nSrcXOff2 - nChunkXOff;
850,004✔
1263
        pasSrcX[iDstPixel - nDstXOff].dfLeftWeight =
21✔
1264
            (nSrcXOff2 == nSrcXOff + 1) ? 1.0 : 1 - (dfSrcXOff - nSrcXOff);
850,004✔
1265
        pasSrcX[iDstPixel - nDstXOff].dfRightWeight =
850,004✔
1266
            1 - (nSrcXOff2 - dfSrcXOff2);
850,004✔
1267
        pasSrcX[iDstPixel - nDstXOff].dfTotalWeightFullLine =
850,004✔
1268
            pasSrcX[iDstPixel - nDstXOff].dfLeftWeight;
850,004✔
1269
        if (nSrcXOff + 1 < nSrcXOff2)
850,004✔
1270
        {
1271
            pasSrcX[iDstPixel - nDstXOff].dfTotalWeightFullLine +=
849,982✔
1272
                nSrcXOff2 - nSrcXOff - 2;
849,982✔
1273
            pasSrcX[iDstPixel - nDstXOff].dfTotalWeightFullLine +=
849,982✔
1274
                pasSrcX[iDstPixel - nDstXOff].dfRightWeight;
849,982✔
1275
        }
1276

1277
        if (nSrcXOff2 - nSrcXOff != 2 ||
850,004✔
1278
            (nLastSrcXOff2 >= 0 && nLastSrcXOff2 != nSrcXOff))
728,595✔
1279
        {
1280
            bSrcXSpacingIsTwo = false;
120,599✔
1281
        }
1282
        nLastSrcXOff2 = nSrcXOff2;
850,004✔
1283
    }
1284

1285
    /* ==================================================================== */
1286
    /*      Loop over destination scanlines.                                */
1287
    /* ==================================================================== */
1288
    for (int iDstLine = nDstYOff; iDstLine < nDstYOff2; ++iDstLine)
721,820✔
1289
    {
1290
        double dfSrcYOff = dfSrcYDelta + iDstLine * dfYRatioDstToSrc;
719,501✔
1291
        int nSrcYOff = static_cast<int>(dfSrcYOff + 1e-8);
719,501✔
1292
        if (nSrcYOff < nChunkYOff)
719,501✔
1293
            nSrcYOff = nChunkYOff;
×
1294

1295
        double dfSrcYOff2 = dfSrcYDelta + (iDstLine + 1) * dfYRatioDstToSrc;
719,501✔
1296
        int nSrcYOff2 = static_cast<int>(ceil(dfSrcYOff2 - 1e-8));
719,501✔
1297
        if (nSrcYOff2 == nSrcYOff)
719,501✔
1298
            ++nSrcYOff2;
×
1299
        if (nSrcYOff2 > nChunkBottomYOff)
719,501✔
1300
            nSrcYOff2 = nChunkBottomYOff;
3✔
1301

1302
        T *const pDstScanline =
719,501✔
1303
            pDstBuffer + static_cast<size_t>(iDstLine - nDstYOff) * nDstXWidth;
719,501✔
1304

1305
        /* --------------------------------------------------------------------
1306
         */
1307
        /*      Loop over destination pixels */
1308
        /* --------------------------------------------------------------------
1309
         */
1310
        if (poColorTable == nullptr)
719,501✔
1311
        {
1312
            if (bSrcXSpacingIsTwo && nSrcYOff2 == nSrcYOff + 2 &&
719,386✔
1313
                pabyChunkNodataMask == nullptr)
1314
            {
1315
                if constexpr (eWrkDataType == GDT_Byte ||
1316
                              eWrkDataType == GDT_UInt16)
1317
                {
1318
                    // Optimized case : no nodata, overview by a factor of 2 and
1319
                    // regular x and y src spacing.
1320
                    const T *pSrcScanlineShifted =
116,684✔
1321
                        pChunk + pasSrcX[0].nLeftXOffShifted +
116,684✔
1322
                        static_cast<size_t>(nSrcYOff - nChunkYOff) *
116,684✔
1323
                            nChunkXSize;
116,684✔
1324
                    int iDstPixel = 0;
116,684✔
1325
#ifdef USE_SSE2
1326
                    if constexpr (eWrkDataType == GDT_Byte)
1327
                    {
1328
                        if (bQuadraticMean)
116,665✔
1329
                        {
1330
                            iDstPixel = QuadraticMeanByteSSE2OrAVX2(
5,385✔
1331
                                nDstXWidth, nChunkXSize, pSrcScanlineShifted,
1332
                                pDstScanline);
1333
                        }
1334
                        else
1335
                        {
1336
                            iDstPixel = AverageByteSSE2OrAVX2(
111,280✔
1337
                                nDstXWidth, nChunkXSize, pSrcScanlineShifted,
1338
                                pDstScanline);
1339
                        }
1340
                    }
1341
                    else
1342
                    {
1343
                        static_assert(eWrkDataType == GDT_UInt16);
1344
                        if (bQuadraticMean)
19✔
1345
                        {
1346
                            iDstPixel = QuadraticMeanUInt16SSE2(
10✔
1347
                                nDstXWidth, nChunkXSize, pSrcScanlineShifted,
1348
                                pDstScanline);
1349
                        }
1350
                        else
1351
                        {
1352
                            iDstPixel = AverageUInt16SSE2(
9✔
1353
                                nDstXWidth, nChunkXSize, pSrcScanlineShifted,
1354
                                pDstScanline);
1355
                        }
1356
                    }
1357
#endif
1358
                    for (; iDstPixel < nDstXWidth; ++iDstPixel)
291,091✔
1359
                    {
1360
                        Tsum nTotal = 0;
174,407✔
1361
                        T nVal;
1362
                        if (bQuadraticMean)
174,407✔
1363
                            nTotal =
44✔
1364
                                SQUARE<Tsum>(pSrcScanlineShifted[0]) +
44✔
1365
                                SQUARE<Tsum>(pSrcScanlineShifted[1]) +
44✔
1366
                                SQUARE<Tsum>(pSrcScanlineShifted[nChunkXSize]) +
44✔
1367
                                SQUARE<Tsum>(
44✔
1368
                                    pSrcScanlineShifted[1 + nChunkXSize]);
44✔
1369
                        else
1370
                            nTotal = pSrcScanlineShifted[0] +
174,363✔
1371
                                     pSrcScanlineShifted[1] +
174,363✔
1372
                                     pSrcScanlineShifted[nChunkXSize] +
174,363✔
1373
                                     pSrcScanlineShifted[1 + nChunkXSize];
174,363✔
1374

1375
                        constexpr int nTotalWeight = 4;
174,407✔
1376
                        if (bQuadraticMean)
174,407✔
1377
                            nVal = ComputeIntegerRMS_4values<T>(nTotal);
44✔
1378
                        else
1379
                            nVal = static_cast<T>((nTotal + nTotalWeight / 2) /
174,363✔
1380
                                                  nTotalWeight);
1381

1382
                        // No need to compare nVal against tNoDataValue as we
1383
                        // are in a case where pabyChunkNodataMask == nullptr
1384
                        // implies the absence of nodata value.
1385
                        pDstScanline[iDstPixel] = nVal;
174,407✔
1386
                        pSrcScanlineShifted += 2;
174,407✔
1387
                    }
1388
                }
1389
                else
1390
                {
1391
                    static_assert(eWrkDataType == GDT_Float32 ||
1392
                                  eWrkDataType == GDT_Float64);
1393
                    const T *pSrcScanlineShifted =
70✔
1394
                        pChunk + pasSrcX[0].nLeftXOffShifted +
70✔
1395
                        static_cast<size_t>(nSrcYOff - nChunkYOff) *
70✔
1396
                            nChunkXSize;
70✔
1397
                    int iDstPixel = 0;
70✔
1398
#ifdef USE_SSE2
1399
                    if constexpr (eWrkDataType == GDT_Float32)
1400
                    {
1401
                        static_assert(std::is_same_v<T, float>);
1402
                        if (bQuadraticMean)
48✔
1403
                        {
1404
                            iDstPixel = QuadraticMeanFloatSSE2(
34✔
1405
                                nDstXWidth, nChunkXSize, pSrcScanlineShifted,
1406
                                pDstScanline);
1407
                        }
1408
                        else
1409
                        {
1410
                            iDstPixel = AverageFloatSSE2(
14✔
1411
                                nDstXWidth, nChunkXSize, pSrcScanlineShifted,
1412
                                pDstScanline);
1413
                        }
1414
                    }
1415
#endif
1416

1417
                    for (; iDstPixel < nDstXWidth; ++iDstPixel)
268✔
1418
                    {
1419
                        T nVal;
1420
                        if (bQuadraticMean)
198✔
1421
                        {
1422
                            // Cast to double to avoid overflows
1423
                            // (using std::hypot() is much slower)
1424
                            nVal = static_cast<T>(std::sqrt(
100✔
1425
                                0.25 *
1426
                                (SQUARE<double>(pSrcScanlineShifted[0]) +
100✔
1427
                                 SQUARE<double>(pSrcScanlineShifted[1]) +
100✔
1428
                                 SQUARE<double>(
100✔
1429
                                     pSrcScanlineShifted[nChunkXSize]) +
200✔
1430
                                 SQUARE<double>(
100✔
1431
                                     pSrcScanlineShifted[1 + nChunkXSize]))));
100✔
1432
                        }
1433
                        else
1434
                        {
1435
                            nVal = static_cast<T>(
98✔
1436
                                0.25f * (pSrcScanlineShifted[0] +
98✔
1437
                                         pSrcScanlineShifted[1] +
98✔
1438
                                         pSrcScanlineShifted[nChunkXSize] +
98✔
1439
                                         pSrcScanlineShifted[1 + nChunkXSize]));
98✔
1440
                        }
1441

1442
                        // No need to compare nVal against tNoDataValue as we
1443
                        // are in a case where pabyChunkNodataMask == nullptr
1444
                        // implies the absence of nodata value.
1445
                        pDstScanline[iDstPixel] = nVal;
198✔
1446
                        pSrcScanlineShifted += 2;
198✔
1447
                    }
1448
                }
116,754✔
1449
            }
1450
            else
1451
            {
1452
                const double dfBottomWeight =
17✔
1453
                    (nSrcYOff + 1 == nSrcYOff2) ? 1.0
602,632✔
1454
                                                : 1.0 - (dfSrcYOff - nSrcYOff);
602,615✔
1455
                const double dfTopWeight = 1.0 - (nSrcYOff2 - dfSrcYOff2);
602,632✔
1456
                nSrcYOff -= nChunkYOff;
602,632✔
1457
                nSrcYOff2 -= nChunkYOff;
602,632✔
1458

1459
                double dfTotalWeightFullColumn = dfBottomWeight;
602,632✔
1460
                if (nSrcYOff + 1 < nSrcYOff2)
602,632✔
1461
                {
1462
                    dfTotalWeightFullColumn += nSrcYOff2 - nSrcYOff - 2;
602,615✔
1463
                    dfTotalWeightFullColumn += dfTopWeight;
602,615✔
1464
                }
1465

1466
                for (int iDstPixel = 0; iDstPixel < nDstXWidth; ++iDstPixel)
18,756,560✔
1467
                {
1468
                    const int nSrcXOff = pasSrcX[iDstPixel].nLeftXOffShifted;
18,151,583✔
1469
                    const int nSrcXOff2 = pasSrcX[iDstPixel].nRightXOffShifted;
18,151,583✔
1470

1471
                    double dfTotal = 0;
18,151,583✔
1472
                    double dfTotalWeight = 0;
18,151,583✔
1473
                    if (pabyChunkNodataMask == nullptr)
18,151,583✔
1474
                    {
1475
                        auto pChunkShifted =
1,746,435✔
1476
                            pChunk +
115✔
1477
                            static_cast<size_t>(nSrcYOff) * nChunkXSize;
1,746,435✔
1478
                        int nCounterY = nSrcYOff2 - nSrcYOff - 1;
1,746,435✔
1479
                        double dfWeightY = dfBottomWeight;
1,746,435✔
1480
                        while (true)
3,493,427✔
1481
                        {
1482
                            double dfTotalLine;
1483
                            if (bQuadraticMean)
5,239,852✔
1484
                            {
1485
                                // Left pixel
1486
                                {
1487
                                    const T val = pChunkShifted[nSrcXOff];
104✔
1488
                                    dfTotalLine =
104✔
1489
                                        SQUARE<double>(val) *
104✔
1490
                                        pasSrcX[iDstPixel].dfLeftWeight;
104✔
1491
                                }
1492

1493
                                if (nSrcXOff + 1 < nSrcXOff2)
104✔
1494
                                {
1495
                                    // Middle pixels
1496
                                    for (int iX = nSrcXOff + 1;
104✔
1497
                                         iX < nSrcXOff2 - 1; ++iX)
424✔
1498
                                    {
1499
                                        const T val = pChunkShifted[iX];
320✔
1500
                                        dfTotalLine += SQUARE<double>(val);
320✔
1501
                                    }
1502

1503
                                    // Right pixel
1504
                                    {
1505
                                        const T val =
104✔
1506
                                            pChunkShifted[nSrcXOff2 - 1];
104✔
1507
                                        dfTotalLine +=
104✔
1508
                                            SQUARE<double>(val) *
104✔
1509
                                            pasSrcX[iDstPixel].dfRightWeight;
104✔
1510
                                    }
1511
                                }
1512
                            }
1513
                            else
1514
                            {
1515
                                // Left pixel
1516
                                {
1517
                                    const T val = pChunkShifted[nSrcXOff];
5,239,756✔
1518
                                    dfTotalLine =
5,239,756✔
1519
                                        val * pasSrcX[iDstPixel].dfLeftWeight;
5,239,756✔
1520
                                }
1521

1522
                                if (nSrcXOff + 1 < nSrcXOff2)
5,239,756✔
1523
                                {
1524
                                    // Middle pixels
1525
                                    for (int iX = nSrcXOff + 1;
4,239,330✔
1526
                                         iX < nSrcXOff2 - 1; ++iX)
64,183,126✔
1527
                                    {
1528
                                        const T val = pChunkShifted[iX];
59,943,836✔
1529
                                        dfTotalLine += val;
59,943,836✔
1530
                                    }
1531

1532
                                    // Right pixel
1533
                                    {
1534
                                        const T val =
4,239,330✔
1535
                                            pChunkShifted[nSrcXOff2 - 1];
4,239,330✔
1536
                                        dfTotalLine +=
4,239,330✔
1537
                                            val *
4,239,330✔
1538
                                            pasSrcX[iDstPixel].dfRightWeight;
4,239,330✔
1539
                                    }
1540
                                }
1541
                            }
1542

1543
                            dfTotal += dfTotalLine * dfWeightY;
5,239,852✔
1544
                            --nCounterY;
5,239,852✔
1545
                            if (nCounterY < 0)
5,239,852✔
1546
                                break;
1,746,435✔
1547
                            pChunkShifted += nChunkXSize;
3,493,427✔
1548
                            dfWeightY = (nCounterY == 0) ? dfTopWeight : 1.0;
3,493,427✔
1549
                        }
1550

1551
                        dfTotalWeight =
1,746,435✔
1552
                            pasSrcX[iDstPixel].dfTotalWeightFullLine *
1,746,435✔
1553
                            dfTotalWeightFullColumn;
1554
                    }
1555
                    else
1556
                    {
1557
                        size_t nCount = 0;
16,405,168✔
1558
                        for (int iY = nSrcYOff; iY < nSrcYOff2; ++iY)
71,771,104✔
1559
                        {
1560
                            const auto pChunkShifted =
55,364,536✔
1561
                                pChunk + static_cast<size_t>(iY) * nChunkXSize;
55,364,536✔
1562

1563
                            double dfTotalLine = 0;
55,364,536✔
1564
                            double dfTotalWeightLine = 0;
55,364,536✔
1565
                            // Left pixel
1566
                            {
1567
                                const int iX = nSrcXOff;
55,364,536✔
1568
                                const T val = pChunkShifted[iX];
55,364,536✔
1569
                                if (pabyChunkNodataMask
55,364,536✔
1570
                                        [iX +
55,364,536✔
1571
                                         static_cast<size_t>(iY) * nChunkXSize])
55,364,536✔
1572
                                {
1573
                                    nCount++;
23,514,683✔
1574
                                    const double dfWeightX =
23,514,683✔
1575
                                        pasSrcX[iDstPixel].dfLeftWeight;
23,514,683✔
1576
                                    dfTotalWeightLine = dfWeightX;
23,514,683✔
1577
                                    if (bQuadraticMean)
23,514,683✔
1578
                                        dfTotalLine =
60✔
1579
                                            SQUARE<double>(val) * dfWeightX;
60✔
1580
                                    else
1581
                                        dfTotalLine = val * dfWeightX;
23,514,683✔
1582
                                }
1583
                            }
1584

1585
                            if (nSrcXOff < nSrcXOff2 - 1)
55,364,536✔
1586
                            {
1587
                                // Middle pixels
1588
                                for (int iX = nSrcXOff + 1; iX < nSrcXOff2 - 1;
152,882,136✔
1589
                                     ++iX)
1590
                                {
1591
                                    const T val = pChunkShifted[iX];
97,517,000✔
1592
                                    if (pabyChunkNodataMask
97,517,000✔
1593
                                            [iX + static_cast<size_t>(iY) *
97,517,000✔
1594
                                                      nChunkXSize])
97,517,000✔
1595
                                    {
1596
                                        nCount++;
39,727,100✔
1597
                                        dfTotalWeightLine += 1;
39,727,100✔
1598
                                        if (bQuadraticMean)
39,727,100✔
1599
                                            dfTotalLine += SQUARE<double>(val);
×
1600
                                        else
1601
                                            dfTotalLine += val;
39,727,100✔
1602
                                    }
1603
                                }
1604

1605
                                // Right pixel
1606
                                {
1607
                                    const int iX = nSrcXOff2 - 1;
55,364,936✔
1608
                                    const T val = pChunkShifted[iX];
55,364,936✔
1609
                                    if (pabyChunkNodataMask
55,364,936✔
1610
                                            [iX + static_cast<size_t>(iY) *
55,364,936✔
1611
                                                      nChunkXSize])
55,364,936✔
1612
                                    {
1613
                                        nCount++;
23,514,751✔
1614
                                        const double dfWeightX =
23,514,751✔
1615
                                            pasSrcX[iDstPixel].dfRightWeight;
23,514,751✔
1616
                                        dfTotalWeightLine += dfWeightX;
23,514,751✔
1617
                                        if (bQuadraticMean)
23,514,751✔
1618
                                            dfTotalLine +=
1,139✔
1619
                                                SQUARE<double>(val) * dfWeightX;
61✔
1620
                                        else
1621
                                            dfTotalLine += val * dfWeightX;
23,514,750✔
1622
                                    }
1623
                                }
1624
                            }
1625

1626
                            const double dfWeightY =
94,329,704✔
1627
                                (iY == nSrcYOff)        ? dfBottomWeight
1628
                                : (iY + 1 == nSrcYOff2) ? dfTopWeight
38,963,768✔
1629
                                                        : 1.0;
1630
                            dfTotal += dfTotalLine * dfWeightY;
55,365,936✔
1631
                            dfTotalWeight += dfTotalWeightLine * dfWeightY;
55,365,936✔
1632
                        }
1633

1634
                        if (nCount == 0 ||
16,406,568✔
1635
                            (bPropagateNoData &&
8✔
1636
                             nCount <
1637
                                 static_cast<size_t>(nSrcYOff2 - nSrcYOff) *
8✔
1638
                                     (nSrcXOff2 - nSrcXOff)))
8✔
1639
                        {
1640
                            pDstScanline[iDstPixel] = tNoDataValue;
9,608,182✔
1641
                            continue;
9,608,182✔
1642
                        }
1643
                    }
1644
                    if constexpr (eWrkDataType == GDT_Byte)
1645
                    {
1646
                        T nVal;
1647
                        if (bQuadraticMean)
8,544,660✔
1648
                            nVal = ComputeIntegerRMS<T, int>(dfTotal,
38✔
1649
                                                             dfTotalWeight);
1650
                        else
1651
                            nVal =
8,544,620✔
1652
                                static_cast<T>(dfTotal / dfTotalWeight + 0.5);
8,544,620✔
1653
                        if (bHasNoData && nVal == tNoDataValue)
8,545,560✔
1654
                            nVal = tReplacementVal;
×
1655
                        pDstScanline[iDstPixel] = nVal;
8,545,560✔
1656
                    }
1657
                    else if constexpr (eWrkDataType == GDT_UInt16)
1658
                    {
1659
                        T nVal;
1660
                        if (bQuadraticMean)
8✔
1661
                            nVal = ComputeIntegerRMS<T, uint64_t>(
4✔
1662
                                dfTotal, dfTotalWeight);
1663
                        else
1664
                            nVal =
4✔
1665
                                static_cast<T>(dfTotal / dfTotalWeight + 0.5);
4✔
1666
                        if (bHasNoData && nVal == tNoDataValue)
8✔
1667
                            nVal = tReplacementVal;
×
1668
                        pDstScanline[iDstPixel] = nVal;
8✔
1669
                    }
1670
                    else
1671
                    {
1672
                        T nVal;
1673
                        if (bQuadraticMean)
153✔
1674
                            nVal =
20✔
1675
                                static_cast<T>(sqrt(dfTotal / dfTotalWeight));
25✔
1676
                        else
1677
                            nVal = static_cast<T>(dfTotal / dfTotalWeight);
128✔
1678
                        if (bHasNoData && nVal == tNoDataValue)
153✔
1679
                            nVal = tReplacementVal;
2✔
1680
                        pDstScanline[iDstPixel] = nVal;
153✔
1681
                    }
1682
                }
1683
            }
1684
        }
1685
        else
1686
        {
1687
            nSrcYOff -= nChunkYOff;
115✔
1688
            nSrcYOff2 -= nChunkYOff;
115✔
1689

1690
            for (int iDstPixel = 0; iDstPixel < nDstXWidth; ++iDstPixel)
4,256✔
1691
            {
1692
                const int nSrcXOff = pasSrcX[iDstPixel].nLeftXOffShifted;
6,475✔
1693
                const int nSrcXOff2 = pasSrcX[iDstPixel].nRightXOffShifted;
6,475✔
1694

1695
                uint64_t nTotalR = 0;
6,475✔
1696
                uint64_t nTotalG = 0;
6,475✔
1697
                uint64_t nTotalB = 0;
6,475✔
1698
                size_t nCount = 0;
6,475✔
1699

1700
                for (int iY = nSrcYOff; iY < nSrcYOff2; ++iY)
19,425✔
1701
                {
1702
                    for (int iX = nSrcXOff; iX < nSrcXOff2; ++iX)
38,850✔
1703
                    {
1704
                        const T val =
25,900✔
1705
                            pChunk[iX + static_cast<size_t>(iY) * nChunkXSize];
25,900✔
1706
                        // cppcheck-suppress unsignedLessThanZero
1707
                        if (val < 0 || val >= colorEntries.size())
25,900✔
1708
                            continue;
×
1709
                        const size_t idx = static_cast<size_t>(val);
25,900✔
1710
                        const auto &entry = colorEntries[idx];
25,900✔
1711
                        if (entry.c4)
25,900✔
1712
                        {
1713
                            if (bQuadraticMean)
14,128✔
1714
                            {
1715
                                nTotalR += SQUARE<int>(entry.c1);
800✔
1716
                                nTotalG += SQUARE<int>(entry.c2);
800✔
1717
                                nTotalB += SQUARE<int>(entry.c3);
800✔
1718
                                ++nCount;
800✔
1719
                            }
1720
                            else
1721
                            {
1722
                                nTotalR += entry.c1;
13,328✔
1723
                                nTotalG += entry.c2;
13,328✔
1724
                                nTotalB += entry.c3;
13,328✔
1725
                                ++nCount;
13,328✔
1726
                            }
1727
                        }
1728
                    }
1729
                }
1730

1731
                if (nCount == 0 ||
6,475✔
1732
                    (bPropagateNoData &&
×
1733
                     nCount < static_cast<size_t>(nSrcYOff2 - nSrcYOff) *
×
1734
                                  (nSrcXOff2 - nSrcXOff)))
×
1735
                {
1736
                    pDstScanline[iDstPixel] = tNoDataValue;
2,838✔
1737
                }
1738
                else
1739
                {
1740
                    GDALColorEntry color;
1741
                    if (bQuadraticMean)
3,637✔
1742
                    {
1743
                        color.c1 =
200✔
1744
                            static_cast<short>(sqrt(nTotalR / nCount) + 0.5);
200✔
1745
                        color.c2 =
200✔
1746
                            static_cast<short>(sqrt(nTotalG / nCount) + 0.5);
200✔
1747
                        color.c3 =
200✔
1748
                            static_cast<short>(sqrt(nTotalB / nCount) + 0.5);
200✔
1749
                    }
1750
                    else
1751
                    {
1752
                        color.c1 =
3,437✔
1753
                            static_cast<short>((nTotalR + nCount / 2) / nCount);
3,437✔
1754
                        color.c2 =
3,437✔
1755
                            static_cast<short>((nTotalG + nCount / 2) / nCount);
3,437✔
1756
                        color.c3 =
3,437✔
1757
                            static_cast<short>((nTotalB + nCount / 2) / nCount);
3,437✔
1758
                    }
1759
                    pDstScanline[iDstPixel] =
1,303✔
1760
                        static_cast<T>(BestColorEntry(colorEntries, color));
3,637✔
1761
                }
1762
            }
1763
        }
1764
    }
1765

1766
    CPLFree(pasSrcX);
2,319✔
1767

1768
    return CE_None;
2,319✔
1769
}
1770

1771
/** Dispatch AVERAGE / RMS chunk resampling on the working data type.
 *
 * Reports args.eWrkDataType through peDstBufferDataType, then forwards to
 * the GDALResampleChunk_AverageOrRMS_T instantiation matching that type.
 *
 * @param args                 Resampling arguments; eWrkDataType selects the
 *                             instantiation, and pszResampling selects the
 *                             accumulator type in the UInt16 case.
 * @param pChunk               Source chunk, reinterpreted as an array of the
 *                             working data type.
 * @param ppDstBuffer          Passed through to the typed implementation.
 * @param peDstBufferDataType  Receives args.eWrkDataType.
 * @return the result of the typed implementation, or CE_Failure (after
 *         CPLAssert(false)) when the working data type is none of Byte,
 *         UInt16, Float32, Float64.
 */
static CPLErr
GDALResampleChunk_AverageOrRMS(const GDALOverviewResampleArgs &args,
                               const void *pChunk, void **ppDstBuffer,
                               GDALDataType *peDstBufferDataType)
{
    const auto eWrkDataType = args.eWrkDataType;
    *peDstBufferDataType = eWrkDataType;

    if (eWrkDataType == GDT_Byte)
    {
        const auto *pabyChunk = static_cast<const GByte *>(pChunk);
        return GDALResampleChunk_AverageOrRMS_T<GByte, int, GDT_Byte>(
            args, pabyChunk, ppDstBuffer);
    }

    if (eWrkDataType == GDT_UInt16)
    {
        const auto *panChunk = static_cast<const GUInt16 *>(pChunk);
        if (!EQUAL(args.pszResampling, "RMS"))
        {
            return GDALResampleChunk_AverageOrRMS_T<GUInt16, GUInt32,
                                                    GDT_UInt16>(
                args, panChunk, ppDstBuffer);
        }
        // RMS: accumulate in double, because a UInt32 accumulator could
        // overflow.
        return GDALResampleChunk_AverageOrRMS_T<GUInt16, double, GDT_UInt16>(
            args, panChunk, ppDstBuffer);
    }

    if (eWrkDataType == GDT_Float32)
    {
        const auto *pafChunk = static_cast<const float *>(pChunk);
        return GDALResampleChunk_AverageOrRMS_T<float, double, GDT_Float32>(
            args, pafChunk, ppDstBuffer);
    }

    if (eWrkDataType == GDT_Float64)
    {
        const auto *padfChunk = static_cast<const double *>(pChunk);
        return GDALResampleChunk_AverageOrRMS_T<double, double, GDT_Float64>(
            args, padfChunk, ppDstBuffer);
    }

    // No instantiation exists for any other working data type.
    CPLAssert(false);
    return CE_Failure;
}
1822

1823
/************************************************************************/
1824
/*                     GDALResampleChunk_Gauss()                        */
1825
/************************************************************************/
1826

1827
static CPLErr GDALResampleChunk_Gauss(const GDALOverviewResampleArgs &args,
86✔
1828
                                      const void *pChunk, void **ppDstBuffer,
1829
                                      GDALDataType *peDstBufferDataType)
1830

1831
{
1832
    const double dfXRatioDstToSrc = args.dfXRatioDstToSrc;
86✔
1833
    const double dfYRatioDstToSrc = args.dfYRatioDstToSrc;
86✔
1834
    const GByte *pabyChunkNodataMask = args.pabyChunkNodataMask;
86✔
1835
    const int nChunkXOff = args.nChunkXOff;
86✔
1836
    const int nChunkXSize = args.nChunkXSize;
86✔
1837
    const int nChunkYOff = args.nChunkYOff;
86✔
1838
    const int nChunkYSize = args.nChunkYSize;
86✔
1839
    const int nDstXOff = args.nDstXOff;
86✔
1840
    const int nDstXOff2 = args.nDstXOff2;
86✔
1841
    const int nDstYOff = args.nDstYOff;
86✔
1842
    const int nDstYOff2 = args.nDstYOff2;
86✔
1843
    const bool bHasNoData = args.bHasNoData;
86✔
1844
    double dfNoDataValue = args.dfNoDataValue;
86✔
1845
    const GDALColorTable *poColorTable = args.poColorTable;
86✔
1846

1847
    const double *const padfChunk = static_cast<const double *>(pChunk);
86✔
1848

1849
    *ppDstBuffer =
86✔
1850
        VSI_MALLOC3_VERBOSE(nDstXOff2 - nDstXOff, nDstYOff2 - nDstYOff,
86✔
1851
                            GDALGetDataTypeSizeBytes(GDT_Float64));
1852
    if (*ppDstBuffer == nullptr)
86✔
1853
    {
1854
        return CE_Failure;
×
1855
    }
1856
    *peDstBufferDataType = GDT_Float64;
86✔
1857
    double *const padfDstBuffer = static_cast<double *>(*ppDstBuffer);
86✔
1858

1859
    /* -------------------------------------------------------------------- */
1860
    /*      Create the filter kernel and allocate scanline buffer.          */
1861
    /* -------------------------------------------------------------------- */
1862
    int nGaussMatrixDim = 3;
86✔
1863
    const int *panGaussMatrix;
1864
    constexpr int anGaussMatrix3x3[] = {1, 2, 1, 2, 4, 2, 1, 2, 1};
86✔
1865
    constexpr int anGaussMatrix5x5[] = {1,  4, 6,  4,  1,  4, 16, 24, 16,
86✔
1866
                                        4,  6, 24, 36, 24, 6, 4,  16, 24,
1867
                                        16, 4, 1,  4,  6,  4, 1};
1868
    constexpr int anGaussMatrix7x7[] = {
86✔
1869
        1,   6,  15, 20,  15,  6,   1,   6,  36, 90,  120, 90,  36,
1870
        6,   15, 90, 225, 300, 225, 90,  15, 20, 120, 300, 400, 300,
1871
        120, 20, 15, 90,  225, 300, 225, 90, 15, 6,   36,  90,  120,
1872
        90,  36, 6,  1,   6,   15,  20,  15, 6,  1};
1873

1874
    const int nOXSize = args.nOvrXSize;
86✔
1875
    const int nOYSize = args.nOvrYSize;
86✔
1876
    const int nResYFactor = static_cast<int>(0.5 + dfYRatioDstToSrc);
86✔
1877

1878
    // matrix for gauss filter
1879
    if (nResYFactor <= 2)
86✔
1880
    {
1881
        panGaussMatrix = anGaussMatrix3x3;
85✔
1882
        nGaussMatrixDim = 3;
85✔
1883
    }
1884
    else if (nResYFactor <= 4)
1✔
1885
    {
1886
        panGaussMatrix = anGaussMatrix5x5;
×
1887
        nGaussMatrixDim = 5;
×
1888
    }
1889
    else
1890
    {
1891
        panGaussMatrix = anGaussMatrix7x7;
1✔
1892
        nGaussMatrixDim = 7;
1✔
1893
    }
1894

1895
#ifdef DEBUG_OUT_OF_BOUND_ACCESS
1896
    int *panGaussMatrixDup = static_cast<int *>(
1897
        CPLMalloc(sizeof(int) * nGaussMatrixDim * nGaussMatrixDim));
1898
    memcpy(panGaussMatrixDup, panGaussMatrix,
1899
           sizeof(int) * nGaussMatrixDim * nGaussMatrixDim);
1900
    panGaussMatrix = panGaussMatrixDup;
1901
#endif
1902

1903
    if (!bHasNoData)
86✔
1904
        dfNoDataValue = 0.0;
79✔
1905

1906
    std::vector<GDALColorEntry> colorEntries;
86✔
1907
    int nTransparentIdx = -1;
86✔
1908
    if (poColorTable)
86✔
1909
        colorEntries = ReadColorTable(*poColorTable, nTransparentIdx);
2✔
1910

1911
    // Force c4 of nodata entry to 0 so that GDALFindBestEntry() identifies
1912
    // it as nodata value.
1913
    if (bHasNoData && dfNoDataValue >= 0.0f &&
92✔
1914
        dfNoDataValue < colorEntries.size())
6✔
1915
        colorEntries[static_cast<int>(dfNoDataValue)].c4 = 0;
×
1916

1917
    // Or if we have no explicit nodata, but a color table entry that is
1918
    // transparent, consider it as the nodata value.
1919
    else if (!bHasNoData && nTransparentIdx >= 0)
86✔
1920
    {
1921
        dfNoDataValue = nTransparentIdx;
×
1922
    }
1923

1924
    const int nChunkRightXOff = nChunkXOff + nChunkXSize;
86✔
1925
    const int nChunkBottomYOff = nChunkYOff + nChunkYSize;
86✔
1926
    const int nDstXWidth = nDstXOff2 - nDstXOff;
86✔
1927

1928
    /* ==================================================================== */
1929
    /*      Loop over destination scanlines.                                */
1930
    /* ==================================================================== */
1931
    for (int iDstLine = nDstYOff; iDstLine < nDstYOff2; ++iDstLine)
16,488✔
1932
    {
1933
        int nSrcYOff = static_cast<int>(0.5 + iDstLine * dfYRatioDstToSrc);
16,402✔
1934
        int nSrcYOff2 =
16,402✔
1935
            static_cast<int>(0.5 + (iDstLine + 1) * dfYRatioDstToSrc) + 1;
16,402✔
1936

1937
        if (nSrcYOff < nChunkYOff)
16,402✔
1938
        {
1939
            nSrcYOff = nChunkYOff;
×
1940
            nSrcYOff2++;
×
1941
        }
1942

1943
        const int iSizeY = nSrcYOff2 - nSrcYOff;
16,402✔
1944
        nSrcYOff = nSrcYOff + iSizeY / 2 - nGaussMatrixDim / 2;
16,402✔
1945
        nSrcYOff2 = nSrcYOff + nGaussMatrixDim;
16,402✔
1946

1947
        if (nSrcYOff2 > nChunkBottomYOff ||
16,402✔
1948
            (dfYRatioDstToSrc > 1 && iDstLine == nOYSize - 1))
16,359✔
1949
        {
1950
            nSrcYOff2 = std::min(nChunkBottomYOff, nSrcYOff + nGaussMatrixDim);
44✔
1951
        }
1952

1953
        int nYShiftGaussMatrix = 0;
16,402✔
1954
        if (nSrcYOff < nChunkYOff)
16,402✔
1955
        {
1956
            nYShiftGaussMatrix = -(nSrcYOff - nChunkYOff);
×
1957
            nSrcYOff = nChunkYOff;
×
1958
        }
1959

1960
        const double *const padfSrcScanline =
16,402✔
1961
            padfChunk + ((nSrcYOff - nChunkYOff) * nChunkXSize);
16,402✔
1962
        const GByte *pabySrcScanlineNodataMask = nullptr;
16,402✔
1963
        if (pabyChunkNodataMask != nullptr)
16,402✔
1964
            pabySrcScanlineNodataMask =
152✔
1965
                pabyChunkNodataMask + ((nSrcYOff - nChunkYOff) * nChunkXSize);
152✔
1966

1967
        /* --------------------------------------------------------------------
1968
         */
1969
        /*      Loop over destination pixels */
1970
        /* --------------------------------------------------------------------
1971
         */
1972
        double *const padfDstScanline =
16,402✔
1973
            padfDstBuffer + (iDstLine - nDstYOff) * nDstXWidth;
16,402✔
1974
        for (int iDstPixel = nDstXOff; iDstPixel < nDstXOff2; ++iDstPixel)
4,149,980✔
1975
        {
1976
            int nSrcXOff = static_cast<int>(0.5 + iDstPixel * dfXRatioDstToSrc);
4,133,580✔
1977
            int nSrcXOff2 =
4,133,580✔
1978
                static_cast<int>(0.5 + (iDstPixel + 1) * dfXRatioDstToSrc) + 1;
4,133,580✔
1979

1980
            if (nSrcXOff < nChunkXOff)
4,133,580✔
1981
            {
1982
                nSrcXOff = nChunkXOff;
×
1983
                nSrcXOff2++;
×
1984
            }
1985

1986
            const int iSizeX = nSrcXOff2 - nSrcXOff;
4,133,580✔
1987
            nSrcXOff = nSrcXOff + iSizeX / 2 - nGaussMatrixDim / 2;
4,133,580✔
1988
            nSrcXOff2 = nSrcXOff + nGaussMatrixDim;
4,133,580✔
1989

1990
            if (nSrcXOff2 > nChunkRightXOff ||
4,133,580✔
1991
                (dfXRatioDstToSrc > 1 && iDstPixel == nOXSize - 1))
4,127,930✔
1992
            {
1993
                nSrcXOff2 =
5,650✔
1994
                    std::min(nChunkRightXOff, nSrcXOff + nGaussMatrixDim);
5,650✔
1995
            }
1996

1997
            int nXShiftGaussMatrix = 0;
4,133,580✔
1998
            if (nSrcXOff < nChunkXOff)
4,133,580✔
1999
            {
2000
                nXShiftGaussMatrix = -(nSrcXOff - nChunkXOff);
×
2001
                nSrcXOff = nChunkXOff;
×
2002
            }
2003

2004
            if (poColorTable == nullptr)
4,133,580✔
2005
            {
2006
                double dfTotal = 0.0;
4,133,380✔
2007
                GInt64 nCount = 0;
4,133,380✔
2008
                const int *panLineWeight =
4,133,380✔
2009
                    panGaussMatrix + nYShiftGaussMatrix * nGaussMatrixDim +
4,133,380✔
2010
                    nXShiftGaussMatrix;
2011

2012
                for (int j = 0, iY = nSrcYOff; iY < nSrcYOff2;
16,527,900✔
2013
                     ++iY, ++j, panLineWeight += nGaussMatrixDim)
12,394,500✔
2014
                {
2015
                    for (int i = 0, iX = nSrcXOff; iX < nSrcXOff2; ++iX, ++i)
49,561,300✔
2016
                    {
2017
                        const double val =
37,166,800✔
2018
                            padfSrcScanline[iX - nChunkXOff +
37,166,800✔
2019
                                            static_cast<GPtrDiff_t>(iY -
37,166,800✔
2020
                                                                    nSrcYOff) *
37,166,800✔
2021
                                                nChunkXSize];
37,166,800✔
2022
                        if (pabySrcScanlineNodataMask == nullptr ||
37,166,800✔
2023
                            pabySrcScanlineNodataMask[iX - nChunkXOff +
32,872✔
2024
                                                      static_cast<GPtrDiff_t>(
32,872✔
2025
                                                          iY - nSrcYOff) *
32,872✔
2026
                                                          nChunkXSize])
32,872✔
2027
                        {
2028
                            const int nWeight = panLineWeight[i];
37,146,100✔
2029
                            dfTotal += val * nWeight;
37,146,100✔
2030
                            nCount += nWeight;
37,146,100✔
2031
                        }
2032
                    }
2033
                }
2034

2035
                if (nCount == 0)
4,133,380✔
2036
                {
2037
                    padfDstScanline[iDstPixel - nDstXOff] = dfNoDataValue;
2,217✔
2038
                }
2039
                else
2040
                {
2041
                    padfDstScanline[iDstPixel - nDstXOff] = dfTotal / nCount;
4,131,160✔
2042
                }
2043
            }
2044
            else
2045
            {
2046
                GInt64 nTotalR = 0;
200✔
2047
                GInt64 nTotalG = 0;
200✔
2048
                GInt64 nTotalB = 0;
200✔
2049
                GInt64 nTotalWeight = 0;
200✔
2050
                const int *panLineWeight =
200✔
2051
                    panGaussMatrix + nYShiftGaussMatrix * nGaussMatrixDim +
200✔
2052
                    nXShiftGaussMatrix;
2053

2054
                for (int j = 0, iY = nSrcYOff; iY < nSrcYOff2;
780✔
2055
                     ++iY, ++j, panLineWeight += nGaussMatrixDim)
580✔
2056
                {
2057
                    for (int i = 0, iX = nSrcXOff; iX < nSrcXOff2; ++iX, ++i)
2,262✔
2058
                    {
2059
                        const double val =
1,682✔
2060
                            padfSrcScanline[iX - nChunkXOff +
1,682✔
2061
                                            static_cast<GPtrDiff_t>(iY -
1,682✔
2062
                                                                    nSrcYOff) *
1,682✔
2063
                                                nChunkXSize];
1,682✔
2064
                        if (val < 0 || val >= colorEntries.size())
1,682✔
2065
                            continue;
×
2066

2067
                        size_t idx = static_cast<size_t>(val);
1,682✔
2068
                        if (colorEntries[idx].c4)
1,682✔
2069
                        {
2070
                            const int nWeight = panLineWeight[i];
1,682✔
2071
                            nTotalR +=
1,682✔
2072
                                static_cast<GInt64>(colorEntries[idx].c1) *
1,682✔
2073
                                nWeight;
1,682✔
2074
                            nTotalG +=
1,682✔
2075
                                static_cast<GInt64>(colorEntries[idx].c2) *
1,682✔
2076
                                nWeight;
1,682✔
2077
                            nTotalB +=
1,682✔
2078
                                static_cast<GInt64>(colorEntries[idx].c3) *
1,682✔
2079
                                nWeight;
1,682✔
2080
                            nTotalWeight += nWeight;
1,682✔
2081
                        }
2082
                    }
2083
                }
2084

2085
                if (nTotalWeight == 0)
200✔
2086
                {
2087
                    padfDstScanline[iDstPixel - nDstXOff] = dfNoDataValue;
×
2088
                }
2089
                else
2090
                {
2091
                    GDALColorEntry color;
2092

2093
                    color.c1 = static_cast<short>((nTotalR + nTotalWeight / 2) /
200✔
2094
                                                  nTotalWeight);
2095
                    color.c2 = static_cast<short>((nTotalG + nTotalWeight / 2) /
200✔
2096
                                                  nTotalWeight);
2097
                    color.c3 = static_cast<short>((nTotalB + nTotalWeight / 2) /
200✔
2098
                                                  nTotalWeight);
2099
                    padfDstScanline[iDstPixel - nDstXOff] =
200✔
2100
                        BestColorEntry(colorEntries, color);
200✔
2101
                }
2102
            }
2103
        }
2104
    }
2105

2106
#ifdef DEBUG_OUT_OF_BOUND_ACCESS
2107
    CPLFree(panGaussMatrixDup);
2108
#endif
2109

2110
    return CE_None;
86✔
2111
}
2112

2113
/************************************************************************/
2114
/*                      GDALResampleChunk_Mode()                        */
2115
/************************************************************************/
2116

2117
// Equality predicate used to decide whether two samples are "the same
// value". The generic version is plain operator==.
template <class T> static inline bool IsSame(T a, T b)
{
    return a == b;
}

// float: equal values match, and two NaNs are also regarded as identical.
template <> bool IsSame<float>(float a, float b)
{
    if (a == b)
        return true;
    return std::isnan(a) && std::isnan(b);
}

// double: equal values match, and two NaNs are also regarded as identical.
template <> bool IsSame<double>(double a, double b)
{
    if (a == b)
        return true;
    return std::isnan(a) && std::isnan(b);
}

// complex<float>: equal values match; otherwise both operands must have NaN
// in both their real and imaginary parts to be regarded as identical.
template <>
bool IsSame<std::complex<float>>(std::complex<float> a, std::complex<float> b)
{
    if (a == b)
        return true;
    const bool bFirstIsAllNaN = std::isnan(a.real()) && std::isnan(a.imag());
    const bool bSecondIsAllNaN = std::isnan(b.real()) && std::isnan(b.imag());
    return bFirstIsAllNaN && bSecondIsAllNaN;
}

// complex<double>: same rule as the complex<float> specialization.
template <>
bool IsSame<std::complex<double>>(std::complex<double> a,
                                  std::complex<double> b)
{
    if (a == b)
        return true;
    const bool bFirstIsAllNaN = std::isnan(a.real()) && std::isnan(a.imag());
    const bool bSecondIsAllNaN = std::isnan(b.real()) && std::isnan(b.imag());
    return bFirstIsAllNaN && bSecondIsAllNaN;
}
2146

2147
template <class T>
2148
static CPLErr GDALResampleChunk_ModeT(const GDALOverviewResampleArgs &args,
136✔
2149
                                      const T *pChunk, T *const pDstBuffer)
2150

2151
{
2152
    const double dfXRatioDstToSrc = args.dfXRatioDstToSrc;
136✔
2153
    const double dfYRatioDstToSrc = args.dfYRatioDstToSrc;
136✔
2154
    const double dfSrcXDelta = args.dfSrcXDelta;
136✔
2155
    const double dfSrcYDelta = args.dfSrcYDelta;
136✔
2156
    const GByte *pabyChunkNodataMask = args.pabyChunkNodataMask;
136✔
2157
    const int nChunkXOff = args.nChunkXOff;
136✔
2158
    const int nChunkXSize = args.nChunkXSize;
136✔
2159
    const int nChunkYOff = args.nChunkYOff;
136✔
2160
    const int nChunkYSize = args.nChunkYSize;
136✔
2161
    const int nDstXOff = args.nDstXOff;
136✔
2162
    const int nDstXOff2 = args.nDstXOff2;
136✔
2163
    const int nDstYOff = args.nDstYOff;
136✔
2164
    const int nDstYOff2 = args.nDstYOff2;
136✔
2165
    const bool bHasNoData = args.bHasNoData;
136✔
2166
    const GDALColorTable *poColorTable = args.poColorTable;
136✔
2167
    const int nDstXSize = nDstXOff2 - nDstXOff;
136✔
2168

2169
    T tNoDataValue;
8✔
2170
    if constexpr (std::is_same<T, std::complex<float>>::value ||
2171
                  std::is_same<T, std::complex<double>>::value)
2172
    {
2173
        using BaseT = typename T::value_type;
2174
        tNoDataValue =
8✔
2175
            std::complex<BaseT>(std::numeric_limits<BaseT>::quiet_NaN(),
2176
                                std::numeric_limits<BaseT>::quiet_NaN());
2177
    }
2178
    else if (!bHasNoData || !GDALIsValueInRange<T>(args.dfNoDataValue))
128✔
2179
        tNoDataValue = 0;
127✔
2180
    else
2181
        tNoDataValue = static_cast<T>(args.dfNoDataValue);
1✔
2182

2183
    size_t nMaxNumPx = 0;
136✔
2184
    T *paVals = nullptr;
136✔
2185
    int *panSums = nullptr;
136✔
2186

2187
    const int nChunkRightXOff = nChunkXOff + nChunkXSize;
136✔
2188
    const int nChunkBottomYOff = nChunkYOff + nChunkYSize;
136✔
2189
    std::vector<int> anVals(256, 0);
272✔
2190

2191
    /* ==================================================================== */
2192
    /*      Loop over destination scanlines.                                */
2193
    /* ==================================================================== */
2194
    for (int iDstLine = nDstYOff; iDstLine < nDstYOff2; ++iDstLine)
7,531✔
2195
    {
2196
        double dfSrcYOff = dfSrcYDelta + iDstLine * dfYRatioDstToSrc;
7,395✔
2197
        int nSrcYOff = static_cast<int>(dfSrcYOff + 1e-8);
7,395✔
2198
#ifdef only_pixels_with_more_than_10_pct_participation
2199
        // When oversampling, don't take into account pixels that have a tiny
2200
        // participation in the resulting pixel
2201
        if (dfYRatioDstToSrc > 1 && dfSrcYOff - nSrcYOff > 0.9 &&
2202
            nSrcYOff < nChunkBottomYOff)
2203
            nSrcYOff++;
2204
#endif
2205
        if (nSrcYOff < nChunkYOff)
7,395✔
2206
            nSrcYOff = nChunkYOff;
×
2207

2208
        double dfSrcYOff2 = dfSrcYDelta + (iDstLine + 1) * dfYRatioDstToSrc;
7,395✔
2209
        int nSrcYOff2 = static_cast<int>(ceil(dfSrcYOff2 - 1e-8));
7,395✔
2210
#ifdef only_pixels_with_more_than_10_pct_participation
2211
        // When oversampling, don't take into account pixels that have a tiny
2212
        // participation in the resulting pixel
2213
        if (dfYRatioDstToSrc > 1 && nSrcYOff2 - dfSrcYOff2 > 0.9 &&
2214
            nSrcYOff2 > nChunkYOff)
2215
            nSrcYOff2--;
2216
#endif
2217
        if (nSrcYOff2 == nSrcYOff)
7,395✔
2218
            ++nSrcYOff2;
×
2219
        if (nSrcYOff2 > nChunkBottomYOff)
7,395✔
2220
            nSrcYOff2 = nChunkBottomYOff;
×
2221

2222
        const T *const paSrcScanline =
7,395✔
2223
            pChunk +
149✔
2224
            (static_cast<GPtrDiff_t>(nSrcYOff - nChunkYOff) * nChunkXSize);
7,395✔
2225
        const GByte *pabySrcScanlineNodataMask = nullptr;
7,395✔
2226
        if (pabyChunkNodataMask != nullptr)
7,395✔
2227
            pabySrcScanlineNodataMask =
1,810✔
2228
                pabyChunkNodataMask +
2229
                static_cast<GPtrDiff_t>(nSrcYOff - nChunkYOff) * nChunkXSize;
1,810✔
2230

2231
        T *const paDstScanline = pDstBuffer + (iDstLine - nDstYOff) * nDstXSize;
7,395✔
2232
        /* --------------------------------------------------------------------
2233
         */
2234
        /*      Loop over destination pixels */
2235
        /* --------------------------------------------------------------------
2236
         */
2237
        for (int iDstPixel = nDstXOff; iDstPixel < nDstXOff2; ++iDstPixel)
4,259,580✔
2238
        {
2239
            double dfSrcXOff = dfSrcXDelta + iDstPixel * dfXRatioDstToSrc;
4,252,187✔
2240
            // Apply some epsilon to avoid numerical precision issues
2241
            int nSrcXOff = static_cast<int>(dfSrcXOff + 1e-8);
4,252,187✔
2242
#ifdef only_pixels_with_more_than_10_pct_participation
2243
            // When oversampling, don't take into account pixels that have a
2244
            // tiny participation in the resulting pixel
2245
            if (dfXRatioDstToSrc > 1 && dfSrcXOff - nSrcXOff > 0.9 &&
2246
                nSrcXOff < nChunkRightXOff)
2247
                nSrcXOff++;
2248
#endif
2249
            if (nSrcXOff < nChunkXOff)
4,252,187✔
2250
                nSrcXOff = nChunkXOff;
×
2251

2252
            double dfSrcXOff2 =
4,252,187✔
2253
                dfSrcXDelta + (iDstPixel + 1) * dfXRatioDstToSrc;
4,252,187✔
2254
            int nSrcXOff2 = static_cast<int>(ceil(dfSrcXOff2 - 1e-8));
4,252,187✔
2255
#ifdef only_pixels_with_more_than_10_pct_participation
2256
            // When oversampling, don't take into account pixels that have a
2257
            // tiny participation in the resulting pixel
2258
            if (dfXRatioDstToSrc > 1 && nSrcXOff2 - dfSrcXOff2 > 0.9 &&
2259
                nSrcXOff2 > nChunkXOff)
2260
                nSrcXOff2--;
2261
#endif
2262
            if (nSrcXOff2 == nSrcXOff)
4,252,187✔
2263
                nSrcXOff2++;
×
2264
            if (nSrcXOff2 > nChunkRightXOff)
4,252,187✔
2265
                nSrcXOff2 = nChunkRightXOff;
×
2266

2267
            bool bRegularProcessing = false;
4,252,187✔
2268
            if constexpr (!std::is_same<T, GByte>::value)
2269
                bRegularProcessing = true;
827✔
2270
            else if (poColorTable && poColorTable->GetColorEntryCount() > 256)
4,251,360✔
2271
                bRegularProcessing = true;
×
2272

2273
            if (bRegularProcessing)
4,252,187✔
2274
            {
2275
                // Not sure how much sense it makes to run a majority
2276
                // filter on floating point data, but here it is for the sake
2277
                // of compatibility. It won't look right on RGB images by the
2278
                // nature of the filter.
2279

2280
                if (nSrcYOff2 - nSrcYOff <= 0 || nSrcXOff2 - nSrcXOff <= 0 ||
827✔
2281
                    nSrcYOff2 - nSrcYOff > INT_MAX / (nSrcXOff2 - nSrcXOff) ||
2,481✔
2282
                    static_cast<size_t>(nSrcYOff2 - nSrcYOff) *
827✔
2283
                            static_cast<size_t>(nSrcXOff2 - nSrcXOff) >
827✔
2284
                        std::numeric_limits<size_t>::max() / sizeof(float))
827✔
2285
                {
2286
                    CPLError(CE_Failure, CPLE_NotSupported,
×
2287
                             "Too big downsampling factor");
2288
                    CPLFree(paVals);
×
2289
                    CPLFree(panSums);
×
2290
                    return CE_Failure;
×
2291
                }
2292
                const size_t nNumPx =
827✔
2293
                    static_cast<size_t>(nSrcYOff2 - nSrcYOff) *
827✔
2294
                    static_cast<size_t>(nSrcXOff2 - nSrcXOff);
827✔
2295
                size_t iMaxInd = 0;
827✔
2296
                size_t iMaxVal = 0;
827✔
2297
                bool biMaxValdValid = false;
827✔
2298

2299
                if (paVals == nullptr || nNumPx > nMaxNumPx)
827✔
2300
                {
2301
                    T *paValsNew = static_cast<T *>(
2302
                        VSI_REALLOC_VERBOSE(paVals, nNumPx * sizeof(T)));
71✔
2303
                    int *panSumsNew = static_cast<int *>(
2304
                        VSI_REALLOC_VERBOSE(panSums, nNumPx * sizeof(int)));
71✔
2305
                    if (paValsNew != nullptr)
71✔
2306
                        paVals = paValsNew;
71✔
2307
                    if (panSumsNew != nullptr)
71✔
2308
                        panSums = panSumsNew;
71✔
2309
                    if (paValsNew == nullptr || panSumsNew == nullptr)
71✔
2310
                    {
2311
                        CPLFree(paVals);
×
2312
                        CPLFree(panSums);
×
2313
                        return CE_Failure;
×
2314
                    }
2315
                    nMaxNumPx = nNumPx;
71✔
2316
                }
2317

2318
                for (int iY = nSrcYOff; iY < nSrcYOff2; ++iY)
2,585✔
2319
                {
2320
                    const GPtrDiff_t iTotYOff =
1,758✔
2321
                        static_cast<GPtrDiff_t>(iY - nSrcYOff) * nChunkXSize -
1,758✔
2322
                        nChunkXOff;
1,758✔
2323
                    for (int iX = nSrcXOff; iX < nSrcXOff2; ++iX)
5,690✔
2324
                    {
2325
                        if (pabySrcScanlineNodataMask == nullptr ||
3,932✔
2326
                            pabySrcScanlineNodataMask[iX + iTotYOff])
16✔
2327
                        {
2328
                            const T val = paSrcScanline[iX + iTotYOff];
3,917✔
2329
                            size_t i = 0;  // Used after for.
3,917✔
2330

2331
                            // Check array for existing entry.
2332
                            for (; i < iMaxInd; ++i)
14,387✔
2333
                                if (IsSame(paVals[i], val) &&
17,626✔
2334
                                    ++panSums[i] > panSums[iMaxVal])
6,910✔
2335
                                {
2336
                                    iMaxVal = i;
246✔
2337
                                    biMaxValdValid = true;
246✔
2338
                                    break;
246✔
2339
                                }
2340

2341
                            // Add to arr if entry not already there.
2342
                            if (i == iMaxInd)
3,917✔
2343
                            {
2344
                                paVals[iMaxInd] = val;
3,671✔
2345
                                panSums[iMaxInd] = 1;
3,671✔
2346

2347
                                if (!biMaxValdValid)
3,671✔
2348
                                {
2349
                                    iMaxVal = iMaxInd;
824✔
2350
                                    biMaxValdValid = true;
824✔
2351
                                }
2352

2353
                                ++iMaxInd;
3,671✔
2354
                            }
2355
                        }
2356
                    }
2357
                }
2358

2359
                if (!biMaxValdValid)
827✔
2360
                    paDstScanline[iDstPixel - nDstXOff] = tNoDataValue;
3✔
2361
                else
2362
                    paDstScanline[iDstPixel - nDstXOff] = paVals[iMaxVal];
824✔
2363
            }
2364
            else if constexpr (std::is_same<T, GByte>::value)
2365
            // ( eSrcDataType == GDT_Byte && nEntryCount < 256 )
2366
            {
2367
                // So we go here for a paletted or non-paletted byte band.
2368
                // The input values are then between 0 and 255.
2369
                int nMaxVal = 0;
4,251,360✔
2370
                int iMaxInd = -1;
4,251,360✔
2371

2372
                // The cost of this zeroing might be high. Perhaps we should
2373
                // just use the above generic case, and go to this one if the
2374
                // number of source pixels is large enough
2375
                std::fill(anVals.begin(), anVals.end(), 0);
4,251,360✔
2376

2377
                for (int iY = nSrcYOff; iY < nSrcYOff2; ++iY)
12,777,700✔
2378
                {
2379
                    const GPtrDiff_t iTotYOff =
8,526,370✔
2380
                        static_cast<GPtrDiff_t>(iY - nSrcYOff) * nChunkXSize -
8,526,370✔
2381
                        nChunkXOff;
8,526,370✔
2382
                    for (int iX = nSrcXOff; iX < nSrcXOff2; ++iX)
25,649,400✔
2383
                    {
2384
                        const T val = paSrcScanline[iX + iTotYOff];
17,123,000✔
2385
                        if (!bHasNoData || val != tNoDataValue)
17,123,000✔
2386
                        {
2387
                            int nVal = static_cast<int>(val);
17,123,000✔
2388
                            if (++anVals[nVal] > nMaxVal)
17,123,000✔
2389
                            {
2390
                                // Sum the density.
2391
                                // Is it the most common value so far?
2392
                                iMaxInd = nVal;
17,006,300✔
2393
                                nMaxVal = anVals[nVal];
17,006,300✔
2394
                            }
2395
                        }
2396
                    }
2397
                }
2398

2399
                if (iMaxInd == -1)
4,251,360✔
2400
                    paDstScanline[iDstPixel - nDstXOff] = tNoDataValue;
×
2401
                else
2402
                    paDstScanline[iDstPixel - nDstXOff] =
4,251,360✔
2403
                        static_cast<T>(iMaxInd);
2404
            }
2405
        }
2406
    }
2407

2408
    CPLFree(paVals);
136✔
2409
    CPLFree(panSums);
136✔
2410

2411
    return CE_None;
136✔
2412
}
2413

2414
/** Entry point of "mode" resampling: allocates the destination buffer and
 * dispatches to GDALResampleChunk_ModeT() according to the working data type.
 *
 * @param args                Resampling parameters. args.eSrcDataType is
 *                            asserted to be equal to args.eWrkDataType.
 * @param pChunk              Source pixels, of type args.eWrkDataType.
 * @param ppDstBuffer         Receives the buffer allocated with
 *                            VSI_MALLOC3_VERBOSE() for the destination
 *                            window. It is left allocated on return.
 * @param peDstBufferDataType Receives args.eWrkDataType.
 * @return CE_None on success, CE_Failure on allocation failure, resampling
 *         failure or unhandled data type.
 */
static CPLErr GDALResampleChunk_Mode(const GDALOverviewResampleArgs &args,
                                     const void *pChunk, void **ppDstBuffer,
                                     GDALDataType *peDstBufferDataType)
{
    const GDALDataType eWrkDT = args.eWrkDataType;
    const int nWrkDTSize = GDALGetDataTypeSizeBytes(eWrkDT);

    *ppDstBuffer =
        VSI_MALLOC3_VERBOSE(args.nDstXOff2 - args.nDstXOff,
                            args.nDstYOff2 - args.nDstYOff, nWrkDTSize);
    if (*ppDstBuffer == nullptr)
    {
        return CE_Failure;
    }

    CPLAssert(args.eSrcDataType == args.eWrkDataType);

    *peDstBufferDataType = eWrkDT;

    // Runs the typed implementation, viewing both buffers as arrays of the
    // type of the (otherwise unused) tag argument.
    const auto RunAs = [&args, pChunk, ppDstBuffer](auto tTypeTag)
    {
        using PixelT = decltype(tTypeTag);
        return GDALResampleChunk_ModeT(args,
                                       static_cast<const PixelT *>(pChunk),
                                       static_cast<PixelT *>(*ppDstBuffer));
    };

    // For mode resampling, as no computation is done, only the
    // size of the data type matters... except for Byte where we have
    // special processing. And for floating point values
    switch (eWrkDT)
    {
        case GDT_Byte:
            return RunAs(GByte{});

        case GDT_Int8:
            return RunAs(int8_t{});

        // 2-byte types are all processed as uint16_t.
        case GDT_Int16:
        case GDT_UInt16:
        case GDT_Float16:
            CPLAssert(nWrkDTSize == 2);
            return RunAs(uint16_t{});

        // 4-byte non-Float32 types are all processed as uint32_t.
        case GDT_CInt16:
        case GDT_CFloat16:
        case GDT_Int32:
        case GDT_UInt32:
            CPLAssert(nWrkDTSize == 4);
            return RunAs(uint32_t{});

        case GDT_Float32:
            CPLAssert(nWrkDTSize == 4);
            return RunAs(float{});

        // 8-byte non-Float64 types are all processed as uint64_t.
        case GDT_CInt32:
        case GDT_Int64:
        case GDT_UInt64:
            CPLAssert(nWrkDTSize == 8);
            return RunAs(uint64_t{});

        case GDT_Float64:
            CPLAssert(nWrkDTSize == 8);
            return RunAs(double{});

        case GDT_CFloat32:
            return RunAs(std::complex<float>{});

        case GDT_CFloat64:
            return RunAs(std::complex<double>{});

        case GDT_Unknown:
        case GDT_TypeCount:
            break;
    }

    CPLAssert(false);
    return CE_Failure;
}
2517

2518
/************************************************************************/
2519
/*                  GDALResampleConvolutionHorizontal()                 */
2520
/************************************************************************/
2521

2522
/** Weighted sum of nSrcPixelCount consecutive samples of one row.
 *
 * Returns sum(pChunk[k] * padfWeights[k]) for k in [0, nSrcPixelCount[,
 * accumulated in double precision with two partial sums.
 */
template <class T>
static inline double
GDALResampleConvolutionHorizontal(const T *pChunk, const double *padfWeights,
                                  int nSrcPixelCount)
{
    double dfAccA = 0.0;  // Samples 0 and 1 of each group of 4, plus the tail
    double dfAccB = 0.0;  // Samples 2 and 3 of each group of 4
    int iSample = 0;
    // Intel Compiler 2024.0.2.29 (maybe other versions?) crashes on this
    // manually (untypical) unrolled loop in -O2 and -O3:
    // https://github.com/OSGeo/gdal/issues/9508
#if !defined(__INTEL_CLANG_COMPILER)
    while (iSample < nSrcPixelCount - 3)
    {
        const T *const pSrc = pChunk + iSample;
        const double *const pW = padfWeights + iSample;
        dfAccA += pSrc[0] * pW[0];
        dfAccA += pSrc[1] * pW[1];
        dfAccB += pSrc[2] * pW[2];
        dfAccB += pSrc[3] * pW[3];
        iSample += 4;
    }
#endif
    // Remaining samples (or all of them when the unrolled loop is disabled).
    while (iSample < nSrcPixelCount)
    {
        dfAccA += pChunk[iSample] * padfWeights[iSample];
        ++iSample;
    }
    return dfAccA + dfAccB;
}
2548

2549
template <class T>
2550
static inline void GDALResampleConvolutionHorizontalWithMask(
44,576✔
2551
    const T *pChunk, const GByte *pabyMask, const double *padfWeights,
2552
    int nSrcPixelCount, double &dfVal, double &dfWeightSum)
2553
{
2554
    dfVal = 0;
44,576✔
2555
    dfWeightSum = 0;
44,576✔
2556
    int i = 0;
44,576✔
2557
    for (; i < nSrcPixelCount - 3; i += 4)
98,300✔
2558
    {
2559
        const double dfWeight0 = padfWeights[i] * pabyMask[i];
53,724✔
2560
        const double dfWeight1 = padfWeights[i + 1] * pabyMask[i + 1];
53,724✔
2561
        const double dfWeight2 = padfWeights[i + 2] * pabyMask[i + 2];
53,724✔
2562
        const double dfWeight3 = padfWeights[i + 3] * pabyMask[i + 3];
53,724✔
2563
        dfVal += pChunk[i] * dfWeight0;
53,724✔
2564
        dfVal += pChunk[i + 1] * dfWeight1;
53,724✔
2565
        dfVal += pChunk[i + 2] * dfWeight2;
53,724✔
2566
        dfVal += pChunk[i + 3] * dfWeight3;
53,724✔
2567
        dfWeightSum += dfWeight0 + dfWeight1 + dfWeight2 + dfWeight3;
53,724✔
2568
    }
2569
    for (; i < nSrcPixelCount; ++i)
61,162✔
2570
    {
2571
        const double dfWeight = padfWeights[i] * pabyMask[i];
16,586✔
2572
        dfVal += pChunk[i] * dfWeight;
16,586✔
2573
        dfWeightSum += dfWeight;
16,586✔
2574
    }
2575
}
44,576✔
2576

2577
/** Weighted sum of nSrcPixelCount consecutive samples, computed for three
 * rows at once with the same weights.
 *
 * dfRes1, dfRes2 and dfRes3 receive sum(pChunkRowN[k] * padfWeights[k]) for
 * rows 1, 2 and 3 respectively. Each row uses two partial sums.
 */
template <class T>
static inline void GDALResampleConvolutionHorizontal_3rows(
    const T *pChunkRow1, const T *pChunkRow2, const T *pChunkRow3,
    const double *padfWeights, int nSrcPixelCount, double &dfRes1,
    double &dfRes2, double &dfRes3)
{
    // "A" sums take samples 0 and 1 of each group of 4 plus the tail,
    // "B" sums take samples 2 and 3.
    double dfRow1A = 0.0;
    double dfRow1B = 0.0;
    double dfRow2A = 0.0;
    double dfRow2B = 0.0;
    double dfRow3A = 0.0;
    double dfRow3B = 0.0;

    int iSample = 0;
    while (iSample < nSrcPixelCount - 3)
    {
        const double *const pW = padfWeights + iSample;
        const T *const pR1 = pChunkRow1 + iSample;
        const T *const pR2 = pChunkRow2 + iSample;
        const T *const pR3 = pChunkRow3 + iSample;

        dfRow1A += pR1[0] * pW[0];
        dfRow1A += pR1[1] * pW[1];
        dfRow1B += pR1[2] * pW[2];
        dfRow1B += pR1[3] * pW[3];

        dfRow2A += pR2[0] * pW[0];
        dfRow2A += pR2[1] * pW[1];
        dfRow2B += pR2[2] * pW[2];
        dfRow2B += pR2[3] * pW[3];

        dfRow3A += pR3[0] * pW[0];
        dfRow3A += pR3[1] * pW[1];
        dfRow3B += pR3[2] * pW[2];
        dfRow3B += pR3[3] * pW[3];

        iSample += 4;
    }
    // Remaining samples.
    while (iSample < nSrcPixelCount)
    {
        const double dfW = padfWeights[iSample];
        dfRow1A += pChunkRow1[iSample] * dfW;
        dfRow2A += pChunkRow2[iSample] * dfW;
        dfRow3A += pChunkRow3[iSample] * dfW;
        ++iSample;
    }

    dfRes1 = dfRow1A + dfRow1B;
    dfRes2 = dfRow2A + dfRow2B;
    dfRes3 = dfRow3A + dfRow3B;
}
1,340,094✔
2615

2616
template <class T>
2617
static inline void GDALResampleConvolutionHorizontalPixelCountLess8_3rows(
18,828✔
2618
    const T *pChunkRow1, const T *pChunkRow2, const T *pChunkRow3,
2619
    const double *padfWeights, int nSrcPixelCount, double &dfRes1,
2620
    double &dfRes2, double &dfRes3)
2621
{
2622
    GDALResampleConvolutionHorizontal_3rows(pChunkRow1, pChunkRow2, pChunkRow3,
18,828✔
2623
                                            padfWeights, nSrcPixelCount, dfRes1,
2624
                                            dfRes2, dfRes3);
2625
}
18,828✔
2626

2627
template <class T>
2628
static inline void GDALResampleConvolutionHorizontalPixelCount4_3rows(
1,256,466✔
2629
    const T *pChunkRow1, const T *pChunkRow2, const T *pChunkRow3,
2630
    const double *padfWeights, double &dfRes1, double &dfRes2, double &dfRes3)
2631
{
2632
    GDALResampleConvolutionHorizontal_3rows(pChunkRow1, pChunkRow2, pChunkRow3,
1,256,466✔
2633
                                            padfWeights, 4, dfRes1, dfRes2,
2634
                                            dfRes3);
2635
}
1,256,466✔
2636

2637
/************************************************************************/
2638
/*                  GDALResampleConvolutionVertical()                   */
2639
/************************************************************************/
2640

2641
/** Weighted sum of nSrcLineCount samples of one column.
 *
 * Returns sum(pChunk[k * nStride] * padfWeights[k]) for k in
 * [0, nSrcLineCount[, accumulated in double precision with two partial sums.
 */
template <class T>
static inline double
GDALResampleConvolutionVertical(const T *pChunk, size_t nStride,
                                const double *padfWeights, int nSrcLineCount)
{
    double dfAccA = 0.0;  // Lines 0 and 1 of each group of 4, plus the tail
    double dfAccB = 0.0;  // Lines 2 and 3 of each group of 4
    int iLine = 0;
    size_t nOffset = 0;  // Always equal to iLine * nStride
    while (iLine < nSrcLineCount - 3)
    {
        const T *const pCol = pChunk + nOffset;
        const double *const pW = padfWeights + iLine;
        dfAccA += pCol[0 * nStride] * pW[0];
        dfAccA += pCol[1 * nStride] * pW[1];
        dfAccB += pCol[2 * nStride] * pW[2];
        dfAccB += pCol[3 * nStride] * pW[3];
        iLine += 4;
        nOffset += 4 * nStride;
    }
    // Remaining lines.
    while (iLine < nSrcLineCount)
    {
        dfAccA += pChunk[nOffset] * padfWeights[iLine];
        ++iLine;
        nOffset += nStride;
    }
    return dfAccA + dfAccB;
}
2663

2664
/** Weighted sum of nSrcLineCount samples, computed for two adjacent columns
 * at once with the same weights.
 *
 * dfRes1 receives sum(pChunk[k * nStride] * padfWeights[k]) and dfRes2
 * receives sum(pChunk[k * nStride + 1] * padfWeights[k]).
 */
template <class T>
static inline void GDALResampleConvolutionVertical_2cols(
    const T *pChunk, size_t nStride, const double *padfWeights,
    int nSrcLineCount, double &dfRes1, double &dfRes2)
{
    // "A" sums take lines 0 and 1 of each group of 4 plus the tail,
    // "B" sums take lines 2 and 3.
    double dfCol0A = 0.0;
    double dfCol0B = 0.0;
    double dfCol1A = 0.0;
    double dfCol1B = 0.0;

    int iLine = 0;
    size_t nOffset = 0;  // Always equal to iLine * nStride
    while (iLine < nSrcLineCount - 3)
    {
        const T *const pLine0 = pChunk + nOffset;
        const T *const pLine1 = pLine0 + nStride;
        const T *const pLine2 = pLine1 + nStride;
        const T *const pLine3 = pLine2 + nStride;
        const double *const pW = padfWeights + iLine;

        dfCol0A += pLine0[0] * pW[0];
        dfCol1A += pLine0[1] * pW[0];
        dfCol0A += pLine1[0] * pW[1];
        dfCol1A += pLine1[1] * pW[1];
        dfCol0B += pLine2[0] * pW[2];
        dfCol1B += pLine2[1] * pW[2];
        dfCol0B += pLine3[0] * pW[3];
        dfCol1B += pLine3[1] * pW[3];

        iLine += 4;
        nOffset += 4 * nStride;
    }
    // Remaining lines.
    while (iLine < nSrcLineCount)
    {
        const double dfW = padfWeights[iLine];
        dfCol0A += pChunk[nOffset + 0] * dfW;
        dfCol1A += pChunk[nOffset + 1] * dfW;
        ++iLine;
        nOffset += nStride;
    }

    dfRes1 = dfCol0A + dfCol0B;
    dfRes2 = dfCol1A + dfCol1B;
}
2,880,000✔
2694

2695
#ifdef USE_SSE2
2696

2697
#ifdef __AVX__
2698
/************************************************************************/
2699
/*             GDALResampleConvolutionVertical_16cols<T>                */
2700
/************************************************************************/
2701

2702
template <class T>
2703
static inline void
2704
GDALResampleConvolutionVertical_16cols(const T *pChunk, size_t nStride,
2705
                                       const double *padfWeights,
2706
                                       int nSrcLineCount, float *afDest)
2707
{
2708
    int i = 0;
2709
    size_t j = 0;
2710
    XMMReg4Double v_acc0 = XMMReg4Double::Zero();
2711
    XMMReg4Double v_acc1 = XMMReg4Double::Zero();
2712
    XMMReg4Double v_acc2 = XMMReg4Double::Zero();
2713
    XMMReg4Double v_acc3 = XMMReg4Double::Zero();
2714
    for (; i < nSrcLineCount - 3; i += 4, j += 4 * nStride)
2715
    {
2716
        XMMReg4Double w0 =
2717
            XMMReg4Double::Load1ValHighAndLow(padfWeights + i + 0);
2718
        XMMReg4Double w1 =
2719
            XMMReg4Double::Load1ValHighAndLow(padfWeights + i + 1);
2720
        XMMReg4Double w2 =
2721
            XMMReg4Double::Load1ValHighAndLow(padfWeights + i + 2);
2722
        XMMReg4Double w3 =
2723
            XMMReg4Double::Load1ValHighAndLow(padfWeights + i + 3);
2724
        v_acc0 += XMMReg4Double::Load4Val(pChunk + j + 0 + 0 * nStride) * w0;
2725
        v_acc1 += XMMReg4Double::Load4Val(pChunk + j + 4 + 0 * nStride) * w0;
2726
        v_acc2 += XMMReg4Double::Load4Val(pChunk + j + 8 + 0 * nStride) * w0;
2727
        v_acc3 += XMMReg4Double::Load4Val(pChunk + j + 12 + 0 * nStride) * w0;
2728
        v_acc0 += XMMReg4Double::Load4Val(pChunk + j + 0 + 1 * nStride) * w1;
2729
        v_acc1 += XMMReg4Double::Load4Val(pChunk + j + 4 + 1 * nStride) * w1;
2730
        v_acc2 += XMMReg4Double::Load4Val(pChunk + j + 8 + 1 * nStride) * w1;
2731
        v_acc3 += XMMReg4Double::Load4Val(pChunk + j + 12 + 1 * nStride) * w1;
2732
        v_acc0 += XMMReg4Double::Load4Val(pChunk + j + 0 + 2 * nStride) * w2;
2733
        v_acc1 += XMMReg4Double::Load4Val(pChunk + j + 4 + 2 * nStride) * w2;
2734
        v_acc2 += XMMReg4Double::Load4Val(pChunk + j + 8 + 2 * nStride) * w2;
2735
        v_acc3 += XMMReg4Double::Load4Val(pChunk + j + 12 + 2 * nStride) * w2;
2736
        v_acc0 += XMMReg4Double::Load4Val(pChunk + j + 0 + 3 * nStride) * w3;
2737
        v_acc1 += XMMReg4Double::Load4Val(pChunk + j + 4 + 3 * nStride) * w3;
2738
        v_acc2 += XMMReg4Double::Load4Val(pChunk + j + 8 + 3 * nStride) * w3;
2739
        v_acc3 += XMMReg4Double::Load4Val(pChunk + j + 12 + 3 * nStride) * w3;
2740
    }
2741
    for (; i < nSrcLineCount; ++i, j += nStride)
2742
    {
2743
        XMMReg4Double w = XMMReg4Double::Load1ValHighAndLow(padfWeights + i);
2744
        v_acc0 += XMMReg4Double::Load4Val(pChunk + j + 0) * w;
2745
        v_acc1 += XMMReg4Double::Load4Val(pChunk + j + 4) * w;
2746
        v_acc2 += XMMReg4Double::Load4Val(pChunk + j + 8) * w;
2747
        v_acc3 += XMMReg4Double::Load4Val(pChunk + j + 12) * w;
2748
    }
2749
    v_acc0.Store4Val(afDest);
2750
    v_acc1.Store4Val(afDest + 4);
2751
    v_acc2.Store4Val(afDest + 8);
2752
    v_acc3.Store4Val(afDest + 12);
2753
}
2754

2755
template <class T>
2756
static inline void GDALResampleConvolutionVertical_16cols(const T *, int,
2757
                                                          const double *, int,
2758
                                                          double *)
2759
{
2760
    // Cannot be reached
2761
    CPLAssert(false);
2762
}
2763

2764
#else
2765

2766
/************************************************************************/
2767
/*              GDALResampleConvolutionVertical_8cols<T>                */
2768
/************************************************************************/
2769

2770
template <class T>
2771
static inline void
2772
GDALResampleConvolutionVertical_8cols(const T *pChunk, size_t nStride,
21,384,200✔
2773
                                      const double *padfWeights,
2774
                                      int nSrcLineCount, float *afDest)
2775
{
2776
    int i = 0;
21,384,200✔
2777
    size_t j = 0;
21,384,200✔
2778
    XMMReg4Double v_acc0 = XMMReg4Double::Zero();
21,384,200✔
2779
    XMMReg4Double v_acc1 = XMMReg4Double::Zero();
21,317,700✔
2780
    for (; i < nSrcLineCount - 3; i += 4, j += 4 * nStride)
40,779,200✔
2781
    {
2782
        XMMReg4Double w0 =
19,435,800✔
2783
            XMMReg4Double::Load1ValHighAndLow(padfWeights + i + 0);
19,435,800✔
2784
        XMMReg4Double w1 =
19,419,600✔
2785
            XMMReg4Double::Load1ValHighAndLow(padfWeights + i + 1);
19,419,600✔
2786
        XMMReg4Double w2 =
19,427,600✔
2787
            XMMReg4Double::Load1ValHighAndLow(padfWeights + i + 2);
19,427,600✔
2788
        XMMReg4Double w3 =
19,433,800✔
2789
            XMMReg4Double::Load1ValHighAndLow(padfWeights + i + 3);
19,433,800✔
2790
        v_acc0 += XMMReg4Double::Load4Val(pChunk + j + 0 + 0 * nStride) * w0;
19,437,400✔
2791
        v_acc1 += XMMReg4Double::Load4Val(pChunk + j + 4 + 0 * nStride) * w0;
19,381,800✔
2792
        v_acc0 += XMMReg4Double::Load4Val(pChunk + j + 0 + 1 * nStride) * w1;
19,395,300✔
2793
        v_acc1 += XMMReg4Double::Load4Val(pChunk + j + 4 + 1 * nStride) * w1;
19,396,500✔
2794
        v_acc0 += XMMReg4Double::Load4Val(pChunk + j + 0 + 2 * nStride) * w2;
19,387,000✔
2795
        v_acc1 += XMMReg4Double::Load4Val(pChunk + j + 4 + 2 * nStride) * w2;
19,386,900✔
2796
        v_acc0 += XMMReg4Double::Load4Val(pChunk + j + 0 + 3 * nStride) * w3;
19,388,300✔
2797
        v_acc1 += XMMReg4Double::Load4Val(pChunk + j + 4 + 3 * nStride) * w3;
19,396,700✔
2798
    }
2799
    for (; i < nSrcLineCount; ++i, j += nStride)
32,869,500✔
2800
    {
2801
        XMMReg4Double w = XMMReg4Double::Load1ValHighAndLow(padfWeights + i);
11,526,100✔
2802
        v_acc0 += XMMReg4Double::Load4Val(pChunk + j + 0) * w;
11,526,100✔
2803
        v_acc1 += XMMReg4Double::Load4Val(pChunk + j + 4) * w;
11,526,100✔
2804
    }
2805
    v_acc0.Store4Val(afDest);
21,343,400✔
2806
    v_acc1.Store4Val(afDest + 4);
21,342,400✔
2807
}
21,370,600✔
2808

2809
template <class T>
2810
static inline void GDALResampleConvolutionVertical_8cols(const T *, int,
2811
                                                         const double *, int,
2812
                                                         double *)
2813
{
2814
    // Cannot be reached
2815
    CPLAssert(false);
2816
}
2817

2818
#endif  // __AVX__
2819

2820
/************************************************************************/
2821
/*              GDALResampleConvolutionHorizontalSSE2<T>                */
2822
/************************************************************************/
2823

2824
template <class T>
2825
static inline double GDALResampleConvolutionHorizontalSSE2(
2,987,575✔
2826
    const T *pChunk, const double *padfWeightsAligned, int nSrcPixelCount)
2827
{
2828
    XMMReg4Double v_acc1 = XMMReg4Double::Zero();
2,987,575✔
2829
    XMMReg4Double v_acc2 = XMMReg4Double::Zero();
2,987,060✔
2830
    int i = 0;  // Used after for.
2,986,995✔
2831
    for (; i < nSrcPixelCount - 7; i += 8)
3,213,713✔
2832
    {
2833
        // Retrieve the pixel & accumulate
2834
        const XMMReg4Double v_pixels1 = XMMReg4Double::Load4Val(pChunk + i);
226,606✔
2835
        const XMMReg4Double v_pixels2 = XMMReg4Double::Load4Val(pChunk + i + 4);
226,602✔
2836
        const XMMReg4Double v_weight1 =
226,602✔
2837
            XMMReg4Double::Load4ValAligned(padfWeightsAligned + i);
226,602✔
2838
        const XMMReg4Double v_weight2 =
226,607✔
2839
            XMMReg4Double::Load4ValAligned(padfWeightsAligned + i + 4);
226,607✔
2840

2841
        v_acc1 += v_pixels1 * v_weight1;
226,608✔
2842
        v_acc2 += v_pixels2 * v_weight2;
226,605✔
2843
    }
2844

2845
    v_acc1 += v_acc2;
2,987,111✔
2846

2847
    double dfVal = v_acc1.GetHorizSum();
2,987,114✔
2848
    for (; i < nSrcPixelCount; ++i)
10,156,520✔
2849
    {
2850
        dfVal += pChunk[i] * padfWeightsAligned[i];
7,169,450✔
2851
    }
2852
    return dfVal;
2,987,053✔
2853
}
2854

2855
/************************************************************************/
2856
/*              GDALResampleConvolutionHorizontal<GByte>                */
2857
/************************************************************************/
2858

2859
template <>
2860
inline double GDALResampleConvolutionHorizontal<GByte>(
2,438,960✔
2861
    const GByte *pChunk, const double *padfWeightsAligned, int nSrcPixelCount)
2862
{
2863
    return GDALResampleConvolutionHorizontalSSE2(pChunk, padfWeightsAligned,
2,438,960✔
2864
                                                 nSrcPixelCount);
2,438,980✔
2865
}
2866

2867
template <>
2868
inline double GDALResampleConvolutionHorizontal<GUInt16>(
548,646✔
2869
    const GUInt16 *pChunk, const double *padfWeightsAligned, int nSrcPixelCount)
2870
{
2871
    return GDALResampleConvolutionHorizontalSSE2(pChunk, padfWeightsAligned,
548,646✔
2872
                                                 nSrcPixelCount);
548,773✔
2873
}
2874

2875
/************************************************************************/
2876
/*              GDALResampleConvolutionHorizontalWithMaskSSE2<T>        */
2877
/************************************************************************/
2878

2879
template <class T>
2880
static inline void GDALResampleConvolutionHorizontalWithMaskSSE2(
7,054,303✔
2881
    const T *pChunk, const GByte *pabyMask, const double *padfWeightsAligned,
2882
    int nSrcPixelCount, double &dfVal, double &dfWeightSum)
2883
{
2884
    int i = 0;  // Used after for.
7,054,303✔
2885
    XMMReg4Double v_acc = XMMReg4Double::Zero();
7,054,303✔
2886
    XMMReg4Double v_acc_weight = XMMReg4Double::Zero();
7,053,433✔
2887
    for (; i < nSrcPixelCount - 3; i += 4)
19,734,421✔
2888
    {
2889
        const XMMReg4Double v_pixels = XMMReg4Double::Load4Val(pChunk + i);
12,680,058✔
2890
        const XMMReg4Double v_mask = XMMReg4Double::Load4Val(pabyMask + i);
12,676,758✔
2891
        XMMReg4Double v_weight =
12,678,558✔
2892
            XMMReg4Double::Load4ValAligned(padfWeightsAligned + i);
12,678,558✔
2893
        v_weight *= v_mask;
12,681,158✔
2894
        v_acc += v_pixels * v_weight;
12,681,658✔
2895
        v_acc_weight += v_weight;
12,672,658✔
2896
    }
2897

2898
    dfVal = v_acc.GetHorizSum();
7,054,363✔
2899
    dfWeightSum = v_acc_weight.GetHorizSum();
7,056,573✔
2900
    for (; i < nSrcPixelCount; ++i)
7,288,643✔
2901
    {
2902
        const double dfWeight = padfWeightsAligned[i] * pabyMask[i];
231,090✔
2903
        dfVal += pChunk[i] * dfWeight;
231,090✔
2904
        dfWeightSum += dfWeight;
231,090✔
2905
    }
2906
}
7,057,553✔
2907

2908
/************************************************************************/
2909
/*              GDALResampleConvolutionHorizontalWithMask<GByte>        */
2910
/************************************************************************/
2911

2912
template <>
2913
inline void GDALResampleConvolutionHorizontalWithMask<GByte>(
7,057,990✔
2914
    const GByte *pChunk, const GByte *pabyMask,
2915
    const double *padfWeightsAligned, int nSrcPixelCount, double &dfVal,
2916
    double &dfWeightSum)
2917
{
2918
    GDALResampleConvolutionHorizontalWithMaskSSE2(
7,057,990✔
2919
        pChunk, pabyMask, padfWeightsAligned, nSrcPixelCount, dfVal,
2920
        dfWeightSum);
2921
}
7,051,380✔
2922

2923
template <>
2924
inline void GDALResampleConvolutionHorizontalWithMask<GUInt16>(
63✔
2925
    const GUInt16 *pChunk, const GByte *pabyMask,
2926
    const double *padfWeightsAligned, int nSrcPixelCount, double &dfVal,
2927
    double &dfWeightSum)
2928
{
2929
    GDALResampleConvolutionHorizontalWithMaskSSE2(
63✔
2930
        pChunk, pabyMask, padfWeightsAligned, nSrcPixelCount, dfVal,
2931
        dfWeightSum);
2932
}
63✔
2933

2934
/************************************************************************/
2935
/*              GDALResampleConvolutionHorizontal_3rows_SSE2<T>         */
2936
/************************************************************************/
2937

2938
template <class T>
2939
static inline void GDALResampleConvolutionHorizontal_3rows_SSE2(
17,006,230✔
2940
    const T *pChunkRow1, const T *pChunkRow2, const T *pChunkRow3,
2941
    const double *padfWeightsAligned, int nSrcPixelCount, double &dfRes1,
2942
    double &dfRes2, double &dfRes3)
2943
{
2944
    XMMReg4Double v_acc1 = XMMReg4Double::Zero(),
17,006,230✔
2945
                  v_acc2 = XMMReg4Double::Zero(),
16,969,130✔
2946
                  v_acc3 = XMMReg4Double::Zero();
16,978,130✔
2947
    int i = 0;
16,984,830✔
2948
    for (; i < nSrcPixelCount - 7; i += 8)
33,878,466✔
2949
    {
2950
        // Retrieve the pixel & accumulate.
2951
        XMMReg4Double v_pixels1 = XMMReg4Double::Load4Val(pChunkRow1 + i);
16,890,536✔
2952
        XMMReg4Double v_pixels2 = XMMReg4Double::Load4Val(pChunkRow1 + i + 4);
16,898,136✔
2953
        const XMMReg4Double v_weight1 =
16,903,836✔
2954
            XMMReg4Double::Load4ValAligned(padfWeightsAligned + i);
16,903,836✔
2955
        const XMMReg4Double v_weight2 =
16,888,936✔
2956
            XMMReg4Double::Load4ValAligned(padfWeightsAligned + i + 4);
16,888,936✔
2957

2958
        v_acc1 += v_pixels1 * v_weight1;
16,885,836✔
2959
        v_acc1 += v_pixels2 * v_weight2;
16,869,136✔
2960

2961
        v_pixels1 = XMMReg4Double::Load4Val(pChunkRow2 + i);
16,868,436✔
2962
        v_pixels2 = XMMReg4Double::Load4Val(pChunkRow2 + i + 4);
16,900,336✔
2963
        v_acc2 += v_pixels1 * v_weight1;
16,900,736✔
2964
        v_acc2 += v_pixels2 * v_weight2;
16,886,536✔
2965

2966
        v_pixels1 = XMMReg4Double::Load4Val(pChunkRow3 + i);
16,880,536✔
2967
        v_pixels2 = XMMReg4Double::Load4Val(pChunkRow3 + i + 4);
16,904,136✔
2968
        v_acc3 += v_pixels1 * v_weight1;
16,900,736✔
2969
        v_acc3 += v_pixels2 * v_weight2;
16,887,636✔
2970
    }
2971

2972
    dfRes1 = v_acc1.GetHorizSum();
16,987,930✔
2973
    dfRes2 = v_acc2.GetHorizSum();
16,980,130✔
2974
    dfRes3 = v_acc3.GetHorizSum();
16,988,030✔
2975
    for (; i < nSrcPixelCount; ++i)
28,703,926✔
2976
    {
2977
        dfRes1 += pChunkRow1[i] * padfWeightsAligned[i];
11,718,796✔
2978
        dfRes2 += pChunkRow2[i] * padfWeightsAligned[i];
11,718,796✔
2979
        dfRes3 += pChunkRow3[i] * padfWeightsAligned[i];
11,718,796✔
2980
    }
2981
}
16,985,130✔
2982

2983
/************************************************************************/
2984
/*              GDALResampleConvolutionHorizontal_3rows<GByte>          */
2985
/************************************************************************/
2986

2987
template <>
2988
inline void GDALResampleConvolutionHorizontal_3rows<GByte>(
16,991,400✔
2989
    const GByte *pChunkRow1, const GByte *pChunkRow2, const GByte *pChunkRow3,
2990
    const double *padfWeightsAligned, int nSrcPixelCount, double &dfRes1,
2991
    double &dfRes2, double &dfRes3)
2992
{
2993
    GDALResampleConvolutionHorizontal_3rows_SSE2(
16,991,400✔
2994
        pChunkRow1, pChunkRow2, pChunkRow3, padfWeightsAligned, nSrcPixelCount,
2995
        dfRes1, dfRes2, dfRes3);
2996
}
16,984,900✔
2997

2998
template <>
2999
inline void GDALResampleConvolutionHorizontal_3rows<GUInt16>(
30✔
3000
    const GUInt16 *pChunkRow1, const GUInt16 *pChunkRow2,
3001
    const GUInt16 *pChunkRow3, const double *padfWeightsAligned,
3002
    int nSrcPixelCount, double &dfRes1, double &dfRes2, double &dfRes3)
3003
{
3004
    GDALResampleConvolutionHorizontal_3rows_SSE2(
30✔
3005
        pChunkRow1, pChunkRow2, pChunkRow3, padfWeightsAligned, nSrcPixelCount,
3006
        dfRes1, dfRes2, dfRes3);
3007
}
30✔
3008

3009
/************************************************************************/
3010
/*     GDALResampleConvolutionHorizontalPixelCountLess8_3rows_SSE2<T>   */
3011
/************************************************************************/
3012

3013
template <class T>
3014
static inline void GDALResampleConvolutionHorizontalPixelCountLess8_3rows_SSE2(
3,599,674✔
3015
    const T *pChunkRow1, const T *pChunkRow2, const T *pChunkRow3,
3016
    const double *padfWeightsAligned, int nSrcPixelCount, double &dfRes1,
3017
    double &dfRes2, double &dfRes3)
3018
{
3019
    XMMReg4Double v_acc1 = XMMReg4Double::Zero();
3,599,674✔
3020
    XMMReg4Double v_acc2 = XMMReg4Double::Zero();
3,598,436✔
3021
    XMMReg4Double v_acc3 = XMMReg4Double::Zero();
3,598,646✔
3022
    int i = 0;  // Use after for.
3,598,242✔
3023
    for (; i < nSrcPixelCount - 3; i += 4)
6,413,775✔
3024
    {
3025
        // Retrieve the pixel & accumulate.
3026
        const XMMReg4Double v_pixels1 = XMMReg4Double::Load4Val(pChunkRow1 + i);
2,814,740✔
3027
        const XMMReg4Double v_pixels2 = XMMReg4Double::Load4Val(pChunkRow2 + i);
2,815,710✔
3028
        const XMMReg4Double v_pixels3 = XMMReg4Double::Load4Val(pChunkRow3 + i);
2,815,950✔
3029
        const XMMReg4Double v_weight =
2,816,000✔
3030
            XMMReg4Double::Load4ValAligned(padfWeightsAligned + i);
2,816,000✔
3031

3032
        v_acc1 += v_pixels1 * v_weight;
2,814,800✔
3033
        v_acc2 += v_pixels2 * v_weight;
2,812,990✔
3034
        v_acc3 += v_pixels3 * v_weight;
2,813,700✔
3035
    }
3036

3037
    dfRes1 = v_acc1.GetHorizSum();
3,599,035✔
3038
    dfRes2 = v_acc2.GetHorizSum();
3,598,794✔
3039
    dfRes3 = v_acc3.GetHorizSum();
3,598,887✔
3040

3041
    for (; i < nSrcPixelCount; ++i)
7,981,124✔
3042
    {
3043
        dfRes1 += pChunkRow1[i] * padfWeightsAligned[i];
4,382,226✔
3044
        dfRes2 += pChunkRow2[i] * padfWeightsAligned[i];
4,382,226✔
3045
        dfRes3 += pChunkRow3[i] * padfWeightsAligned[i];
4,382,226✔
3046
    }
3047
}
3,598,908✔
3048

3049
/************************************************************************/
3050
/*     GDALResampleConvolutionHorizontalPixelCountLess8_3rows<GByte>    */
3051
/************************************************************************/
3052

3053
template <>
3054
inline void GDALResampleConvolutionHorizontalPixelCountLess8_3rows<GByte>(
3,532,550✔
3055
    const GByte *pChunkRow1, const GByte *pChunkRow2, const GByte *pChunkRow3,
3056
    const double *padfWeightsAligned, int nSrcPixelCount, double &dfRes1,
3057
    double &dfRes2, double &dfRes3)
3058
{
3059
    GDALResampleConvolutionHorizontalPixelCountLess8_3rows_SSE2(
3,532,550✔
3060
        pChunkRow1, pChunkRow2, pChunkRow3, padfWeightsAligned, nSrcPixelCount,
3061
        dfRes1, dfRes2, dfRes3);
3062
}
3,532,370✔
3063

3064
template <>
3065
inline void GDALResampleConvolutionHorizontalPixelCountLess8_3rows<GUInt16>(
66,800✔
3066
    const GUInt16 *pChunkRow1, const GUInt16 *pChunkRow2,
3067
    const GUInt16 *pChunkRow3, const double *padfWeightsAligned,
3068
    int nSrcPixelCount, double &dfRes1, double &dfRes2, double &dfRes3)
3069
{
3070
    GDALResampleConvolutionHorizontalPixelCountLess8_3rows_SSE2(
66,800✔
3071
        pChunkRow1, pChunkRow2, pChunkRow3, padfWeightsAligned, nSrcPixelCount,
3072
        dfRes1, dfRes2, dfRes3);
3073
}
67,018✔
3074

3075
/************************************************************************/
3076
/*     GDALResampleConvolutionHorizontalPixelCount4_3rows_SSE2<T>       */
3077
/************************************************************************/
3078

3079
template <class T>
3080
static inline void GDALResampleConvolutionHorizontalPixelCount4_3rows_SSE2(
13,903,230✔
3081
    const T *pChunkRow1, const T *pChunkRow2, const T *pChunkRow3,
3082
    const double *padfWeightsAligned, double &dfRes1, double &dfRes2,
3083
    double &dfRes3)
3084
{
3085
    const XMMReg4Double v_weight =
13,903,230✔
3086
        XMMReg4Double::Load4ValAligned(padfWeightsAligned);
3087

3088
    // Retrieve the pixel & accumulate.
3089
    const XMMReg4Double v_pixels1 = XMMReg4Double::Load4Val(pChunkRow1);
13,877,540✔
3090
    const XMMReg4Double v_pixels2 = XMMReg4Double::Load4Val(pChunkRow2);
13,905,380✔
3091
    const XMMReg4Double v_pixels3 = XMMReg4Double::Load4Val(pChunkRow3);
13,895,100✔
3092

3093
    XMMReg4Double v_acc1 = v_pixels1 * v_weight;
13,904,630✔
3094
    XMMReg4Double v_acc2 = v_pixels2 * v_weight;
13,820,390✔
3095
    XMMReg4Double v_acc3 = v_pixels3 * v_weight;
13,823,260✔
3096

3097
    dfRes1 = v_acc1.GetHorizSum();
13,849,350✔
3098
    dfRes2 = v_acc2.GetHorizSum();
13,853,520✔
3099
    dfRes3 = v_acc3.GetHorizSum();
13,882,770✔
3100
}
13,876,020✔
3101

3102
/************************************************************************/
3103
/*       GDALResampleConvolutionHorizontalPixelCount4_3rows<GByte>      */
3104
/************************************************************************/
3105

3106
template <>
3107
inline void GDALResampleConvolutionHorizontalPixelCount4_3rows<GByte>(
8,257,770✔
3108
    const GByte *pChunkRow1, const GByte *pChunkRow2, const GByte *pChunkRow3,
3109
    const double *padfWeightsAligned, double &dfRes1, double &dfRes2,
3110
    double &dfRes3)
3111
{
3112
    GDALResampleConvolutionHorizontalPixelCount4_3rows_SSE2(
8,257,770✔
3113
        pChunkRow1, pChunkRow2, pChunkRow3, padfWeightsAligned, dfRes1, dfRes2,
3114
        dfRes3);
3115
}
8,252,170✔
3116

3117
template <>
3118
inline void GDALResampleConvolutionHorizontalPixelCount4_3rows<GUInt16>(
5,645,210✔
3119
    const GUInt16 *pChunkRow1, const GUInt16 *pChunkRow2,
3120
    const GUInt16 *pChunkRow3, const double *padfWeightsAligned, double &dfRes1,
3121
    double &dfRes2, double &dfRes3)
3122
{
3123
    GDALResampleConvolutionHorizontalPixelCount4_3rows_SSE2(
5,645,210✔
3124
        pChunkRow1, pChunkRow2, pChunkRow3, padfWeightsAligned, dfRes1, dfRes2,
3125
        dfRes3);
3126
}
5,635,610✔
3127

3128
#endif  // USE_SSE2
3129

3130
/************************************************************************/
3131
/*                   GDALResampleChunk_ConvolutionT()                   */
3132
/************************************************************************/
3133

3134
template <class T, class Twork, GDALDataType eWrkDataType>
3135
static CPLErr GDALResampleChunk_ConvolutionT(
4,468✔
3136
    const GDALOverviewResampleArgs &args, const T *pChunk, void *pDstBuffer,
3137
    FilterFuncType pfnFilterFunc, FilterFunc4ValuesType pfnFilterFunc4Values,
3138
    int nKernelRadius, bool bKernelWithNegativeWeights, float fMaxVal)
3139

3140
{
3141
    const double dfXRatioDstToSrc = args.dfXRatioDstToSrc;
4,468✔
3142
    const double dfYRatioDstToSrc = args.dfYRatioDstToSrc;
4,468✔
3143
    const double dfSrcXDelta = args.dfSrcXDelta;
4,468✔
3144
    const double dfSrcYDelta = args.dfSrcYDelta;
4,468✔
3145
    constexpr int nBands = 1;
4,468✔
3146
    const GByte *pabyChunkNodataMask = args.pabyChunkNodataMask;
4,468✔
3147
    const int nChunkXOff = args.nChunkXOff;
4,468✔
3148
    const int nChunkXSize = args.nChunkXSize;
4,468✔
3149
    const int nChunkYOff = args.nChunkYOff;
4,468✔
3150
    const int nChunkYSize = args.nChunkYSize;
4,468✔
3151
    const int nDstXOff = args.nDstXOff;
4,468✔
3152
    const int nDstXOff2 = args.nDstXOff2;
4,468✔
3153
    const int nDstYOff = args.nDstYOff;
4,468✔
3154
    const int nDstYOff2 = args.nDstYOff2;
4,468✔
3155
    const bool bHasNoData = args.bHasNoData;
4,468✔
3156
    double dfNoDataValue = args.dfNoDataValue;
4,468✔
3157

3158
    if (!bHasNoData)
4,468✔
3159
        dfNoDataValue = 0.0;
4,390✔
3160
    const auto dstDataType = args.eOvrDataType;
4,468✔
3161
    const int nDstDataTypeSize = GDALGetDataTypeSizeBytes(dstDataType);
4,468✔
3162
    const double dfReplacementVal =
4,468✔
3163
        bHasNoData ? GDALGetNoDataReplacementValue(dstDataType, dfNoDataValue)
75✔
3164
                   : dfNoDataValue;
3165
    // cppcheck-suppress unreadVariable
3166
    const int isIntegerDT = GDALDataTypeIsInteger(dstDataType);
4,468✔
3167
    const bool bNoDataValueInt64Valid =
4,454✔
3168
        isIntegerDT && GDALIsValueExactAs<GInt64>(dfNoDataValue);
4,464✔
3169
    const auto nNodataValueInt64 =
4,454✔
3170
        bNoDataValueInt64Valid ? static_cast<GInt64>(dfNoDataValue) : 0;
3171
    constexpr int nWrkDataTypeSize = static_cast<int>(sizeof(Twork));
4,454✔
3172

3173
    // TODO: we should have some generic function to do this.
3174
    Twork fDstMin = cpl::NumericLimits<Twork>::lowest();
4,454✔
3175
    Twork fDstMax = cpl::NumericLimits<Twork>::max();
4,454✔
3176
    if (dstDataType == GDT_Byte)
4,454✔
3177
    {
3178
        fDstMin = std::numeric_limits<GByte>::min();
3,733✔
3179
        fDstMax = std::numeric_limits<GByte>::max();
3,729✔
3180
    }
3181
    else if (dstDataType == GDT_Int8)
725✔
3182
    {
3183
        fDstMin = std::numeric_limits<GInt8>::min();
1✔
3184
        fDstMax = std::numeric_limits<GInt8>::max();
1✔
3185
    }
3186
    else if (dstDataType == GDT_UInt16)
724✔
3187
    {
3188
        fDstMin = std::numeric_limits<GUInt16>::min();
382✔
3189
        fDstMax = std::numeric_limits<GUInt16>::max();
379✔
3190
    }
3191
    else if (dstDataType == GDT_Int16)
342✔
3192
    {
3193
        fDstMin = std::numeric_limits<GInt16>::min();
291✔
3194
        fDstMax = std::numeric_limits<GInt16>::max();
291✔
3195
    }
3196
    else if (dstDataType == GDT_UInt32)
51✔
3197
    {
3198
        fDstMin = static_cast<Twork>(std::numeric_limits<GUInt32>::min());
1✔
3199
        fDstMax = static_cast<Twork>(std::numeric_limits<GUInt32>::max());
1✔
3200
    }
3201
    else if (dstDataType == GDT_Int32)
50✔
3202
    {
3203
        // cppcheck-suppress unreadVariable
3204
        fDstMin = static_cast<Twork>(std::numeric_limits<GInt32>::min());
2✔
3205
        // cppcheck-suppress unreadVariable
3206
        fDstMax = static_cast<Twork>(std::numeric_limits<GInt32>::max());
2✔
3207
    }
3208
    else if (dstDataType == GDT_UInt64)
48✔
3209
    {
3210
        // cppcheck-suppress unreadVariable
3211
        fDstMin = static_cast<Twork>(std::numeric_limits<uint64_t>::min());
1✔
3212
        // cppcheck-suppress unreadVariable
3213
        // (1 << 64) - 2048: largest uint64 value a double can hold
3214
        fDstMax = static_cast<Twork>(18446744073709549568ULL);
1✔
3215
    }
3216
    else if (dstDataType == GDT_Int64)
47✔
3217
    {
3218
        // cppcheck-suppress unreadVariable
3219
        fDstMin = static_cast<Twork>(std::numeric_limits<int64_t>::min());
1✔
3220
        // cppcheck-suppress unreadVariable
3221
        // (1 << 63) - 1024: largest int64 that a double can hold
3222
        fDstMax = static_cast<Twork>(9223372036854774784LL);
1✔
3223
    }
3224

3225
    auto replaceValIfNodata = [bHasNoData, isIntegerDT, fDstMin, fDstMax,
36,936,309✔
3226
                               bNoDataValueInt64Valid, nNodataValueInt64,
3227
                               dfNoDataValue, dfReplacementVal](Twork fVal)
3228
    {
3229
        if (!bHasNoData)
16,021,300✔
3230
            return fVal;
11,838,800✔
3231

3232
        // Clamp value before comparing to nodata: this is only needed for
3233
        // kernels with negative weights (Lanczos)
3234
        Twork fClamped = fVal;
4,182,510✔
3235
        if (fClamped < fDstMin)
4,182,510✔
3236
            fClamped = fDstMin;
15,998✔
3237
        else if (fClamped > fDstMax)
4,166,510✔
3238
            fClamped = fDstMax;
16,406✔
3239
        if (isIntegerDT)
4,182,510✔
3240
        {
3241
            if (bNoDataValueInt64Valid)
4,193,840✔
3242
            {
3243
                const double fClampedRounded = std::round(fClamped);
4,212,380✔
3244
                if (fClampedRounded >=
8,432,080✔
3245
                        static_cast<Twork>(
3246
                            std::numeric_limits<int64_t>::min()) &&
8,393,700✔
3247
                    fClampedRounded <=
3248
                        static_cast<Twork>(9223372036854774784LL) &&
8,394,340✔
3249
                    nNodataValueInt64 ==
4,182,100✔
3250
                        static_cast<GInt64>(std::round(fClamped)))
4,192,460✔
3251
                {
3252
                    // Do not use the nodata value
3253
                    return static_cast<Twork>(dfReplacementVal);
14,435✔
3254
                }
3255
            }
3256
        }
UNCOV
3257
        else if (dfNoDataValue == fClamped)
×
3258
        {
3259
            // Do not use the nodata value
3260
            return static_cast<Twork>(dfReplacementVal);
1✔
3261
        }
3262
        return fClamped;
4,166,550✔
3263
    };
3264

3265
    /* -------------------------------------------------------------------- */
3266
    /*      Allocate work buffers.                                          */
3267
    /* -------------------------------------------------------------------- */
3268
    const int nDstXSize = nDstXOff2 - nDstXOff;
4,454✔
3269
    Twork *pafWrkScanline = nullptr;
4,454✔
3270
    if (dstDataType != eWrkDataType)
4,454✔
3271
    {
3272
        pafWrkScanline =
3273
            static_cast<Twork *>(VSI_MALLOC2_VERBOSE(nDstXSize, sizeof(Twork)));
4,414✔
3274
        if (pafWrkScanline == nullptr)
4,424✔
3275
            return CE_Failure;
×
3276
    }
3277

3278
    const double dfXScale = 1.0 / dfXRatioDstToSrc;
4,464✔
3279
    const double dfXScaleWeight = (dfXScale >= 1.0) ? 1.0 : dfXScale;
4,464✔
3280
    const double dfXScaledRadius = nKernelRadius / dfXScaleWeight;
4,464✔
3281
    const double dfYScale = 1.0 / dfYRatioDstToSrc;
4,464✔
3282
    const double dfYScaleWeight = (dfYScale >= 1.0) ? 1.0 : dfYScale;
4,464✔
3283
    const double dfYScaledRadius = nKernelRadius / dfYScaleWeight;
4,464✔
3284

3285
    // Temporary array to store result of horizontal filter.
3286
    double *padfHorizontalFiltered = static_cast<double *>(
3287
        VSI_MALLOC3_VERBOSE(nChunkYSize, nDstXSize, sizeof(double) * nBands));
4,464✔
3288

3289
    // To store convolution coefficients.
3290
    double *padfWeights = static_cast<double *>(VSI_MALLOC_ALIGNED_AUTO_VERBOSE(
4,470✔
3291
        static_cast<int>(2 + 2 * std::max(dfXScaledRadius, dfYScaledRadius) +
3292
                         0.5) *
3293
        sizeof(double)));
3294

3295
    GByte *pabyChunkNodataMaskHorizontalFiltered = nullptr;
4,464✔
3296
    if (pabyChunkNodataMask)
4,464✔
3297
        pabyChunkNodataMaskHorizontalFiltered =
3298
            static_cast<GByte *>(VSI_MALLOC2_VERBOSE(nChunkYSize, nDstXSize));
462✔
3299
    if (padfHorizontalFiltered == nullptr || padfWeights == nullptr ||
4,464✔
3300
        (pabyChunkNodataMask != nullptr &&
462✔
3301
         pabyChunkNodataMaskHorizontalFiltered == nullptr))
3302
    {
UNCOV
3303
        VSIFree(pafWrkScanline);
×
3304
        VSIFree(padfHorizontalFiltered);
×
3305
        VSIFreeAligned(padfWeights);
×
3306
        VSIFree(pabyChunkNodataMaskHorizontalFiltered);
×
3307
        return CE_Failure;
×
3308
    }
3309

3310
    /* ==================================================================== */
3311
    /*      First pass: horizontal filter                                   */
3312
    /* ==================================================================== */
3313
    const int nChunkRightXOff = nChunkXOff + nChunkXSize;
4,464✔
3314
#ifdef USE_SSE2
3315
    bool bSrcPixelCountLess8 = dfXScaledRadius < 4;
4,464✔
3316
#endif
3317
    for (int iDstPixel = nDstXOff; iDstPixel < nDstXOff2; ++iDstPixel)
2,919,167✔
3318
    {
3319
        const double dfSrcPixel =
2,914,689✔
3320
            (iDstPixel + 0.5) * dfXRatioDstToSrc + dfSrcXDelta;
2,914,689✔
3321
        int nSrcPixelStart =
2,914,689✔
3322
            static_cast<int>(floor(dfSrcPixel - dfXScaledRadius + 0.5));
2,914,689✔
3323
        if (nSrcPixelStart < nChunkXOff)
2,914,689✔
3324
            nSrcPixelStart = nChunkXOff;
56,676✔
3325
        int nSrcPixelStop =
2,914,689✔
3326
            static_cast<int>(dfSrcPixel + dfXScaledRadius + 0.5);
2,914,689✔
3327
        if (nSrcPixelStop > nChunkRightXOff)
2,914,689✔
3328
            nSrcPixelStop = nChunkRightXOff;
56,714✔
3329
#if 0
3330
        if( nSrcPixelStart < nChunkXOff && nChunkXOff > 0 )
3331
        {
3332
            printf( "truncated iDstPixel = %d\n", iDstPixel );/*ok*/
3333
        }
3334
        if( nSrcPixelStop > nChunkRightXOff && nChunkRightXOff < nSrcWidth )
3335
        {
3336
            printf( "truncated iDstPixel = %d\n", iDstPixel );/*ok*/
3337
        }
3338
#endif
3339
        const int nSrcPixelCount = nSrcPixelStop - nSrcPixelStart;
2,914,689✔
3340
        double dfWeightSum = 0.0;
2,914,689✔
3341

3342
        // Compute convolution coefficients.
3343
        int nSrcPixel = nSrcPixelStart;
2,914,689✔
3344
        double dfX = dfXScaleWeight * (nSrcPixel - dfSrcPixel + 0.5);
2,914,689✔
3345
        for (; nSrcPixel < nSrcPixelStop - 3; nSrcPixel += 4)
4,057,261✔
3346
        {
3347
            padfWeights[nSrcPixel - nSrcPixelStart] = dfX;
1,142,736✔
3348
            dfX += dfXScaleWeight;
1,142,736✔
3349
            padfWeights[nSrcPixel + 1 - nSrcPixelStart] = dfX;
1,142,736✔
3350
            dfX += dfXScaleWeight;
1,142,736✔
3351
            padfWeights[nSrcPixel + 2 - nSrcPixelStart] = dfX;
1,142,736✔
3352
            dfX += dfXScaleWeight;
1,142,736✔
3353
            padfWeights[nSrcPixel + 3 - nSrcPixelStart] = dfX;
1,142,736✔
3354
            dfX += dfXScaleWeight;
1,142,736✔
3355
            dfWeightSum +=
1,142,574✔
3356
                pfnFilterFunc4Values(padfWeights + nSrcPixel - nSrcPixelStart);
1,142,736✔
3357
        }
3358
        for (; nSrcPixel < nSrcPixelStop; ++nSrcPixel, dfX += dfXScaleWeight)
6,902,998✔
3359
        {
3360
            const double dfWeight = pfnFilterFunc(dfX);
3,988,561✔
3361
            padfWeights[nSrcPixel - nSrcPixelStart] = dfWeight;
3,988,478✔
3362
            dfWeightSum += dfWeight;
3,988,478✔
3363
        }
3364

3365
        const int nHeight = nChunkYSize * nBands;
2,914,437✔
3366
        if (pabyChunkNodataMask == nullptr)
2,914,437✔
3367
        {
3368
            if (dfWeightSum != 0)
2,826,713✔
3369
            {
3370
                const double dfInvWeightSum = 1.0 / dfWeightSum;
2,826,709✔
3371
                for (int i = 0; i < nSrcPixelCount; ++i)
10,736,857✔
3372
                    padfWeights[i] *= dfInvWeightSum;
7,910,145✔
3373
            }
3374
            int iSrcLineOff = 0;
2,826,713✔
3375
#ifdef USE_SSE2
3376
            if (nSrcPixelCount == 4)
2,826,713✔
3377
            {
3378
                for (; iSrcLineOff < nHeight - 2; iSrcLineOff += 3)
15,758,974✔
3379
                {
3380
                    const size_t j =
15,146,056✔
3381
                        static_cast<size_t>(iSrcLineOff) * nChunkXSize +
15,146,056✔
3382
                        (nSrcPixelStart - nChunkXOff);
15,146,056✔
3383
                    double dfVal1 = 0.0;
15,146,056✔
3384
                    double dfVal2 = 0.0;
15,146,056✔
3385
                    double dfVal3 = 0.0;
15,146,056✔
3386
                    GDALResampleConvolutionHorizontalPixelCount4_3rows(
15,146,056✔
3387
                        pChunk + j, pChunk + j + nChunkXSize,
15,146,056✔
3388
                        pChunk + j + 2 * nChunkXSize, padfWeights, dfVal1,
15,146,056✔
3389
                        dfVal2, dfVal3);
3390
                    padfHorizontalFiltered[static_cast<size_t>(iSrcLineOff) *
15,145,536✔
3391
                                               nDstXSize +
15,145,536✔
3392
                                           iDstPixel - nDstXOff] = dfVal1;
15,145,536✔
3393
                    padfHorizontalFiltered[(static_cast<size_t>(iSrcLineOff) +
15,145,536✔
3394
                                            1) *
15,145,536✔
3395
                                               nDstXSize +
15,145,536✔
3396
                                           iDstPixel - nDstXOff] = dfVal2;
15,145,536✔
3397
                    padfHorizontalFiltered[(static_cast<size_t>(iSrcLineOff) +
15,145,536✔
3398
                                            2) *
15,145,536✔
3399
                                               nDstXSize +
15,145,536✔
3400
                                           iDstPixel - nDstXOff] = dfVal3;
15,145,536✔
3401
                }
3402
            }
3403
            else if (bSrcPixelCountLess8)
2,213,279✔
3404
            {
3405
                for (; iSrcLineOff < nHeight - 2; iSrcLineOff += 3)
5,662,124✔
3406
                {
3407
                    const size_t j =
3,617,695✔
3408
                        static_cast<size_t>(iSrcLineOff) * nChunkXSize +
3,617,695✔
3409
                        (nSrcPixelStart - nChunkXOff);
3,617,695✔
3410
                    double dfVal1 = 0.0;
3,617,695✔
3411
                    double dfVal2 = 0.0;
3,617,695✔
3412
                    double dfVal3 = 0.0;
3,617,695✔
3413
                    GDALResampleConvolutionHorizontalPixelCountLess8_3rows(
3,617,695✔
3414
                        pChunk + j, pChunk + j + nChunkXSize,
3,617,695✔
3415
                        pChunk + j + 2 * nChunkXSize, padfWeights,
3,617,695✔
3416
                        nSrcPixelCount, dfVal1, dfVal2, dfVal3);
3417
                    padfHorizontalFiltered[static_cast<size_t>(iSrcLineOff) *
3,618,242✔
3418
                                               nDstXSize +
3,618,242✔
3419
                                           iDstPixel - nDstXOff] = dfVal1;
3,618,242✔
3420
                    padfHorizontalFiltered[(static_cast<size_t>(iSrcLineOff) +
3,618,242✔
3421
                                            1) *
3,618,242✔
3422
                                               nDstXSize +
3,618,242✔
3423
                                           iDstPixel - nDstXOff] = dfVal2;
3,618,242✔
3424
                    padfHorizontalFiltered[(static_cast<size_t>(iSrcLineOff) +
3,618,242✔
3425
                                            2) *
3,618,242✔
3426
                                               nDstXSize +
3,618,242✔
3427
                                           iDstPixel - nDstXOff] = dfVal3;
3,618,242✔
3428
                }
3429
            }
3430
            else
3431
#endif
3432
            {
3433
                for (; iSrcLineOff < nHeight - 2; iSrcLineOff += 3)
17,222,937✔
3434
                {
3435
                    const size_t j =
17,052,130✔
3436
                        static_cast<size_t>(iSrcLineOff) * nChunkXSize +
17,052,130✔
3437
                        (nSrcPixelStart - nChunkXOff);
17,052,130✔
3438
                    double dfVal1 = 0.0;
17,052,130✔
3439
                    double dfVal2 = 0.0;
17,052,130✔
3440
                    double dfVal3 = 0.0;
17,052,130✔
3441
                    GDALResampleConvolutionHorizontal_3rows(
17,052,130✔
3442
                        pChunk + j, pChunk + j + nChunkXSize,
17,052,130✔
3443
                        pChunk + j + 2 * nChunkXSize, padfWeights,
17,052,130✔
3444
                        nSrcPixelCount, dfVal1, dfVal2, dfVal3);
3445
                    padfHorizontalFiltered[static_cast<size_t>(iSrcLineOff) *
17,053,530✔
3446
                                               nDstXSize +
17,053,530✔
3447
                                           iDstPixel - nDstXOff] = dfVal1;
17,053,530✔
3448
                    padfHorizontalFiltered[(static_cast<size_t>(iSrcLineOff) +
17,053,530✔
3449
                                            1) *
17,053,530✔
3450
                                               nDstXSize +
17,053,530✔
3451
                                           iDstPixel - nDstXOff] = dfVal2;
17,053,530✔
3452
                    padfHorizontalFiltered[(static_cast<size_t>(iSrcLineOff) +
17,053,530✔
3453
                                            2) *
17,053,530✔
3454
                                               nDstXSize +
17,053,530✔
3455
                                           iDstPixel - nDstXOff] = dfVal3;
17,053,530✔
3456
                }
3457
            }
3458
            for (; iSrcLineOff < nHeight; ++iSrcLineOff)
5,860,740✔
3459
            {
3460
                const size_t j =
3,032,528✔
3461
                    static_cast<size_t>(iSrcLineOff) * nChunkXSize +
3,032,528✔
3462
                    (nSrcPixelStart - nChunkXOff);
3,032,528✔
3463
                const double dfVal = GDALResampleConvolutionHorizontal(
3,581,276✔
3464
                    pChunk + j, padfWeights, nSrcPixelCount);
593,558✔
3465
                padfHorizontalFiltered[static_cast<size_t>(iSrcLineOff) *
3,032,612✔
3466
                                           nDstXSize +
3,032,612✔
3467
                                       iDstPixel - nDstXOff] = dfVal;
3,032,612✔
3468
            }
3469
        }
3470
        else
3471
        {
3472
            for (int iSrcLineOff = 0; iSrcLineOff < nHeight; ++iSrcLineOff)
20,509,568✔
3473
            {
3474
                const size_t j =
20,420,046✔
3475
                    static_cast<size_t>(iSrcLineOff) * nChunkXSize +
20,420,046✔
3476
                    (nSrcPixelStart - nChunkXOff);
20,420,046✔
3477

3478
                if (bKernelWithNegativeWeights)
20,420,046✔
3479
                {
3480
                    int nConsecutiveValid = 0;
19,892,712✔
3481
                    int nMaxConsecutiveValid = 0;
19,892,712✔
3482
                    for (int k = 0; k < nSrcPixelCount; k++)
181,887,458✔
3483
                    {
3484
                        if (pabyChunkNodataMask[j + k])
161,990,146✔
3485
                            nConsecutiveValid++;
48,855,153✔
3486
                        else if (nConsecutiveValid)
113,134,793✔
3487
                        {
3488
                            nMaxConsecutiveValid = std::max(
112,625✔
3489
                                nMaxConsecutiveValid, nConsecutiveValid);
107,790✔
3490
                            nConsecutiveValid = 0;
112,625✔
3491
                        }
3492
                    }
3493
                    nMaxConsecutiveValid =
19,896,112✔
3494
                        std::max(nMaxConsecutiveValid, nConsecutiveValid);
19,897,512✔
3495
                    if (nMaxConsecutiveValid < nSrcPixelCount / 2)
19,896,112✔
3496
                    {
3497
                        const size_t nTempOffset =
13,314,907✔
3498
                            static_cast<size_t>(iSrcLineOff) * nDstXSize +
13,314,907✔
3499
                            iDstPixel - nDstXOff;
13,314,907✔
3500
                        padfHorizontalFiltered[nTempOffset] = 0.0;
13,314,907✔
3501
                        pabyChunkNodataMaskHorizontalFiltered[nTempOffset] = 0;
13,314,907✔
3502
                        continue;
13,314,907✔
3503
                    }
3504
                }
3505

3506
                double dfVal = 0.0;
7,108,569✔
3507
                GDALResampleConvolutionHorizontalWithMask(
7,108,569✔
3508
                    pChunk + j, pabyChunkNodataMask + j, padfWeights,
44,639✔
3509
                    nSrcPixelCount, dfVal, dfWeightSum);
3510
                const size_t nTempOffset =
7,106,706✔
3511
                    static_cast<size_t>(iSrcLineOff) * nDstXSize + iDstPixel -
7,106,706✔
3512
                    nDstXOff;
7,106,706✔
3513
                if (dfWeightSum > 0.0)
7,106,706✔
3514
                {
3515
                    padfHorizontalFiltered[nTempOffset] = dfVal / dfWeightSum;
7,054,866✔
3516
                    pabyChunkNodataMaskHorizontalFiltered[nTempOffset] = 1;
7,054,866✔
3517
                }
3518
                else
3519
                {
3520
                    padfHorizontalFiltered[nTempOffset] = 0.0;
51,906✔
3521
                    pabyChunkNodataMaskHorizontalFiltered[nTempOffset] = 0;
51,906✔
3522
                }
3523
            }
3524
        }
3525
    }
3526

3527
    /* ==================================================================== */
3528
    /*      Second pass: vertical filter                                    */
3529
    /* ==================================================================== */
3530
    const int nChunkBottomYOff = nChunkYOff + nChunkYSize;
4,473✔
3531

3532
    for (int iDstLine = nDstYOff; iDstLine < nDstYOff2; ++iDstLine)
266,297✔
3533
    {
3534
        Twork *const pafDstScanline =
261,824✔
3535
            pafWrkScanline
3536
                ? pafWrkScanline
261,824✔
3537
                : static_cast<Twork *>(pDstBuffer) +
8,421✔
3538
                      static_cast<size_t>(iDstLine - nDstYOff) * nDstXSize;
8,421✔
3539

3540
        const double dfSrcLine =
261,824✔
3541
            (iDstLine + 0.5) * dfYRatioDstToSrc + dfSrcYDelta;
261,824✔
3542
        int nSrcLineStart =
261,824✔
3543
            static_cast<int>(floor(dfSrcLine - dfYScaledRadius + 0.5));
261,824✔
3544
        int nSrcLineStop = static_cast<int>(dfSrcLine + dfYScaledRadius + 0.5);
261,824✔
3545
        if (nSrcLineStart < nChunkYOff)
261,824✔
3546
            nSrcLineStart = nChunkYOff;
2,815✔
3547
        if (nSrcLineStop > nChunkBottomYOff)
261,824✔
3548
            nSrcLineStop = nChunkBottomYOff;
2,859✔
3549
#if 0
3550
        if( nSrcLineStart < nChunkYOff &&
3551
            nChunkYOff > 0 )
3552
        {
3553
            printf( "truncated iDstLine = %d\n", iDstLine );/*ok*/
3554
        }
3555
        if( nSrcLineStop > nChunkBottomYOff && nChunkBottomYOff < nSrcHeight )
3556
        {
3557
            printf( "truncated iDstLine = %d\n", iDstLine );/*ok*/
3558
        }
3559
#endif
3560
        const int nSrcLineCount = nSrcLineStop - nSrcLineStart;
261,824✔
3561
        double dfWeightSum = 0.0;
261,824✔
3562

3563
        // Compute convolution coefficients.
3564
        int nSrcLine = nSrcLineStart;  // Used after for.
261,824✔
3565
        double dfY = dfYScaleWeight * (nSrcLine - dfSrcLine + 0.5);
261,824✔
3566
        for (; nSrcLine < nSrcLineStop - 3;
616,065✔
3567
             nSrcLine += 4, dfY += 4 * dfYScaleWeight)
354,241✔
3568
        {
3569
            padfWeights[nSrcLine - nSrcLineStart] = dfY;
354,245✔
3570
            padfWeights[nSrcLine + 1 - nSrcLineStart] = dfY + dfYScaleWeight;
354,245✔
3571
            padfWeights[nSrcLine + 2 - nSrcLineStart] =
354,245✔
3572
                dfY + 2 * dfYScaleWeight;
354,245✔
3573
            padfWeights[nSrcLine + 3 - nSrcLineStart] =
354,245✔
3574
                dfY + 3 * dfYScaleWeight;
354,245✔
3575
            dfWeightSum +=
354,241✔
3576
                pfnFilterFunc4Values(padfWeights + nSrcLine - nSrcLineStart);
354,245✔
3577
        }
3578
        for (; nSrcLine < nSrcLineStop; ++nSrcLine, dfY += dfYScaleWeight)
297,504✔
3579
        {
3580
            const double dfWeight = pfnFilterFunc(dfY);
35,694✔
3581
            padfWeights[nSrcLine - nSrcLineStart] = dfWeight;
35,684✔
3582
            dfWeightSum += dfWeight;
35,684✔
3583
        }
3584

3585
        if (pabyChunkNodataMask == nullptr)
261,810✔
3586
        {
3587
            if (dfWeightSum != 0)
222,823✔
3588
            {
3589
                const double dfInvWeightSum = 1.0 / dfWeightSum;
222,819✔
3590
                for (int i = 0; i < nSrcLineCount; ++i)
1,402,838✔
3591
                    padfWeights[i] *= dfInvWeightSum;
1,180,019✔
3592
            }
3593
        }
3594

3595
        if (pabyChunkNodataMask == nullptr)
261,810✔
3596
        {
3597
            int iFilteredPixelOff = 0;  // Used after for.
222,820✔
3598
            // j used after for.
3599
            size_t j =
222,820✔
3600
                (nSrcLineStart - nChunkYOff) * static_cast<size_t>(nDstXSize);
222,820✔
3601
#ifdef USE_SSE2
3602
            if constexpr (eWrkDataType == GDT_Float32)
3603
            {
3604
#ifdef __AVX__
3605
                for (; iFilteredPixelOff < nDstXSize - 15;
3606
                     iFilteredPixelOff += 16, j += 16)
3607
                {
3608
                    GDALResampleConvolutionVertical_16cols(
3609
                        padfHorizontalFiltered + j, nDstXSize, padfWeights,
3610
                        nSrcLineCount, pafDstScanline + iFilteredPixelOff);
3611
                    if (bHasNoData)
3612
                    {
3613
                        for (int k = 0; k < 16; k++)
3614
                        {
3615
                            pafDstScanline[iFilteredPixelOff + k] =
3616
                                replaceValIfNodata(
3617
                                    pafDstScanline[iFilteredPixelOff + k]);
3618
                        }
3619
                    }
3620
                }
3621
#else
3622
                for (; iFilteredPixelOff < nDstXSize - 7;
21,594,688✔
3623
                     iFilteredPixelOff += 8, j += 8)
3624
                {
3625
                    GDALResampleConvolutionVertical_8cols(
21,402,878✔
3626
                        padfHorizontalFiltered + j, nDstXSize, padfWeights,
21,402,878✔
3627
                        nSrcLineCount, pafDstScanline + iFilteredPixelOff);
21,402,878✔
3628
                    if (bHasNoData)
21,379,058✔
3629
                    {
3630
                        for (int k = 0; k < 8; k++)
123,192✔
3631
                        {
3632
                            pafDstScanline[iFilteredPixelOff + k] =
109,504✔
3633
                                replaceValIfNodata(
109,504✔
3634
                                    pafDstScanline[iFilteredPixelOff + k]);
109,504✔
3635
                        }
3636
                    }
3637
                }
3638
#endif
3639

3640
                for (; iFilteredPixelOff < nDstXSize; iFilteredPixelOff++, j++)
656,971✔
3641
                {
3642
                    const Twork fVal =
465,130✔
3643
                        static_cast<Twork>(GDALResampleConvolutionVertical(
465,164✔
3644
                            padfHorizontalFiltered + j, nDstXSize, padfWeights,
465,164✔
3645
                            nSrcLineCount));
3646
                    pafDstScanline[iFilteredPixelOff] =
465,175✔
3647
                        replaceValIfNodata(fVal);
465,130✔
3648
                }
3649
            }
3650
            else
3651
#endif
3652
            {
3653
                for (; iFilteredPixelOff < nDstXSize - 1;
2,887,210✔
3654
                     iFilteredPixelOff += 2, j += 2)
3655
                {
3656
                    double dfVal1 = 0.0;
2,880,000✔
3657
                    double dfVal2 = 0.0;
2,880,000✔
3658
                    GDALResampleConvolutionVertical_2cols(
2,880,000✔
3659
                        padfHorizontalFiltered + j, nDstXSize, padfWeights,
2,880,000✔
3660
                        nSrcLineCount, dfVal1, dfVal2);
3661
                    pafDstScanline[iFilteredPixelOff] =
5,760,010✔
3662
                        replaceValIfNodata(static_cast<Twork>(dfVal1));
2,880,000✔
3663
                    pafDstScanline[iFilteredPixelOff + 1] =
2,880,000✔
3664
                        replaceValIfNodata(static_cast<Twork>(dfVal2));
2,880,000✔
3665
                }
3666
                if (iFilteredPixelOff < nDstXSize)
7,206✔
3667
                {
3668
                    const double dfVal = GDALResampleConvolutionVertical(
2✔
3669
                        padfHorizontalFiltered + j, nDstXSize, padfWeights,
2✔
3670
                        nSrcLineCount);
3671
                    pafDstScanline[iFilteredPixelOff] =
2✔
3672
                        replaceValIfNodata(static_cast<Twork>(dfVal));
2✔
3673
                }
3674
            }
3675
        }
3676
        else
3677
        {
3678
            for (int iFilteredPixelOff = 0; iFilteredPixelOff < nDstXSize;
19,035,637✔
3679
                 ++iFilteredPixelOff)
3680
            {
3681
                double dfVal = 0.0;
18,979,733✔
3682
                dfWeightSum = 0.0;
18,979,733✔
3683
                size_t j = (nSrcLineStart - nChunkYOff) *
18,979,733✔
3684
                               static_cast<size_t>(nDstXSize) +
18,979,733✔
3685
                           iFilteredPixelOff;
18,979,733✔
3686
                if (bKernelWithNegativeWeights)
18,979,733✔
3687
                {
3688
                    int nConsecutiveValid = 0;
18,718,201✔
3689
                    int nMaxConsecutiveValid = 0;
18,718,201✔
3690
                    for (int i = 0; i < nSrcLineCount; ++i, j += nDstXSize)
132,765,321✔
3691
                    {
3692
                        const double dfWeight =
114,029,020✔
3693
                            padfWeights[i] *
114,029,020✔
3694
                            pabyChunkNodataMaskHorizontalFiltered[j];
3695
                        if (pabyChunkNodataMaskHorizontalFiltered[j])
114,029,020✔
3696
                        {
3697
                            nConsecutiveValid++;
48,495,137✔
3698
                        }
3699
                        else if (nConsecutiveValid)
65,534,383✔
3700
                        {
3701
                            nMaxConsecutiveValid = std::max(
222,031✔
3702
                                nMaxConsecutiveValid, nConsecutiveValid);
204,376✔
3703
                            nConsecutiveValid = 0;
222,031✔
3704
                        }
3705
                        dfVal += padfHorizontalFiltered[j] * dfWeight;
114,047,020✔
3706
                        dfWeightSum += dfWeight;
114,047,020✔
3707
                    }
3708
                    nMaxConsecutiveValid =
18,734,801✔
3709
                        std::max(nMaxConsecutiveValid, nConsecutiveValid);
18,735,901✔
3710
                    if (nMaxConsecutiveValid < nSrcLineCount / 2)
18,734,801✔
3711
                    {
3712
                        pafDstScanline[iFilteredPixelOff] =
9,246,271✔
3713
                            static_cast<Twork>(dfNoDataValue);
9,246,179✔
3714
                        continue;
9,246,271✔
3715
                    }
3716
                }
3717
                else
3718
                {
3719
                    for (int i = 0; i < nSrcLineCount; ++i, j += nDstXSize)
1,255,012✔
3720
                    {
3721
                        const double dfWeight =
993,504✔
3722
                            padfWeights[i] *
993,504✔
3723
                            pabyChunkNodataMaskHorizontalFiltered[j];
3724
                        dfVal += padfHorizontalFiltered[j] * dfWeight;
993,504✔
3725
                        dfWeightSum += dfWeight;
993,504✔
3726
                    }
3727
                }
3728
                if (dfWeightSum > 0.0)
9,750,002✔
3729
                {
3730
                    pafDstScanline[iFilteredPixelOff] = replaceValIfNodata(
9,717,051✔
3731
                        static_cast<Twork>(dfVal / dfWeightSum));
9,716,629✔
3732
                }
3733
                else
3734
                {
3735
                    pafDstScanline[iFilteredPixelOff] =
33,357✔
3736
                        static_cast<Twork>(dfNoDataValue);
33,333✔
3737
                }
3738
            }
3739
        }
3740

3741
        if (fMaxVal != 0.0f)
254,927✔
3742
        {
3743
            for (int i = 0; i < nDstXSize; ++i)
192,324✔
3744
            {
3745
                if (pafDstScanline[i] > fMaxVal)
192,088✔
3746
                    pafDstScanline[i] = fMaxVal;
96,022✔
3747
            }
3748
        }
3749

3750
        if (pafWrkScanline)
254,927✔
3751
        {
3752
            GDALCopyWords64(pafWrkScanline, eWrkDataType, nWrkDataTypeSize,
253,406✔
3753
                            static_cast<GByte *>(pDstBuffer) +
3754
                                static_cast<size_t>(iDstLine - nDstYOff) *
253,406✔
3755
                                    nDstXSize * nDstDataTypeSize,
253,406✔
3756
                            dstDataType, nDstDataTypeSize, nDstXSize);
3757
        }
3758
    }
3759

3760
    VSIFree(pafWrkScanline);
4,473✔
3761
    VSIFreeAligned(padfWeights);
4,473✔
3762
    VSIFree(padfHorizontalFiltered);
4,473✔
3763
    VSIFree(pabyChunkNodataMaskHorizontalFiltered);
4,473✔
3764

3765
    return CE_None;
4,473✔
3766
}
3767

3768
/************************************************************************/
/*                   GDALResampleChunk_Convolution()                    */
/************************************************************************/

// Entry point for the convolution-based overview resamplers (BILINEAR,
// CUBIC, CUBICSPLINE and LANCZOS, matched case-insensitively against
// args.pszResampling).
//
// It selects the filter kernel for the requested method, allocates the
// destination buffer in the overview data type, and forwards the work to the
// GDALResampleChunk_ConvolutionT instantiation matching args.eWrkDataType.
//
// Outputs:
//  - *ppDstBuffer: buffer of (nDstXOff2 - nDstXOff) x (nDstYOff2 - nDstYOff)
//    elements of args.eOvrDataType. It is not released here on any path.
//    NOTE(review): presumably the caller owns and frees it - confirm.
//  - *peDstBufferDataType: set to args.eOvrDataType once allocation succeeded.
//
// Returns CE_Failure if the resampling name is not one of the four above, if
// the allocation fails, or if args.eWrkDataType is not Byte, UInt16, Float32
// or Float64; otherwise returns whatever the typed implementation returns.
static CPLErr
GDALResampleChunk_Convolution(const GDALOverviewResampleArgs &args,
                              const void *pChunk, void **ppDstBuffer,
                              GDALDataType *peDstBufferDataType)
{
    const auto IsMethod = [&args](const char *pszName)
    { return EQUAL(args.pszResampling, pszName); };

    // Translate the resampling name into the warper algorithm enumeration.
    GDALResampleAlg eAlg;
    if (IsMethod("BILINEAR"))
    {
        eAlg = GRA_Bilinear;
    }
    else if (IsMethod("CUBIC"))
    {
        eAlg = GRA_Cubic;
    }
    else if (IsMethod("CUBICSPLINE"))
    {
        eAlg = GRA_CubicSpline;
    }
    else if (IsMethod("LANCZOS"))
    {
        eAlg = GRA_Lanczos;
    }
    else
    {
        CPLAssert(false);
        return CE_Failure;
    }

    // Only the cubic and Lanczos methods are flagged as having negative
    // weights; the typed implementation uses the flag in its nodata handling.
    const bool bNegativeWeights = (eAlg == GRA_Cubic || eAlg == GRA_Lanczos);

    const int nRadius = GWKGetFilterRadius(eAlg);
    FilterFuncType pfnKernel = GWKGetFilterFunc(eAlg);
    const FilterFunc4ValuesType pfnKernel4Values =
        GWKGetFilterFunc4Values(eAlg);

    // Non-bilinear kernels can overshoot. When NBITS is set to less than the
    // full width of an unsigned integer overview type, results are clamped to
    // 2^NBITS - 1. A value of 0 means "no clamping".
    float fClampMax = 0.f;
    const bool bUnsignedIntegerOvr = args.eOvrDataType == GDT_Byte ||
                                     args.eOvrDataType == GDT_UInt16 ||
                                     args.eOvrDataType == GDT_UInt32;
    if (eAlg != GRA_Bilinear && args.nOvrNBITS > 0 && bUnsignedIntegerOvr)
    {
        const int nBits = args.nOvrNBITS;
        // NBITS equal to the native bit depth needs no clamping, and
        // 1U << nBits is only evaluated for nBits below 32.
        if (nBits != GDALGetDataTypeSizeBits(args.eOvrDataType) && nBits < 32)
        {
            fClampMax = static_cast<float>((1U << nBits) - 1);
        }
    }

    // Allocate the output window in the overview data type.
    const auto nOutCols = args.nDstXOff2 - args.nDstXOff;
    const auto nOutRows = args.nDstYOff2 - args.nDstYOff;
    void *const pOutBuffer = VSI_MALLOC3_VERBOSE(
        nOutCols, nOutRows, GDALGetDataTypeSizeBytes(args.eOvrDataType));
    *ppDstBuffer = pOutBuffer;
    if (pOutBuffer == nullptr)
        return CE_Failure;
    *peDstBufferDataType = args.eOvrDataType;

    // Dispatch on the working data type. Byte, UInt16 and Float32 inputs are
    // all filtered with a Float32 working type; Float64 works in Float64.
    switch (args.eWrkDataType)
    {
        case GDT_Byte:
            return GDALResampleChunk_ConvolutionT<GByte, float, GDT_Float32>(
                args, static_cast<const GByte *>(pChunk), pOutBuffer,
                pfnKernel, pfnKernel4Values, nRadius, bNegativeWeights,
                fClampMax);

        case GDT_UInt16:
            return GDALResampleChunk_ConvolutionT<GUInt16, float, GDT_Float32>(
                args, static_cast<const GUInt16 *>(pChunk), pOutBuffer,
                pfnKernel, pfnKernel4Values, nRadius, bNegativeWeights,
                fClampMax);

        case GDT_Float32:
            return GDALResampleChunk_ConvolutionT<float, float, GDT_Float32>(
                args, static_cast<const float *>(pChunk), pOutBuffer,
                pfnKernel, pfnKernel4Values, nRadius, bNegativeWeights,
                fClampMax);

        case GDT_Float64:
            return GDALResampleChunk_ConvolutionT<double, double, GDT_Float64>(
                args, static_cast<const double *>(pChunk), pOutBuffer,
                pfnKernel, pfnKernel4Values, nRadius, bNegativeWeights,
                fClampMax);

        default:
            // Any other working type is unexpected for this code path.
            CPLAssert(false);
            return CE_Failure;
    }
}
3863

3864
/************************************************************************/
3865
/*                       GDALResampleChunkC32R()                        */
3866
/************************************************************************/
3867

3868
static CPLErr GDALResampleChunkC32R(const int nSrcWidth, const int nSrcHeight,
2✔
3869
                                    const float *pafChunk, const int nChunkYOff,
3870
                                    const int nChunkYSize, const int nDstYOff,
3871
                                    const int nDstYOff2, const int nOvrXSize,
3872
                                    const int nOvrYSize, void **ppDstBuffer,
3873
                                    GDALDataType *peDstBufferDataType,
3874
                                    const char *pszResampling)
3875

3876
{
3877
    enum Method
3878
    {
3879
        NEAR,
3880
        AVERAGE,
3881
        AVERAGE_MAGPHASE,
3882
        RMS,
3883
    };
3884

3885
    Method eMethod = NEAR;
2✔
3886
    if (STARTS_WITH_CI(pszResampling, "NEAR"))
2✔
3887
    {
3888
        eMethod = NEAR;
×
3889
    }
3890
    else if (EQUAL(pszResampling, "AVERAGE_MAGPHASE"))
2✔
3891
    {
3892
        eMethod = AVERAGE_MAGPHASE;
×
3893
    }
3894
    else if (EQUAL(pszResampling, "RMS"))
2✔
3895
    {
3896
        eMethod = RMS;
2✔
3897
    }
3898
    else if (STARTS_WITH_CI(pszResampling, "AVER"))
×
3899
    {
3900
        eMethod = AVERAGE;
×
3901
    }
3902
    else
3903
    {
3904
        CPLError(
×
3905
            CE_Failure, CPLE_NotSupported,
3906
            "Resampling method %s is not supported for complex data types. "
3907
            "Only NEAREST, AVERAGE, AVERAGE_MAGPHASE and RMS are supported",
3908
            pszResampling);
3909
        return CE_Failure;
×
3910
    }
3911

3912
    const int nOXSize = nOvrXSize;
2✔
3913
    *ppDstBuffer = VSI_MALLOC3_VERBOSE(nOXSize, nDstYOff2 - nDstYOff,
2✔
3914
                                       GDALGetDataTypeSizeBytes(GDT_CFloat32));
3915
    if (*ppDstBuffer == nullptr)
2✔
3916
    {
3917
        return CE_Failure;
×
3918
    }
3919
    float *const pafDstBuffer = static_cast<float *>(*ppDstBuffer);
2✔
3920
    *peDstBufferDataType = GDT_CFloat32;
2✔
3921

3922
    const int nOYSize = nOvrYSize;
2✔
3923
    const double dfXRatioDstToSrc = static_cast<double>(nSrcWidth) / nOXSize;
2✔
3924
    const double dfYRatioDstToSrc = static_cast<double>(nSrcHeight) / nOYSize;
2✔
3925

3926
    /* ==================================================================== */
3927
    /*      Loop over destination scanlines.                                */
3928
    /* ==================================================================== */
3929
    for (int iDstLine = nDstYOff; iDstLine < nDstYOff2; ++iDstLine)
8✔
3930
    {
3931
        int nSrcYOff = static_cast<int>(0.5 + iDstLine * dfYRatioDstToSrc);
6✔
3932
        if (nSrcYOff < nChunkYOff)
6✔
3933
            nSrcYOff = nChunkYOff;
×
3934

3935
        int nSrcYOff2 =
6✔
3936
            static_cast<int>(0.5 + (iDstLine + 1) * dfYRatioDstToSrc);
6✔
3937
        if (nSrcYOff2 == nSrcYOff)
6✔
3938
            nSrcYOff2++;
×
3939

3940
        if (nSrcYOff2 > nSrcHeight || iDstLine == nOYSize - 1)
6✔
3941
        {
3942
            if (nSrcYOff == nSrcHeight && nSrcHeight - 1 >= nChunkYOff)
2✔
3943
                nSrcYOff = nSrcHeight - 1;
×
3944
            nSrcYOff2 = nSrcHeight;
2✔
3945
        }
3946
        if (nSrcYOff2 > nChunkYOff + nChunkYSize)
6✔
3947
            nSrcYOff2 = nChunkYOff + nChunkYSize;
×
3948

3949
        const float *const pafSrcScanline =
6✔
3950
            pafChunk +
6✔
3951
            (static_cast<size_t>(nSrcYOff - nChunkYOff) * nSrcWidth) * 2;
6✔
3952
        float *const pafDstScanline =
6✔
3953
            pafDstBuffer +
6✔
3954
            static_cast<size_t>(iDstLine - nDstYOff) * 2 * nOXSize;
6✔
3955

3956
        /* --------------------------------------------------------------------
3957
         */
3958
        /*      Loop over destination pixels */
3959
        /* --------------------------------------------------------------------
3960
         */
3961
        for (int iDstPixel = 0; iDstPixel < nOXSize; ++iDstPixel)
18✔
3962
        {
3963
            const size_t iDstPixelSZ = static_cast<size_t>(iDstPixel);
12✔
3964
            int nSrcXOff = static_cast<int>(0.5 + iDstPixel * dfXRatioDstToSrc);
12✔
3965
            int nSrcXOff2 =
12✔
3966
                static_cast<int>(0.5 + (iDstPixel + 1) * dfXRatioDstToSrc);
12✔
3967
            if (nSrcXOff2 == nSrcXOff)
12✔
3968
                nSrcXOff2++;
×
3969
            if (nSrcXOff2 > nSrcWidth || iDstPixel == nOXSize - 1)
12✔
3970
            {
3971
                if (nSrcXOff == nSrcWidth && nSrcWidth - 1 >= 0)
6✔
3972
                    nSrcXOff = nSrcWidth - 1;
×
3973
                nSrcXOff2 = nSrcWidth;
6✔
3974
            }
3975
            const size_t nSrcXOffSZ = static_cast<size_t>(nSrcXOff);
12✔
3976

3977
            if (eMethod == NEAR)
12✔
3978
            {
3979
                pafDstScanline[iDstPixelSZ * 2] =
×
3980
                    pafSrcScanline[nSrcXOffSZ * 2];
×
3981
                pafDstScanline[iDstPixelSZ * 2 + 1] =
×
3982
                    pafSrcScanline[nSrcXOffSZ * 2 + 1];
×
3983
            }
3984
            else if (eMethod == AVERAGE_MAGPHASE)
12✔
3985
            {
3986
                double dfTotalR = 0.0;
×
3987
                double dfTotalI = 0.0;
×
3988
                double dfTotalM = 0.0;
×
3989
                size_t nCount = 0;
×
3990

3991
                for (int iY = nSrcYOff; iY < nSrcYOff2; ++iY)
×
3992
                {
3993
                    for (int iX = nSrcXOff; iX < nSrcXOff2; ++iX)
×
3994
                    {
3995
                        const double dfR =
×
3996
                            pafSrcScanline[static_cast<size_t>(iX) * 2 +
×
3997
                                           static_cast<size_t>(iY - nSrcYOff) *
×
3998
                                               nSrcWidth * 2];
×
3999
                        const double dfI =
×
4000
                            pafSrcScanline[static_cast<size_t>(iX) * 2 +
×
4001
                                           static_cast<size_t>(iY - nSrcYOff) *
×
4002
                                               nSrcWidth * 2 +
×
4003
                                           1];
×
4004
                        dfTotalR += dfR;
×
4005
                        dfTotalI += dfI;
×
4006
                        dfTotalM += std::hypot(dfR, dfI);
×
4007
                        ++nCount;
×
4008
                    }
4009
                }
4010

4011
                CPLAssert(nCount > 0);
×
4012
                if (nCount == 0)
×
4013
                {
4014
                    pafDstScanline[iDstPixelSZ * 2] = 0.0;
×
4015
                    pafDstScanline[iDstPixelSZ * 2 + 1] = 0.0;
×
4016
                }
4017
                else
4018
                {
4019
                    pafDstScanline[iDstPixelSZ * 2] = static_cast<float>(
×
4020
                        dfTotalR / static_cast<double>(nCount));
×
4021
                    pafDstScanline[iDstPixelSZ * 2 + 1] = static_cast<float>(
×
4022
                        dfTotalI / static_cast<double>(nCount));
×
4023
                    const double dfM =
4024
                        std::hypot(pafDstScanline[iDstPixelSZ * 2],
×
4025
                                   pafDstScanline[iDstPixelSZ * 2 + 1]);
×
4026
                    const double dfDesiredM =
×
4027
                        dfTotalM / static_cast<double>(nCount);
×
4028
                    double dfRatio = 1.0;
×
4029
                    if (dfM != 0.0)
×
4030
                        dfRatio = dfDesiredM / dfM;
×
4031

4032
                    pafDstScanline[iDstPixelSZ * 2] *=
×
4033
                        static_cast<float>(dfRatio);
×
4034
                    pafDstScanline[iDstPixelSZ * 2 + 1] *=
×
4035
                        static_cast<float>(dfRatio);
×
4036
                }
4037
            }
4038
            else if (eMethod == RMS)
12✔
4039
            {
4040
                double dfTotalR = 0.0;
12✔
4041
                double dfTotalI = 0.0;
12✔
4042
                size_t nCount = 0;
12✔
4043

4044
                for (int iY = nSrcYOff; iY < nSrcYOff2; ++iY)
36✔
4045
                {
4046
                    for (int iX = nSrcXOff; iX < nSrcXOff2; ++iX)
72✔
4047
                    {
4048
                        const double dfR =
48✔
4049
                            pafSrcScanline[static_cast<size_t>(iX) * 2 +
48✔
4050
                                           static_cast<size_t>(iY - nSrcYOff) *
48✔
4051
                                               nSrcWidth * 2];
48✔
4052
                        const double dfI =
48✔
4053
                            pafSrcScanline[static_cast<size_t>(iX) * 2 +
48✔
4054
                                           static_cast<size_t>(iY - nSrcYOff) *
48✔
4055
                                               nSrcWidth * 2 +
48✔
4056
                                           1];
48✔
4057

4058
                        dfTotalR += SQUARE(dfR);
48✔
4059
                        dfTotalI += SQUARE(dfI);
48✔
4060

4061
                        ++nCount;
48✔
4062
                    }
4063
                }
4064

4065
                CPLAssert(nCount > 0);
12✔
4066
                if (nCount == 0)
12✔
4067
                {
4068
                    pafDstScanline[iDstPixelSZ * 2] = 0.0;
×
4069
                    pafDstScanline[iDstPixelSZ * 2 + 1] = 0.0;
×
4070
                }
4071
                else
4072
                {
4073
                    /* compute RMS */
4074
                    pafDstScanline[iDstPixelSZ * 2] = static_cast<float>(
12✔
4075
                        sqrt(dfTotalR / static_cast<double>(nCount)));
12✔
4076
                    pafDstScanline[iDstPixelSZ * 2 + 1] = static_cast<float>(
12✔
4077
                        sqrt(dfTotalI / static_cast<double>(nCount)));
12✔
4078
                }
4079
            }
4080
            else if (eMethod == AVERAGE)
×
4081
            {
4082
                double dfTotalR = 0.0;
×
4083
                double dfTotalI = 0.0;
×
4084
                size_t nCount = 0;
×
4085

4086
                for (int iY = nSrcYOff; iY < nSrcYOff2; ++iY)
×
4087
                {
4088
                    for (int iX = nSrcXOff; iX < nSrcXOff2; ++iX)
×
4089
                    {
4090
                        // TODO(schwehr): Maybe use std::complex?
4091
                        dfTotalR +=
×
4092
                            pafSrcScanline[static_cast<size_t>(iX) * 2 +
×
4093
                                           static_cast<size_t>(iY - nSrcYOff) *
×
4094
                                               nSrcWidth * 2];
×
4095
                        dfTotalI +=
×
4096
                            pafSrcScanline[static_cast<size_t>(iX) * 2 +
×
4097
                                           static_cast<size_t>(iY - nSrcYOff) *
×
4098
                                               nSrcWidth * 2 +
×
4099
                                           1];
×
4100
                        ++nCount;
×
4101
                    }
4102
                }
4103

4104
                CPLAssert(nCount > 0);
×
4105
                if (nCount == 0)
×
4106
                {
4107
                    pafDstScanline[iDstPixelSZ * 2] = 0.0;
×
4108
                    pafDstScanline[iDstPixelSZ * 2 + 1] = 0.0;
×
4109
                }
4110
                else
4111
                {
4112
                    pafDstScanline[iDstPixelSZ * 2] = static_cast<float>(
×
4113
                        dfTotalR / static_cast<double>(nCount));
×
4114
                    pafDstScanline[iDstPixelSZ * 2 + 1] = static_cast<float>(
×
4115
                        dfTotalI / static_cast<double>(nCount));
×
4116
                }
4117
            }
4118
        }
4119
    }
4120

4121
    return CE_None;
2✔
4122
}
4123

4124
/************************************************************************/
4125
/*                  GDALRegenerateCascadingOverviews()                  */
4126
/*                                                                      */
4127
/*      Generate a list of overviews in order from largest to           */
4128
/*      smallest, computing each from the next larger.                  */
4129
/************************************************************************/
4130

4131
static CPLErr GDALRegenerateCascadingOverviews(
44✔
4132
    GDALRasterBand *poSrcBand, int nOverviews, GDALRasterBand **papoOvrBands,
4133
    const char *pszResampling, GDALProgressFunc pfnProgress,
4134
    void *pProgressData, CSLConstList papszOptions)
4135

4136
{
4137
    /* -------------------------------------------------------------------- */
4138
    /*      First, we must put the overviews in order from largest to       */
4139
    /*      smallest.                                                       */
4140
    /* -------------------------------------------------------------------- */
4141
    for (int i = 0; i < nOverviews - 1; ++i)
127✔
4142
    {
4143
        for (int j = 0; j < nOverviews - i - 1; ++j)
292✔
4144
        {
4145
            if (papoOvrBands[j]->GetXSize() *
209✔
4146
                    static_cast<float>(papoOvrBands[j]->GetYSize()) <
209✔
4147
                papoOvrBands[j + 1]->GetXSize() *
209✔
4148
                    static_cast<float>(papoOvrBands[j + 1]->GetYSize()))
209✔
4149
            {
4150
                GDALRasterBand *poTempBand = papoOvrBands[j];
×
4151
                papoOvrBands[j] = papoOvrBands[j + 1];
×
4152
                papoOvrBands[j + 1] = poTempBand;
×
4153
            }
4154
        }
4155
    }
4156

4157
    /* -------------------------------------------------------------------- */
4158
    /*      Count total pixels so we can prepare appropriate scaled         */
4159
    /*      progress functions.                                             */
4160
    /* -------------------------------------------------------------------- */
4161
    double dfTotalPixels = 0.0;
44✔
4162

4163
    for (int i = 0; i < nOverviews; ++i)
171✔
4164
    {
4165
        dfTotalPixels += papoOvrBands[i]->GetXSize() *
127✔
4166
                         static_cast<double>(papoOvrBands[i]->GetYSize());
127✔
4167
    }
4168

4169
    /* -------------------------------------------------------------------- */
4170
    /*      Generate all the bands.                                         */
4171
    /* -------------------------------------------------------------------- */
4172
    double dfPixelsProcessed = 0.0;
44✔
4173

4174
    for (int i = 0; i < nOverviews; ++i)
171✔
4175
    {
4176
        GDALRasterBand *poBaseBand = poSrcBand;
127✔
4177
        if (i != 0)
127✔
4178
            poBaseBand = papoOvrBands[i - 1];
83✔
4179

4180
        double dfPixels = papoOvrBands[i]->GetXSize() *
127✔
4181
                          static_cast<double>(papoOvrBands[i]->GetYSize());
127✔
4182

4183
        void *pScaledProgressData = GDALCreateScaledProgress(
254✔
4184
            dfPixelsProcessed / dfTotalPixels,
4185
            (dfPixelsProcessed + dfPixels) / dfTotalPixels, pfnProgress,
127✔
4186
            pProgressData);
4187

4188
        const CPLErr eErr = GDALRegenerateOverviewsEx(
254✔
4189
            poBaseBand, 1,
4190
            reinterpret_cast<GDALRasterBandH *>(papoOvrBands) + i,
127✔
4191
            pszResampling, GDALScaledProgress, pScaledProgressData,
4192
            papszOptions);
4193
        GDALDestroyScaledProgress(pScaledProgressData);
127✔
4194

4195
        if (eErr != CE_None)
127✔
4196
            return eErr;
×
4197

4198
        dfPixelsProcessed += dfPixels;
127✔
4199

4200
        // Only do the bit2grayscale promotion on the base band.
4201
        if (STARTS_WITH_CI(pszResampling,
127✔
4202
                           "AVERAGE_BIT2G" /* AVERAGE_BIT2GRAYSCALE */))
4203
            pszResampling = "AVERAGE";
8✔
4204
    }
4205

4206
    return CE_None;
44✔
4207
}
4208

4209
/************************************************************************/
4210
/*                    GDALGetResampleFunction()                         */
4211
/************************************************************************/
4212

4213
GDALResampleFunction GDALGetResampleFunction(const char *pszResampling,
4,943✔
4214
                                             int *pnRadius)
4215
{
4216
    if (pnRadius)
4,943✔
4217
        *pnRadius = 0;
4,943✔
4218
    if (STARTS_WITH_CI(pszResampling, "NEAR"))
4,943✔
4219
        return GDALResampleChunk_Near;
500✔
4220
    else if (STARTS_WITH_CI(pszResampling, "AVER") ||
4,443✔
4221
             EQUAL(pszResampling, "RMS"))
3,900✔
4222
        return GDALResampleChunk_AverageOrRMS;
570✔
4223
    else if (EQUAL(pszResampling, "GAUSS"))
3,873✔
4224
    {
4225
        if (pnRadius)
26✔
4226
            *pnRadius = 1;
26✔
4227
        return GDALResampleChunk_Gauss;
26✔
4228
    }
4229
    else if (EQUAL(pszResampling, "MODE"))
3,847✔
4230
        return GDALResampleChunk_Mode;
96✔
4231
    else if (EQUAL(pszResampling, "CUBIC"))
3,751✔
4232
    {
4233
        if (pnRadius)
1,339✔
4234
            *pnRadius = GWKGetFilterRadius(GRA_Cubic);
1,339✔
4235
        return GDALResampleChunk_Convolution;
1,341✔
4236
    }
4237
    else if (EQUAL(pszResampling, "CUBICSPLINE"))
2,412✔
4238
    {
4239
        if (pnRadius)
3✔
4240
            *pnRadius = GWKGetFilterRadius(GRA_CubicSpline);
3✔
4241
        return GDALResampleChunk_Convolution;
3✔
4242
    }
4243
    else if (EQUAL(pszResampling, "LANCZOS"))
2,409✔
4244
    {
4245
        if (pnRadius)
8✔
4246
            *pnRadius = GWKGetFilterRadius(GRA_Lanczos);
8✔
4247
        return GDALResampleChunk_Convolution;
8✔
4248
    }
4249
    else if (EQUAL(pszResampling, "BILINEAR"))
2,401✔
4250
    {
4251
        if (pnRadius)
2,398✔
4252
            *pnRadius = GWKGetFilterRadius(GRA_Bilinear);
2,398✔
4253
        return GDALResampleChunk_Convolution;
2,398✔
4254
    }
4255
    else
4256
    {
4257
        CPLError(
3✔
4258
            CE_Failure, CPLE_AppDefined,
4259
            "GDALGetResampleFunction: Unsupported resampling method \"%s\".",
4260
            pszResampling);
4261
        return nullptr;
×
4262
    }
4263
}
4264

4265
/************************************************************************/
4266
/*                      GDALGetOvrWorkDataType()                        */
4267
/************************************************************************/
4268

4269
GDALDataType GDALGetOvrWorkDataType(const char *pszResampling,
4,819✔
4270
                                    GDALDataType eSrcDataType)
4271
{
4272
    if (STARTS_WITH_CI(pszResampling, "NEAR") || EQUAL(pszResampling, "MODE"))
4,819✔
4273
    {
4274
        return eSrcDataType;
585✔
4275
    }
4276
    else if (eSrcDataType == GDT_Byte &&
4,234✔
4277
             (STARTS_WITH_CI(pszResampling, "AVER") ||
3,894✔
4278
              EQUAL(pszResampling, "RMS") || EQUAL(pszResampling, "CUBIC") ||
3,422✔
4279
              EQUAL(pszResampling, "CUBICSPLINE") ||
2,274✔
4280
              EQUAL(pszResampling, "LANCZOS") ||
2,271✔
4281
              EQUAL(pszResampling, "BILINEAR") || EQUAL(pszResampling, "MODE")))
2,266✔
4282
    {
4283
        return GDT_Byte;
3,887✔
4284
    }
4285
    else if (eSrcDataType == GDT_UInt16 &&
347✔
4286
             (STARTS_WITH_CI(pszResampling, "AVER") ||
119✔
4287
              EQUAL(pszResampling, "RMS") || EQUAL(pszResampling, "CUBIC") ||
115✔
4288
              EQUAL(pszResampling, "CUBICSPLINE") ||
3✔
4289
              EQUAL(pszResampling, "LANCZOS") ||
3✔
4290
              EQUAL(pszResampling, "BILINEAR") || EQUAL(pszResampling, "MODE")))
2✔
4291
    {
4292
        return GDT_UInt16;
111✔
4293
    }
4294
    else if (EQUAL(pszResampling, "GAUSS"))
236✔
4295
        return GDT_Float64;
20✔
4296

4297
    if (eSrcDataType == GDT_Byte || eSrcDataType == GDT_Int8 ||
216✔
4298
        eSrcDataType == GDT_UInt16 || eSrcDataType == GDT_Int16 ||
204✔
4299
        eSrcDataType == GDT_Float32)
4300
    {
4301
        return GDT_Float32;
174✔
4302
    }
4303
    return GDT_Float64;
42✔
4304
}
4305

4306
namespace
4307
{
4308
// Structure to hold a pointer to free with CPLFree()
4309
struct PointerHolder
4310
{
4311
    void *ptr = nullptr;
4312

4313
    explicit PointerHolder(void *ptrIn) : ptr(ptrIn)
5,792✔
4314
    {
4315
    }
5,792✔
4316

4317
    ~PointerHolder()
5,792✔
4318
    {
5,792✔
4319
        CPLFree(ptr);
5,792✔
4320
    }
5,792✔
4321

4322
    PointerHolder(const PointerHolder &) = delete;
4323
    PointerHolder &operator=(const PointerHolder &) = delete;
4324
};
4325
}  // namespace
4326

4327
/************************************************************************/
4328
/*                      GDALRegenerateOverviews()                       */
4329
/************************************************************************/
4330

4331
/**
4332
 * \brief Generate downsampled overviews.
4333
 *
4334
 * This function will generate one or more overview images from a base image
4335
 * using the requested downsampling algorithm.  Its primary use is for
4336
 * generating overviews via GDALDataset::BuildOverviews(), but it can also be
4337
 * used to generate downsampled images in one file from another outside the
4338
 * overview architecture.
4339
 *
4340
 * The output bands need to exist in advance.
4341
 *
4342
 * The full set of resampling algorithms is documented in
4343
 * GDALDataset::BuildOverviews().
4344
 *
4345
 * This function will honour properly NODATA_VALUES tuples (special dataset
4346
 * metadata) so that only a given RGB triplet (in case of a RGB image) will be
4347
 * considered as the nodata value and not each value of the triplet
4348
 * independently per band.
4349
 *
4350
 * Starting with GDAL 3.2, the GDAL_NUM_THREADS configuration option can be set
4351
 * to "ALL_CPUS" or a integer value to specify the number of threads to use for
4352
 * overview computation.
4353
 *
4354
 * @param hSrcBand the source (base level) band.
4355
 * @param nOverviewCount the number of downsampled bands being generated.
4356
 * @param pahOvrBands the list of downsampled bands to be generated.
4357
 * @param pszResampling Resampling algorithm (e.g. "AVERAGE").
4358
 * @param pfnProgress progress report function.
4359
 * @param pProgressData progress function callback data.
4360
 * @return CE_None on success or CE_Failure on failure.
4361
 */
4362
CPLErr GDALRegenerateOverviews(GDALRasterBandH hSrcBand, int nOverviewCount,
250✔
4363
                               GDALRasterBandH *pahOvrBands,
4364
                               const char *pszResampling,
4365
                               GDALProgressFunc pfnProgress,
4366
                               void *pProgressData)
4367

4368
{
4369
    return GDALRegenerateOverviewsEx(hSrcBand, nOverviewCount, pahOvrBands,
250✔
4370
                                     pszResampling, pfnProgress, pProgressData,
4371
                                     nullptr);
250✔
4372
}
4373

4374
/************************************************************************/
4375
/*                     GDALRegenerateOverviewsEx()                      */
4376
/************************************************************************/
4377

4378
// Multiplier turning a radius into the corresponding diameter. Used when
// sizing the source chunk height from the overview factor and kernel radius.
constexpr int RADIUS_TO_DIAMETER = 2;
4379

4380
/**
4381
 * \brief Generate downsampled overviews.
4382
 *
4383
 * This function will generate one or more overview images from a base image
4384
 * using the requested downsampling algorithm.  Its primary use is for
4385
 * generating overviews via GDALDataset::BuildOverviews(), but it can also be
4386
 * used to generate downsampled images in one file from another outside the
4387
 * overview architecture.
4388
 *
4389
 * The output bands need to exist in advance.
4390
 *
4391
 * The full set of resampling algorithms is documented in
4392
 * GDALDataset::BuildOverviews().
4393
 *
4394
 * This function will honour properly NODATA_VALUES tuples (special dataset
4395
 * metadata) so that only a given RGB triplet (in case of a RGB image) will be
4396
 * considered as the nodata value and not each value of the triplet
4397
 * independently per band.
4398
 *
4399
 * Starting with GDAL 3.2, the GDAL_NUM_THREADS configuration option can be set
4400
 * to "ALL_CPUS" or an integer value to specify the number of threads to use for
4401
 * overview computation.
4402
 *
4403
 * @param hSrcBand the source (base level) band.
4404
 * @param nOverviewCount the number of downsampled bands being generated.
4405
 * @param pahOvrBands the list of downsampled bands to be generated.
4406
 * @param pszResampling Resampling algorithm (e.g. "AVERAGE").
4407
 * @param pfnProgress progress report function.
4408
 * @param pProgressData progress function callback data.
4409
 * @param papszOptions NULL terminated list of options as key=value pairs, or
4410
 * NULL
4411
 * @return CE_None on success or CE_Failure on failure.
4412
 * @since GDAL 3.6
4413
 */
4414
CPLErr GDALRegenerateOverviewsEx(GDALRasterBandH hSrcBand, int nOverviewCount,
887✔
4415
                                 GDALRasterBandH *pahOvrBands,
4416
                                 const char *pszResampling,
4417
                                 GDALProgressFunc pfnProgress,
4418
                                 void *pProgressData, CSLConstList papszOptions)
4419

4420
{
4421
    GDALRasterBand *poSrcBand = GDALRasterBand::FromHandle(hSrcBand);
887✔
4422
    GDALRasterBand **papoOvrBands =
887✔
4423
        reinterpret_cast<GDALRasterBand **>(pahOvrBands);
4424

4425
    if (pfnProgress == nullptr)
887✔
4426
        pfnProgress = GDALDummyProgress;
252✔
4427

4428
    if (EQUAL(pszResampling, "NONE"))
887✔
4429
        return CE_None;
49✔
4430

4431
    int nKernelRadius = 0;
838✔
4432
    GDALResampleFunction pfnResampleFn =
4433
        GDALGetResampleFunction(pszResampling, &nKernelRadius);
838✔
4434

4435
    if (pfnResampleFn == nullptr)
838✔
4436
        return CE_Failure;
×
4437

4438
    /* -------------------------------------------------------------------- */
4439
    /*      Check color tables...                                           */
4440
    /* -------------------------------------------------------------------- */
4441
    GDALColorTable *poColorTable = nullptr;
838✔
4442

4443
    if ((STARTS_WITH_CI(pszResampling, "AVER") || EQUAL(pszResampling, "RMS") ||
471✔
4444
         EQUAL(pszResampling, "MODE") || EQUAL(pszResampling, "GAUSS")) &&
1,750✔
4445
        poSrcBand->GetColorInterpretation() == GCI_PaletteIndex)
452✔
4446
    {
4447
        poColorTable = poSrcBand->GetColorTable();
9✔
4448
        if (poColorTable != nullptr)
9✔
4449
        {
4450
            if (poColorTable->GetPaletteInterpretation() != GPI_RGB)
9✔
4451
            {
4452
                CPLError(CE_Warning, CPLE_AppDefined,
×
4453
                         "Computing overviews on palette index raster bands "
4454
                         "with a palette whose color interpretation is not RGB "
4455
                         "will probably lead to unexpected results.");
4456
                poColorTable = nullptr;
×
4457
            }
4458
            else if (poColorTable->IsIdentity())
9✔
4459
            {
4460
                poColorTable = nullptr;
×
4461
            }
4462
        }
4463
        else
4464
        {
4465
            CPLError(CE_Warning, CPLE_AppDefined,
×
4466
                     "Computing overviews on palette index raster bands "
4467
                     "without a palette will probably lead to unexpected "
4468
                     "results.");
4469
        }
4470
    }
4471
    // Not ready yet
4472
    else if ((EQUAL(pszResampling, "CUBIC") ||
2,433✔
4473
              EQUAL(pszResampling, "CUBICSPLINE") ||
775✔
4474
              EQUAL(pszResampling, "LANCZOS") ||
775✔
4475
              EQUAL(pszResampling, "BILINEAR")) &&
1,684✔
4476
             poSrcBand->GetColorInterpretation() == GCI_PaletteIndex)
80✔
4477
    {
4478
        CPLError(CE_Warning, CPLE_AppDefined,
×
4479
                 "Computing %s overviews on palette index raster bands "
4480
                 "will probably lead to unexpected results.",
4481
                 pszResampling);
4482
    }
4483

4484
    // If we have a nodata mask and we are doing something more complicated
4485
    // than nearest neighbouring, we have to fetch to nodata mask.
4486

4487
    GDALRasterBand *poMaskBand = nullptr;
838✔
4488
    bool bUseNoDataMask = false;
838✔
4489
    bool bCanUseCascaded = true;
838✔
4490

4491
    if (!STARTS_WITH_CI(pszResampling, "NEAR"))
838✔
4492
    {
4493
        // Special case if we are an alpha/mask band. We want it to be
4494
        // considered as the mask band to avoid alpha=0 to be taken into account
4495
        // in average computation.
4496
        if (poSrcBand->IsMaskBand())
532✔
4497
        {
4498
            poMaskBand = poSrcBand;
91✔
4499
            bUseNoDataMask = true;
91✔
4500
        }
4501
        else
4502
        {
4503
            poMaskBand = poSrcBand->GetMaskBand();
441✔
4504
            const int nMaskFlags = poSrcBand->GetMaskFlags();
441✔
4505
            bCanUseCascaded =
441✔
4506
                (nMaskFlags == GMF_NODATA || nMaskFlags == GMF_ALL_VALID);
441✔
4507
            bUseNoDataMask = (nMaskFlags & GMF_ALL_VALID) == 0;
441✔
4508
        }
4509
    }
4510

4511
    /* -------------------------------------------------------------------- */
4512
    /*      If we are operating on multiple overviews, and using            */
4513
    /*      averaging, lets do them in cascading order to reduce the        */
4514
    /*      amount of computation.                                          */
4515
    /* -------------------------------------------------------------------- */
4516

4517
    // In case the mask made be computed from another band of the dataset,
4518
    // we can't use cascaded generation, as the computation of the overviews
4519
    // of the band used for the mask band may not have yet occurred (#3033).
4520
    if ((STARTS_WITH_CI(pszResampling, "AVER") ||
838✔
4521
         EQUAL(pszResampling, "GAUSS") || EQUAL(pszResampling, "RMS") ||
471✔
4522
         EQUAL(pszResampling, "CUBIC") || EQUAL(pszResampling, "CUBICSPLINE") ||
440✔
4523
         EQUAL(pszResampling, "LANCZOS") || EQUAL(pszResampling, "BILINEAR") ||
386✔
4524
         EQUAL(pszResampling, "MODE")) &&
838✔
4525
        nOverviewCount > 1 && bCanUseCascaded)
44✔
4526
        return GDALRegenerateCascadingOverviews(
44✔
4527
            poSrcBand, nOverviewCount, papoOvrBands, pszResampling, pfnProgress,
4528
            pProgressData, papszOptions);
44✔
4529

4530
    /* -------------------------------------------------------------------- */
4531
    /*      Setup one horizontal swath to read from the raw buffer.         */
4532
    /* -------------------------------------------------------------------- */
4533
    int nFRXBlockSize = 0;
794✔
4534
    int nFRYBlockSize = 0;
794✔
4535
    poSrcBand->GetBlockSize(&nFRXBlockSize, &nFRYBlockSize);
794✔
4536

4537
    const GDALDataType eSrcDataType = poSrcBand->GetRasterDataType();
794✔
4538
    const bool bUseGenericResampleFn = STARTS_WITH_CI(pszResampling, "NEAR") ||
1,282✔
4539
                                       EQUAL(pszResampling, "MODE") ||
1,236✔
4540
                                       !GDALDataTypeIsComplex(eSrcDataType);
442✔
4541
    const GDALDataType eWrkDataType =
4542
        bUseGenericResampleFn
4543
            ? GDALGetOvrWorkDataType(pszResampling, eSrcDataType)
794✔
4544
            : GDT_CFloat32;
794✔
4545

4546
    const int nWidth = poSrcBand->GetXSize();
794✔
4547
    const int nHeight = poSrcBand->GetYSize();
794✔
4548

4549
    int nMaxOvrFactor = 1;
794✔
4550
    for (int iOverview = 0; iOverview < nOverviewCount; ++iOverview)
1,705✔
4551
    {
4552
        const int nDstWidth = papoOvrBands[iOverview]->GetXSize();
911✔
4553
        const int nDstHeight = papoOvrBands[iOverview]->GetYSize();
911✔
4554
        nMaxOvrFactor = std::max(
911✔
4555
            nMaxOvrFactor,
4556
            static_cast<int>(static_cast<double>(nWidth) / nDstWidth + 0.5));
911✔
4557
        nMaxOvrFactor = std::max(
911✔
4558
            nMaxOvrFactor,
4559
            static_cast<int>(static_cast<double>(nHeight) / nDstHeight + 0.5));
911✔
4560
    }
4561

4562
    int nFullResYChunk = nFRYBlockSize;
794✔
4563
    int nMaxChunkYSizeQueried = 0;
794✔
4564

4565
    const auto UpdateChunkHeightAndGetChunkSize =
4566
        [&nFullResYChunk, &nMaxChunkYSizeQueried, nKernelRadius, nMaxOvrFactor,
10,354✔
4567
         eWrkDataType, nWidth]()
83,793✔
4568
    {
4569
        // Make sure that round(nChunkYOff / nMaxOvrFactor) < round((nChunkYOff
4570
        // + nFullResYChunk) / nMaxOvrFactor)
4571
        if (nMaxOvrFactor > INT_MAX / RADIUS_TO_DIAMETER)
10,354✔
4572
        {
4573
            return GINTBIG_MAX;
1✔
4574
        }
4575
        nFullResYChunk =
10,353✔
4576
            std::max(nFullResYChunk, RADIUS_TO_DIAMETER * nMaxOvrFactor);
10,353✔
4577
        if ((nKernelRadius > 0 &&
10,353✔
4578
             nMaxOvrFactor > INT_MAX / (RADIUS_TO_DIAMETER * nKernelRadius)) ||
970✔
4579
            nFullResYChunk >
10,353✔
4580
                INT_MAX - RADIUS_TO_DIAMETER * nKernelRadius * nMaxOvrFactor)
10,353✔
4581
        {
4582
            return GINTBIG_MAX;
×
4583
        }
4584
        nMaxChunkYSizeQueried =
10,353✔
4585
            nFullResYChunk + RADIUS_TO_DIAMETER * nKernelRadius * nMaxOvrFactor;
10,353✔
4586
        if (GDALGetDataTypeSizeBytes(eWrkDataType) >
10,353✔
4587
            std::numeric_limits<int64_t>::max() /
10,353✔
4588
                (static_cast<int64_t>(nMaxChunkYSizeQueried) * nWidth))
10,353✔
4589
        {
4590
            return GINTBIG_MAX;
1✔
4591
        }
4592
        return static_cast<GIntBig>(GDALGetDataTypeSizeBytes(eWrkDataType)) *
10,352✔
4593
               nMaxChunkYSizeQueried * nWidth;
10,352✔
4594
    };
794✔
4595

4596
    const char *pszChunkYSize =
4597
        CPLGetConfigOption("GDAL_OVR_CHUNKYSIZE", nullptr);
794✔
4598
#ifndef __COVERITY__
4599
    // Only configurable for debug / testing
4600
    if (pszChunkYSize)
794✔
4601
    {
4602
        nFullResYChunk = atoi(pszChunkYSize);
×
4603
    }
4604
#endif
4605

4606
    // Only configurable for debug / testing
4607
    const int nChunkMaxSize =
4608
        atoi(CPLGetConfigOption("GDAL_OVR_CHUNK_MAX_SIZE", "10485760"));
794✔
4609

4610
    auto nChunkSize = UpdateChunkHeightAndGetChunkSize();
794✔
4611
    if (nChunkSize > nChunkMaxSize)
794✔
4612
    {
4613
        if (poColorTable == nullptr && nFRXBlockSize < nWidth &&
15✔
4614
            !GDALDataTypeIsComplex(eSrcDataType) &&
44✔
4615
            (!STARTS_WITH_CI(pszResampling, "AVER") ||
14✔
4616
             EQUAL(pszResampling, "AVERAGE")))
2✔
4617
        {
4618
            // If this is tiled, then use GDALRegenerateOverviewsMultiBand()
4619
            // which uses a block-based strategy, which is much less memory
4620
            // hungry.
4621
            return GDALRegenerateOverviewsMultiBand(
14✔
4622
                1, &poSrcBand, nOverviewCount, &papoOvrBands, pszResampling,
4623
                pfnProgress, pProgressData, papszOptions);
14✔
4624
        }
4625
        else if (nOverviewCount > 1 && STARTS_WITH_CI(pszResampling, "NEAR"))
1✔
4626
        {
4627
            return GDALRegenerateCascadingOverviews(
×
4628
                poSrcBand, nOverviewCount, papoOvrBands, pszResampling,
4629
                pfnProgress, pProgressData, papszOptions);
×
4630
        }
4631
    }
4632
    else if (pszChunkYSize == nullptr)
779✔
4633
    {
4634
        // Try to get as close as possible to nChunkMaxSize
4635
        while (nChunkSize < nChunkMaxSize / 2)
10,339✔
4636
        {
4637
            nFullResYChunk *= 2;
9,560✔
4638
            nChunkSize = UpdateChunkHeightAndGetChunkSize();
9,560✔
4639
        }
4640
    }
4641

4642
    int nHasNoData = 0;
780✔
4643
    const double dfNoDataValue = poSrcBand->GetNoDataValue(&nHasNoData);
780✔
4644
    const bool bHasNoData = CPL_TO_BOOL(nHasNoData);
780✔
4645
    const bool bPropagateNoData =
4646
        CPLTestBool(CPLGetConfigOption("GDAL_OVR_PROPAGATE_NODATA", "NO"));
780✔
4647

4648
    // Structure describing a resampling job
4649
    struct OvrJob
4650
    {
4651
        // Buffers to free when job is finished
4652
        std::shared_ptr<PointerHolder> oSrcMaskBufferHolder{};
4653
        std::shared_ptr<PointerHolder> oSrcBufferHolder{};
4654
        std::unique_ptr<PointerHolder> oDstBufferHolder{};
4655

4656
        GDALRasterBand *poDstBand = nullptr;
4657

4658
        // Input parameters of pfnResampleFn
4659
        GDALResampleFunction pfnResampleFn = nullptr;
4660
        int nSrcWidth = 0;
4661
        int nSrcHeight = 0;
4662
        int nDstWidth = 0;
4663
        GDALOverviewResampleArgs args{};
4664
        const void *pChunk = nullptr;
4665
        bool bUseGenericResampleFn = false;
4666

4667
        // Output values of resampling function
4668
        CPLErr eErr = CE_Failure;
4669
        void *pDstBuffer = nullptr;
4670
        GDALDataType eDstBufferDataType = GDT_Unknown;
4671

4672
        void SetSrcMaskBufferHolder(
×
4673
            const std::shared_ptr<PointerHolder> &oSrcMaskBufferHolderIn)
4674
        {
4675
            oSrcMaskBufferHolder = oSrcMaskBufferHolderIn;
×
4676
        }
×
4677

4678
        void SetSrcBufferHolder(
×
4679
            const std::shared_ptr<PointerHolder> &oSrcBufferHolderIn)
4680
        {
4681
            oSrcBufferHolder = oSrcBufferHolderIn;
×
4682
        }
×
4683

4684
        void NotifyFinished()
880✔
4685
        {
4686
            std::lock_guard guard(mutex);
1,760✔
4687
            bFinished = true;
880✔
4688
            cv.notify_one();
880✔
4689
        }
880✔
4690

4691
        bool IsFinished()
×
4692
        {
4693
            std::lock_guard guard(mutex);
×
4694
            return bFinished;
×
4695
        }
4696

4697
        void WaitFinished()
×
4698
        {
4699
            std::unique_lock oGuard(mutex);
×
4700
            while (!bFinished)
×
4701
            {
4702
                cv.wait(oGuard);
×
4703
            }
4704
        }
×
4705

4706
      private:
4707
        // Synchronization
4708
        bool bFinished = false;
4709
        std::mutex mutex{};
4710
        std::condition_variable cv{};
4711
    };
4712

4713
    // Thread function to resample
4714
    const auto JobResampleFunc = [](void *pData)
880✔
4715
    {
4716
        OvrJob *poJob = static_cast<OvrJob *>(pData);
880✔
4717

4718
        if (poJob->bUseGenericResampleFn)
880✔
4719
        {
4720
            poJob->eErr = poJob->pfnResampleFn(poJob->args, poJob->pChunk,
878✔
4721
                                               &(poJob->pDstBuffer),
4722
                                               &(poJob->eDstBufferDataType));
4723
        }
4724
        else
4725
        {
4726
            poJob->eErr = GDALResampleChunkC32R(
2✔
4727
                poJob->nSrcWidth, poJob->nSrcHeight,
4728
                static_cast<const float *>(poJob->pChunk),
2✔
4729
                poJob->args.nChunkYOff, poJob->args.nChunkYSize,
4730
                poJob->args.nDstYOff, poJob->args.nDstYOff2,
4731
                poJob->args.nOvrXSize, poJob->args.nOvrYSize,
4732
                &(poJob->pDstBuffer), &(poJob->eDstBufferDataType),
4733
                poJob->args.pszResampling);
4734
        }
4735

4736
        poJob->oDstBufferHolder =
4737
            std::make_unique<PointerHolder>(poJob->pDstBuffer);
880✔
4738

4739
        poJob->NotifyFinished();
880✔
4740
    };
880✔
4741

4742
    // Function to write resample data to target band
4743
    const auto WriteJobData = [](const OvrJob *poJob)
880✔
4744
    {
4745
        return poJob->poDstBand->RasterIO(
1,760✔
4746
            GF_Write, 0, poJob->args.nDstYOff, poJob->nDstWidth,
880✔
4747
            poJob->args.nDstYOff2 - poJob->args.nDstYOff, poJob->pDstBuffer,
880✔
4748
            poJob->nDstWidth, poJob->args.nDstYOff2 - poJob->args.nDstYOff,
880✔
4749
            poJob->eDstBufferDataType, 0, 0, nullptr);
880✔
4750
    };
4751

4752
    // Wait for completion of oldest job and serialize it
4753
    const auto WaitAndFinalizeOldestJob =
4754
        [WriteJobData](std::list<std::unique_ptr<OvrJob>> &jobList)
×
4755
    {
4756
        auto poOldestJob = jobList.front().get();
×
4757
        poOldestJob->WaitFinished();
×
4758
        CPLErr l_eErr = poOldestJob->eErr;
×
4759
        if (l_eErr == CE_None)
×
4760
        {
4761
            l_eErr = WriteJobData(poOldestJob);
×
4762
        }
4763

4764
        jobList.pop_front();
×
4765
        return l_eErr;
×
4766
    };
4767

4768
    // Queue of jobs
4769
    std::list<std::unique_ptr<OvrJob>> jobList;
1,560✔
4770

4771
    GByte *pabyChunkNodataMask = nullptr;
780✔
4772
    void *pChunk = nullptr;
780✔
4773

4774
    const char *pszThreads = CPLGetConfigOption("GDAL_NUM_THREADS", "1");
780✔
4775
    const int nThreads = std::max(1, std::min(128, EQUAL(pszThreads, "ALL_CPUS")
3,120✔
4776
                                                       ? CPLGetNumCPUs()
780✔
4777
                                                       : atoi(pszThreads)));
780✔
4778
    auto poThreadPool =
4779
        nThreads > 1 ? GDALGetGlobalThreadPool(nThreads) : nullptr;
780✔
4780
    auto poJobQueue = poThreadPool ? poThreadPool->CreateJobQueue()
4781
                                   : std::unique_ptr<CPLJobQueue>(nullptr);
1,560✔
4782

4783
    /* -------------------------------------------------------------------- */
4784
    /*      Loop over image operating on chunks.                            */
4785
    /* -------------------------------------------------------------------- */
4786
    int nChunkYOff = 0;
780✔
4787
    CPLErr eErr = CE_None;
780✔
4788

4789
    for (nChunkYOff = 0; nChunkYOff < nHeight && eErr == CE_None;
1,565✔
4790
         nChunkYOff += nFullResYChunk)
785✔
4791
    {
4792
        if (!pfnProgress(nChunkYOff / static_cast<double>(nHeight), nullptr,
785✔
4793
                         pProgressData))
4794
        {
4795
            CPLError(CE_Failure, CPLE_UserInterrupt, "User terminated");
×
4796
            eErr = CE_Failure;
×
4797
        }
4798

4799
        if (nFullResYChunk + nChunkYOff > nHeight)
785✔
4800
            nFullResYChunk = nHeight - nChunkYOff;
778✔
4801

4802
        int nChunkYOffQueried = nChunkYOff - nKernelRadius * nMaxOvrFactor;
785✔
4803
        int nChunkYSizeQueried =
785✔
4804
            nFullResYChunk + 2 * nKernelRadius * nMaxOvrFactor;
785✔
4805
        if (nChunkYOffQueried < 0)
785✔
4806
        {
4807
            nChunkYSizeQueried += nChunkYOffQueried;
83✔
4808
            nChunkYOffQueried = 0;
83✔
4809
        }
4810
        if (nChunkYOffQueried + nChunkYSizeQueried > nHeight)
785✔
4811
            nChunkYSizeQueried = nHeight - nChunkYOffQueried;
83✔
4812

4813
        // Avoid accumulating too many tasks and exhaust RAM
4814
        // Try to complete already finished jobs
4815
        while (eErr == CE_None && !jobList.empty())
785✔
4816
        {
4817
            auto poOldestJob = jobList.front().get();
×
4818
            if (!poOldestJob->IsFinished())
×
4819
                break;
×
4820
            eErr = poOldestJob->eErr;
×
4821
            if (eErr == CE_None)
×
4822
            {
4823
                eErr = WriteJobData(poOldestJob);
×
4824
            }
4825

4826
            jobList.pop_front();
×
4827
        }
4828

4829
        // And in case we have saturated the number of threads,
4830
        // wait for completion of tasks to go below the threshold.
4831
        while (eErr == CE_None &&
1,570✔
4832
               jobList.size() >= static_cast<size_t>(nThreads))
785✔
4833
        {
4834
            eErr = WaitAndFinalizeOldestJob(jobList);
×
4835
        }
4836

4837
        // (Re)allocate buffers if needed
4838
        if (pChunk == nullptr)
785✔
4839
        {
4840
            pChunk = VSI_MALLOC3_VERBOSE(GDALGetDataTypeSizeBytes(eWrkDataType),
780✔
4841
                                         nMaxChunkYSizeQueried, nWidth);
4842
        }
4843
        if (bUseNoDataMask && pabyChunkNodataMask == nullptr)
785✔
4844
        {
4845
            pabyChunkNodataMask = static_cast<GByte *>(
4846
                VSI_MALLOC2_VERBOSE(nMaxChunkYSizeQueried, nWidth));
283✔
4847
        }
4848

4849
        if (pChunk == nullptr ||
785✔
4850
            (bUseNoDataMask && pabyChunkNodataMask == nullptr))
283✔
4851
        {
4852
            CPLFree(pChunk);
×
4853
            CPLFree(pabyChunkNodataMask);
×
4854
            return CE_Failure;
×
4855
        }
4856

4857
        // Read chunk.
4858
        if (eErr == CE_None)
785✔
4859
            eErr = poSrcBand->RasterIO(GF_Read, 0, nChunkYOffQueried, nWidth,
785✔
4860
                                       nChunkYSizeQueried, pChunk, nWidth,
4861
                                       nChunkYSizeQueried, eWrkDataType, 0, 0,
4862
                                       nullptr);
4863
        if (eErr == CE_None && bUseNoDataMask)
785✔
4864
            eErr = poMaskBand->RasterIO(GF_Read, 0, nChunkYOffQueried, nWidth,
283✔
4865
                                        nChunkYSizeQueried, pabyChunkNodataMask,
4866
                                        nWidth, nChunkYSizeQueried, GDT_Byte, 0,
4867
                                        0, nullptr);
4868

4869
        // Special case to promote 1bit data to 8bit 0/255 values.
4870
        if (EQUAL(pszResampling, "AVERAGE_BIT2GRAYSCALE"))
785✔
4871
        {
4872
            if (eWrkDataType == GDT_Float32)
9✔
4873
            {
4874
                float *pafChunk = static_cast<float *>(pChunk);
×
4875
                for (size_t i = 0;
×
4876
                     i < static_cast<size_t>(nChunkYSizeQueried) * nWidth; i++)
×
4877
                {
4878
                    if (pafChunk[i] == 1.0)
×
4879
                        pafChunk[i] = 255.0;
×
4880
                }
4881
            }
4882
            else if (eWrkDataType == GDT_Byte)
9✔
4883
            {
4884
                GByte *pabyChunk = static_cast<GByte *>(pChunk);
9✔
4885
                for (size_t i = 0;
168,417✔
4886
                     i < static_cast<size_t>(nChunkYSizeQueried) * nWidth; i++)
168,417✔
4887
                {
4888
                    if (pabyChunk[i] == 1)
168,408✔
4889
                        pabyChunk[i] = 255;
127,437✔
4890
                }
4891
            }
4892
            else if (eWrkDataType == GDT_UInt16)
×
4893
            {
4894
                GUInt16 *pasChunk = static_cast<GUInt16 *>(pChunk);
×
4895
                for (size_t i = 0;
×
4896
                     i < static_cast<size_t>(nChunkYSizeQueried) * nWidth; i++)
×
4897
                {
4898
                    if (pasChunk[i] == 1)
×
4899
                        pasChunk[i] = 255;
×
4900
                }
4901
            }
4902
            else if (eWrkDataType == GDT_Float64)
×
4903
            {
4904
                double *padfChunk = static_cast<double *>(pChunk);
×
4905
                for (size_t i = 0;
×
4906
                     i < static_cast<size_t>(nChunkYSizeQueried) * nWidth; i++)
×
4907
                {
4908
                    if (padfChunk[i] == 1.0)
×
4909
                        padfChunk[i] = 255.0;
×
4910
                }
4911
            }
4912
            else
4913
            {
4914
                CPLAssert(false);
×
4915
            }
4916
        }
4917
        else if (EQUAL(pszResampling, "AVERAGE_BIT2GRAYSCALE_MINISWHITE"))
776✔
4918
        {
4919
            if (eWrkDataType == GDT_Float32)
×
4920
            {
4921
                float *pafChunk = static_cast<float *>(pChunk);
×
4922
                for (size_t i = 0;
×
4923
                     i < static_cast<size_t>(nChunkYSizeQueried) * nWidth; i++)
×
4924
                {
4925
                    if (pafChunk[i] == 1.0)
×
4926
                        pafChunk[i] = 0.0;
×
4927
                    else if (pafChunk[i] == 0.0)
×
4928
                        pafChunk[i] = 255.0;
×
4929
                }
4930
            }
4931
            else if (eWrkDataType == GDT_Byte)
×
4932
            {
4933
                GByte *pabyChunk = static_cast<GByte *>(pChunk);
×
4934
                for (size_t i = 0;
×
4935
                     i < static_cast<size_t>(nChunkYSizeQueried) * nWidth; i++)
×
4936
                {
4937
                    if (pabyChunk[i] == 1)
×
4938
                        pabyChunk[i] = 0;
×
4939
                    else if (pabyChunk[i] == 0)
×
4940
                        pabyChunk[i] = 255;
×
4941
                }
4942
            }
4943
            else if (eWrkDataType == GDT_UInt16)
×
4944
            {
4945
                GUInt16 *pasChunk = static_cast<GUInt16 *>(pChunk);
×
4946
                for (size_t i = 0;
×
4947
                     i < static_cast<size_t>(nChunkYSizeQueried) * nWidth; i++)
×
4948
                {
4949
                    if (pasChunk[i] == 1)
×
4950
                        pasChunk[i] = 0;
×
4951
                    else if (pasChunk[i] == 0)
×
4952
                        pasChunk[i] = 255;
×
4953
                }
4954
            }
4955
            else if (eWrkDataType == GDT_Float64)
×
4956
            {
4957
                double *padfChunk = static_cast<double *>(pChunk);
×
4958
                for (size_t i = 0;
×
4959
                     i < static_cast<size_t>(nChunkYSizeQueried) * nWidth; i++)
×
4960
                {
4961
                    if (padfChunk[i] == 1.0)
×
4962
                        padfChunk[i] = 0.0;
×
4963
                    else if (padfChunk[i] == 0.0)
×
4964
                        padfChunk[i] = 255.0;
×
4965
                }
4966
            }
4967
            else
4968
            {
4969
                CPLAssert(false);
×
4970
            }
4971
        }
4972

4973
        auto oSrcBufferHolder =
4974
            std::make_shared<PointerHolder>(poJobQueue ? pChunk : nullptr);
1,570✔
4975
        auto oSrcMaskBufferHolder = std::make_shared<PointerHolder>(
4976
            poJobQueue ? pabyChunkNodataMask : nullptr);
1,570✔
4977

4978
        for (int iOverview = 0; iOverview < nOverviewCount && eErr == CE_None;
1,665✔
4979
             ++iOverview)
4980
        {
4981
            GDALRasterBand *poDstBand = papoOvrBands[iOverview];
880✔
4982
            const int nDstWidth = poDstBand->GetXSize();
880✔
4983
            const int nDstHeight = poDstBand->GetYSize();
880✔
4984

4985
            const double dfXRatioDstToSrc =
880✔
4986
                static_cast<double>(nWidth) / nDstWidth;
880✔
4987
            const double dfYRatioDstToSrc =
880✔
4988
                static_cast<double>(nHeight) / nDstHeight;
880✔
4989

4990
            /* --------------------------------------------------------------------
4991
             */
4992
            /*      Figure out the line to start writing to, and the first line
4993
             */
4994
            /*      to not write to.  In theory this approach should ensure that
4995
             */
4996
            /*      every output line will be written if all input chunks are */
4997
            /*      processed. */
4998
            /* --------------------------------------------------------------------
4999
             */
5000
            int nDstYOff =
880✔
5001
                static_cast<int>(0.5 + nChunkYOff / dfYRatioDstToSrc);
880✔
5002
            if (nDstYOff == nDstHeight)
880✔
5003
                continue;
×
5004
            int nDstYOff2 = static_cast<int>(
880✔
5005
                0.5 + (nChunkYOff + nFullResYChunk) / dfYRatioDstToSrc);
880✔
5006

5007
            if (nChunkYOff + nFullResYChunk == nHeight)
880✔
5008
                nDstYOff2 = nDstHeight;
873✔
5009
#if DEBUG_VERBOSE
5010
            CPLDebug("GDAL",
5011
                     "Reading (%dx%d -> %dx%d) for output (%dx%d -> %dx%d)", 0,
5012
                     nChunkYOffQueried, nWidth, nChunkYSizeQueried, 0, nDstYOff,
5013
                     nDstWidth, nDstYOff2 - nDstYOff);
5014
#endif
5015

5016
            auto poJob = std::make_unique<OvrJob>();
1,760✔
5017
            poJob->pfnResampleFn = pfnResampleFn;
880✔
5018
            poJob->bUseGenericResampleFn = bUseGenericResampleFn;
880✔
5019
            poJob->args.eOvrDataType = poDstBand->GetRasterDataType();
880✔
5020
            poJob->args.nOvrXSize = poDstBand->GetXSize();
880✔
5021
            poJob->args.nOvrYSize = poDstBand->GetYSize();
880✔
5022
            const char *pszNBITS =
5023
                poDstBand->GetMetadataItem("NBITS", "IMAGE_STRUCTURE");
880✔
5024
            poJob->args.nOvrNBITS = pszNBITS ? atoi(pszNBITS) : 0;
880✔
5025
            poJob->args.dfXRatioDstToSrc = dfXRatioDstToSrc;
880✔
5026
            poJob->args.dfYRatioDstToSrc = dfYRatioDstToSrc;
880✔
5027
            poJob->args.eWrkDataType = eWrkDataType;
880✔
5028
            poJob->pChunk = pChunk;
880✔
5029
            poJob->args.pabyChunkNodataMask = pabyChunkNodataMask;
880✔
5030
            poJob->nSrcWidth = nWidth;
880✔
5031
            poJob->nSrcHeight = nHeight;
880✔
5032
            poJob->args.nChunkXOff = 0;
880✔
5033
            poJob->args.nChunkXSize = nWidth;
880✔
5034
            poJob->args.nChunkYOff = nChunkYOffQueried;
880✔
5035
            poJob->args.nChunkYSize = nChunkYSizeQueried;
880✔
5036
            poJob->nDstWidth = nDstWidth;
880✔
5037
            poJob->args.nDstXOff = 0;
880✔
5038
            poJob->args.nDstXOff2 = nDstWidth;
880✔
5039
            poJob->args.nDstYOff = nDstYOff;
880✔
5040
            poJob->args.nDstYOff2 = nDstYOff2;
880✔
5041
            poJob->poDstBand = poDstBand;
880✔
5042
            poJob->args.pszResampling = pszResampling;
880✔
5043
            poJob->args.bHasNoData = bHasNoData;
880✔
5044
            poJob->args.dfNoDataValue = dfNoDataValue;
880✔
5045
            poJob->args.poColorTable = poColorTable;
880✔
5046
            poJob->args.eSrcDataType = eSrcDataType;
880✔
5047
            poJob->args.bPropagateNoData = bPropagateNoData;
880✔
5048

5049
            if (poJobQueue)
880✔
5050
            {
5051
                poJob->SetSrcMaskBufferHolder(oSrcMaskBufferHolder);
×
5052
                poJob->SetSrcBufferHolder(oSrcBufferHolder);
×
5053
                poJobQueue->SubmitJob(JobResampleFunc, poJob.get());
×
5054
                jobList.emplace_back(std::move(poJob));
×
5055
            }
5056
            else
5057
            {
5058
                JobResampleFunc(poJob.get());
880✔
5059
                eErr = poJob->eErr;
880✔
5060
                if (eErr == CE_None)
880✔
5061
                {
5062
                    eErr = WriteJobData(poJob.get());
880✔
5063
                }
5064
            }
5065
        }
5066

5067
        if (poJobQueue)
785✔
5068
        {
5069
            pChunk = nullptr;
×
5070
            pabyChunkNodataMask = nullptr;
×
5071
        }
5072
    }
5073

5074
    VSIFree(pChunk);
780✔
5075
    VSIFree(pabyChunkNodataMask);
780✔
5076

5077
    // Wait for all pending jobs to complete
5078
    while (!jobList.empty())
780✔
5079
    {
5080
        const auto l_eErr = WaitAndFinalizeOldestJob(jobList);
×
5081
        if (l_eErr != CE_None && eErr == CE_None)
×
5082
            eErr = l_eErr;
×
5083
    }
5084

5085
    /* -------------------------------------------------------------------- */
5086
    /*      Renormalized overview mean / stddev if needed.                  */
5087
    /* -------------------------------------------------------------------- */
5088
    if (eErr == CE_None && EQUAL(pszResampling, "AVERAGE_MP"))
780✔
5089
    {
5090
        GDALOverviewMagnitudeCorrection(
×
5091
            poSrcBand, nOverviewCount,
5092
            reinterpret_cast<GDALRasterBandH *>(papoOvrBands),
5093
            GDALDummyProgress, nullptr);
5094
    }
5095

5096
    /* -------------------------------------------------------------------- */
5097
    /*      It can be important to flush out data to overviews.             */
5098
    /* -------------------------------------------------------------------- */
5099
    for (int iOverview = 0; eErr == CE_None && iOverview < nOverviewCount;
1,653✔
5100
         ++iOverview)
5101
    {
5102
        eErr = papoOvrBands[iOverview]->FlushCache(false);
873✔
5103
    }
5104

5105
    if (eErr == CE_None)
780✔
5106
        pfnProgress(1.0, nullptr, pProgressData);
780✔
5107

5108
    return eErr;
780✔
5109
}
5110

5111
/************************************************************************/
5112
/*            GDALRegenerateOverviewsMultiBand()                        */
5113
/************************************************************************/
5114

5115
/**
5116
 * \brief Variant of GDALRegenerateOverviews, specially dedicated for generating
5117
 * compressed pixel-interleaved overviews (JPEG-IN-TIFF for example)
5118
 *
5119
 * This function will generate one or more overview images from a base
5120
 * image using the requested downsampling algorithm.  Its primary use
5121
 * is for generating overviews via GDALDataset::BuildOverviews(), but it
5122
 * can also be used to generate downsampled images in one file from another
5123
 * outside the overview architecture.
5124
 *
5125
 * The output bands need to exist in advance and share the same characteristics
5126
 * (type, dimensions)
5127
 *
5128
 * The resampling algorithms supported for the moment are "NEAREST", "AVERAGE",
5129
 * "RMS", "GAUSS", "CUBIC", "CUBICSPLINE", "LANCZOS", "BILINEAR" and "MODE"
5130
 *
5131
 * It does not support color tables or complex data types.
5132
 *
5133
 * The pseudo-algorithm used by the function is :
5134
 *    for each overview
5135
 *       iterate on lines of the source by a step of deltay
5136
 *           iterate on columns of the source by a step of deltax
5137
 *               read the source data of size deltax * deltay for all the bands
5138
 *               generate the corresponding overview block for all the bands
5139
 *
5140
 * This function will properly honour NODATA_VALUES tuples (special dataset
5141
 * metadata) so that only a given RGB triplet (in case of a RGB image) will be
5142
 * considered as the nodata value and not each value of the triplet
5143
 * independently per band.
5144
 *
5145
 * Starting with GDAL 3.2, the GDAL_NUM_THREADS configuration option can be set
5146
 * to "ALL_CPUS" or an integer value to specify the number of threads to use for
5147
 * overview computation.
5148
 *
5149
 * @param nBands the number of bands, size of papoSrcBands and size of
5150
 *               first dimension of papapoOverviewBands
5151
 * @param papoSrcBands the list of source bands to downsample
5152
 * @param nOverviews the number of downsampled overview levels being generated.
5153
 * @param papapoOverviewBands bidimensional array of bands. First dimension is
5154
 *                            indexed by nBands. Second dimension is indexed by
5155
 *                            nOverviews.
5156
 * @param pszResampling Resampling algorithm ("NEAREST", "AVERAGE", "RMS",
5157
 * "GAUSS", "CUBIC", "CUBICSPLINE", "LANCZOS", "BILINEAR" or "MODE").
5158
 * @param pfnProgress progress report function.
5159
 * @param pProgressData progress function callback data.
5160
 * @param papszOptions (GDAL >= 3.6) NULL terminated list of options as
5161
 *                     key=value pairs, or NULL
5162
 *                     Starting with GDAL 3.8, the XOFF, YOFF, XSIZE and YSIZE
5163
 *                     options can be specified to express that overviews should
5164
 *                     be regenerated only in the specified subset of the source
5165
 *                     dataset.
5166
 * @return CE_None on success or CE_Failure on failure.
5167
 */
5168

5169
CPLErr GDALRegenerateOverviewsMultiBand(
388✔
5170
    int nBands, GDALRasterBand *const *papoSrcBands, int nOverviews,
5171
    GDALRasterBand *const *const *papapoOverviewBands,
5172
    const char *pszResampling, GDALProgressFunc pfnProgress,
5173
    void *pProgressData, CSLConstList papszOptions)
5174
{
5175
    CPL_IGNORE_RET_VAL(papszOptions);
388✔
5176

5177
    if (pfnProgress == nullptr)
388✔
5178
        pfnProgress = GDALDummyProgress;
11✔
5179

5180
    if (EQUAL(pszResampling, "NONE") || nBands == 0 || nOverviews == 0)
388✔
5181
        return CE_None;
3✔
5182

5183
    // Sanity checks.
5184
    if (!STARTS_WITH_CI(pszResampling, "NEAR") &&
385✔
5185
        !EQUAL(pszResampling, "RMS") && !EQUAL(pszResampling, "AVERAGE") &&
191✔
5186
        !EQUAL(pszResampling, "GAUSS") && !EQUAL(pszResampling, "CUBIC") &&
84✔
5187
        !EQUAL(pszResampling, "CUBICSPLINE") &&
22✔
5188
        !EQUAL(pszResampling, "LANCZOS") && !EQUAL(pszResampling, "BILINEAR") &&
21✔
5189
        !EQUAL(pszResampling, "MODE"))
5✔
5190
    {
5191
        CPLError(CE_Failure, CPLE_NotSupported,
×
5192
                 "GDALRegenerateOverviewsMultiBand: pszResampling='%s' "
5193
                 "not supported",
5194
                 pszResampling);
5195
        return CE_Failure;
×
5196
    }
5197

5198
    int nKernelRadius = 0;
385✔
5199
    GDALResampleFunction pfnResampleFn =
5200
        GDALGetResampleFunction(pszResampling, &nKernelRadius);
385✔
5201
    if (pfnResampleFn == nullptr)
385✔
5202
        return CE_Failure;
×
5203

5204
    const int nToplevelSrcWidth = papoSrcBands[0]->GetXSize();
385✔
5205
    const int nToplevelSrcHeight = papoSrcBands[0]->GetYSize();
385✔
5206
    if (nToplevelSrcWidth <= 0 || nToplevelSrcHeight <= 0)
385✔
5207
        return CE_None;
×
5208
    GDALDataType eDataType = papoSrcBands[0]->GetRasterDataType();
385✔
5209
    for (int iBand = 1; iBand < nBands; ++iBand)
66,232✔
5210
    {
5211
        if (papoSrcBands[iBand]->GetXSize() != nToplevelSrcWidth ||
131,694✔
5212
            papoSrcBands[iBand]->GetYSize() != nToplevelSrcHeight)
65,847✔
5213
        {
5214
            CPLError(
×
5215
                CE_Failure, CPLE_NotSupported,
5216
                "GDALRegenerateOverviewsMultiBand: all the source bands must "
5217
                "have the same dimensions");
5218
            return CE_Failure;
×
5219
        }
5220
        if (papoSrcBands[iBand]->GetRasterDataType() != eDataType)
65,847✔
5221
        {
5222
            CPLError(
×
5223
                CE_Failure, CPLE_NotSupported,
5224
                "GDALRegenerateOverviewsMultiBand: all the source bands must "
5225
                "have the same data type");
5226
            return CE_Failure;
×
5227
        }
5228
    }
5229

5230
    for (int iOverview = 0; iOverview < nOverviews; ++iOverview)
1,031✔
5231
    {
5232
        const auto poOvrFirstBand = papapoOverviewBands[0][iOverview];
646✔
5233
        const int nDstWidth = poOvrFirstBand->GetXSize();
646✔
5234
        const int nDstHeight = poOvrFirstBand->GetYSize();
646✔
5235
        for (int iBand = 1; iBand < nBands; ++iBand)
66,759✔
5236
        {
5237
            const auto poOvrBand = papapoOverviewBands[iBand][iOverview];
66,113✔
5238
            if (poOvrBand->GetXSize() != nDstWidth ||
132,226✔
5239
                poOvrBand->GetYSize() != nDstHeight)
66,113✔
5240
            {
5241
                CPLError(
×
5242
                    CE_Failure, CPLE_NotSupported,
5243
                    "GDALRegenerateOverviewsMultiBand: all the overviews bands "
5244
                    "of the same level must have the same dimensions");
5245
                return CE_Failure;
×
5246
            }
5247
            if (poOvrBand->GetRasterDataType() != eDataType)
66,113✔
5248
            {
5249
                CPLError(
×
5250
                    CE_Failure, CPLE_NotSupported,
5251
                    "GDALRegenerateOverviewsMultiBand: all the overviews bands "
5252
                    "must have the same data type as the source bands");
5253
                return CE_Failure;
×
5254
            }
5255
        }
5256
    }
5257

5258
    // First pass to compute the total number of pixels to write.
5259
    double dfTotalPixelCount = 0;
385✔
5260
    const int nSrcXOff = atoi(CSLFetchNameValueDef(papszOptions, "XOFF", "0"));
385✔
5261
    const int nSrcYOff = atoi(CSLFetchNameValueDef(papszOptions, "YOFF", "0"));
385✔
5262
    const int nSrcXSize = atoi(CSLFetchNameValueDef(
385✔
5263
        papszOptions, "XSIZE", CPLSPrintf("%d", nToplevelSrcWidth)));
5264
    const int nSrcYSize = atoi(CSLFetchNameValueDef(
385✔
5265
        papszOptions, "YSIZE", CPLSPrintf("%d", nToplevelSrcHeight)));
5266
    for (int iOverview = 0; iOverview < nOverviews; ++iOverview)
1,031✔
5267
    {
5268
        dfTotalPixelCount +=
646✔
5269
            static_cast<double>(nSrcXSize) / nToplevelSrcWidth *
1,292✔
5270
            papapoOverviewBands[0][iOverview]->GetXSize() *
646✔
5271
            static_cast<double>(nSrcYSize) / nToplevelSrcHeight *
1,292✔
5272
            papapoOverviewBands[0][iOverview]->GetYSize();
646✔
5273
    }
5274

5275
    const GDALDataType eWrkDataType =
5276
        GDALGetOvrWorkDataType(pszResampling, eDataType);
385✔
5277
    const int nWrkDataTypeSize =
5278
        std::max(1, GDALGetDataTypeSizeBytes(eWrkDataType));
385✔
5279

5280
    const bool bIsMask = papoSrcBands[0]->IsMaskBand();
385✔
5281

5282
    // If we have a nodata mask and we are doing something more complicated
5283
    // than nearest neighbour resampling, we have to fetch the nodata mask.
5284
    const bool bUseNoDataMask =
5285
        !STARTS_WITH_CI(pszResampling, "NEAR") &&
568✔
5286
        (bIsMask || (papoSrcBands[0]->GetMaskFlags() & GMF_ALL_VALID) == 0);
183✔
5287

5288
    std::vector<bool> abHasNoData(nBands);
770✔
5289
    std::vector<double> adfNoDataValue(nBands);
770✔
5290

5291
    for (int iBand = 0; iBand < nBands; ++iBand)
66,617✔
5292
    {
5293
        int nHasNoData = 0;
66,232✔
5294
        adfNoDataValue[iBand] =
132,464✔
5295
            papoSrcBands[iBand]->GetNoDataValue(&nHasNoData);
66,232✔
5296
        abHasNoData[iBand] = CPL_TO_BOOL(nHasNoData);
66,232✔
5297
    }
5298
    const bool bPropagateNoData =
5299
        CPLTestBool(CPLGetConfigOption("GDAL_OVR_PROPAGATE_NODATA", "NO"));
385✔
5300

5301
    const char *pszThreads = CPLGetConfigOption("GDAL_NUM_THREADS", "1");
385✔
5302
    const int nThreads = std::max(1, std::min(128, EQUAL(pszThreads, "ALL_CPUS")
1,540✔
5303
                                                       ? CPLGetNumCPUs()
385✔
5304
                                                       : atoi(pszThreads)));
385✔
5305
    auto poThreadPool =
5306
        nThreads > 1 ? GDALGetGlobalThreadPool(nThreads) : nullptr;
385✔
5307
    auto poJobQueue = poThreadPool ? poThreadPool->CreateJobQueue()
5308
                                   : std::unique_ptr<CPLJobQueue>(nullptr);
770✔
5309

5310
    // Only configurable for debug / testing
5311
    // Byte budget used below when deciding how far the destination chunk
    // width may be grown. Taken from the GDAL_OVR_CHUNK_MAX_SIZE
    // configuration option when set (parsed by CPLParseMemorySize and
    // floored at 100), otherwise 10 MB.
    const GIntBig nChunkMaxSize = []() -> GIntBig
    {
        constexpr GIntBig DEFAULT_CHUNK_MAX_SIZE = 10 * 1024 * 1024;
        constexpr GIntBig MIN_CHUNK_MAX_SIZE = 100;

        const char *pszVal =
            CPLGetConfigOption("GDAL_OVR_CHUNK_MAX_SIZE", nullptr);
        if (!pszVal)
            return DEFAULT_CHUNK_MAX_SIZE;

        GIntBig nRet = 0;
        if (CPLParseMemorySize(pszVal, &nRet, nullptr) != CE_None)
        {
            // An unparsable value must not silently collapse the budget to
            // the 100-byte floor: keep the regular default instead.
            CPLDebug("GDAL",
                     "Invalid value for GDAL_OVR_CHUNK_MAX_SIZE. "
                     "Using default value");
            return DEFAULT_CHUNK_MAX_SIZE;
        }
        return std::max<GIntBig>(MIN_CHUNK_MAX_SIZE, nRet);
    }();
385✔
5323

5324
    // Only configurable for debug / testing
5325
    // Byte budget above which the source-acquisition memory requirement is
    // considered too large, triggering the temporary-dataset code paths
    // below. Taken from the GDAL_OVR_CHUNK_MAX_SIZE_FOR_TEMP_FILE
    // configuration option when set (parsed by CPLParseMemorySize and
    // floored at 100), otherwise a tenth of the usable physical RAM, or
    // 100 MB when that amount is not known.
    const GIntBig nChunkMaxSizeForTempFile = []() -> GIntBig
    {
        const char *pszVal = CPLGetConfigOption(
            "GDAL_OVR_CHUNK_MAX_SIZE_FOR_TEMP_FILE", nullptr);
        if (pszVal)
        {
            GIntBig nRet = 0;
            if (CPLParseMemorySize(pszVal, &nRet, nullptr) == CE_None)
                return std::max<GIntBig>(100, nRet);

            // An unparsable value must not silently collapse the budget to
            // the 100-byte floor: fall through to the regular defaults.
            CPLDebug("GDAL",
                     "Invalid value for "
                     "GDAL_OVR_CHUNK_MAX_SIZE_FOR_TEMP_FILE. "
                     "Using default value");
        }
        const auto nUsableRAM = CPLGetUsablePhysicalRAM();
        if (nUsableRAM > 0)
            return nUsableRAM / 10;
        // Select a value to be able to at least downsample by 2 for a RGB
        // 1024x1024 tiled output: (2 * 1024 + 2) * (2 * 1024 + 2) * 3 = 12 MB
        return 100 * 1024 * 1024;
    }();
385✔
5342

5343
    // Second pass to do the real job.
5344
    double dfCurPixelCount = 0;
385✔
5345
    CPLErr eErr = CE_None;
385✔
5346
    for (int iOverview = 0; iOverview < nOverviews && eErr == CE_None;
1,025✔
5347
         ++iOverview)
5348
    {
5349
        int iSrcOverview = -1;  // -1 means the source bands.
645✔
5350

5351
        const int nDstTotalWidth =
5352
            papapoOverviewBands[0][iOverview]->GetXSize();
645✔
5353
        const int nDstTotalHeight =
5354
            papapoOverviewBands[0][iOverview]->GetYSize();
645✔
5355

5356
        // Compute the coordinates of the target region to refresh
5357
        constexpr double EPS = 1e-8;
645✔
5358
        const int nDstXOffStart = static_cast<int>(
645✔
5359
            static_cast<double>(nSrcXOff) / nToplevelSrcWidth * nDstTotalWidth +
645✔
5360
            EPS);
5361
        const int nDstXOffEnd =
5362
            std::min(static_cast<int>(
1,290✔
5363
                         std::ceil(static_cast<double>(nSrcXOff + nSrcXSize) /
645✔
5364
                                       nToplevelSrcWidth * nDstTotalWidth -
645✔
5365
                                   EPS)),
5366
                     nDstTotalWidth);
645✔
5367
        const int nDstWidth = nDstXOffEnd - nDstXOffStart;
645✔
5368
        const int nDstYOffStart =
645✔
5369
            static_cast<int>(static_cast<double>(nSrcYOff) /
645✔
5370
                                 nToplevelSrcHeight * nDstTotalHeight +
645✔
5371
                             EPS);
5372
        const int nDstYOffEnd =
5373
            std::min(static_cast<int>(
1,290✔
5374
                         std::ceil(static_cast<double>(nSrcYOff + nSrcYSize) /
645✔
5375
                                       nToplevelSrcHeight * nDstTotalHeight -
645✔
5376
                                   EPS)),
5377
                     nDstTotalHeight);
645✔
5378
        const int nDstHeight = nDstYOffEnd - nDstYOffStart;
645✔
5379

5380
        // Try to use previous level of overview as the source to compute
5381
        // the next level.
5382
        int nSrcWidth = nToplevelSrcWidth;
645✔
5383
        int nSrcHeight = nToplevelSrcHeight;
645✔
5384
        if (iOverview > 0 &&
905✔
5385
            papapoOverviewBands[0][iOverview - 1]->GetXSize() > nDstTotalWidth)
260✔
5386
        {
5387
            nSrcWidth = papapoOverviewBands[0][iOverview - 1]->GetXSize();
252✔
5388
            nSrcHeight = papapoOverviewBands[0][iOverview - 1]->GetYSize();
252✔
5389
            iSrcOverview = iOverview - 1;
252✔
5390
        }
5391

5392
        const double dfXRatioDstToSrc =
645✔
5393
            static_cast<double>(nSrcWidth) / nDstTotalWidth;
645✔
5394
        const double dfYRatioDstToSrc =
645✔
5395
            static_cast<double>(nSrcHeight) / nDstTotalHeight;
645✔
5396

5397
        const int nOvrFactor =
5398
            std::max(1, std::max(static_cast<int>(0.5 + dfXRatioDstToSrc),
1,935✔
5399
                                 static_cast<int>(0.5 + dfYRatioDstToSrc)));
645✔
5400

5401
        int nDstChunkXSize = 0;
645✔
5402
        int nDstChunkYSize = 0;
645✔
5403
        papapoOverviewBands[0][iOverview]->GetBlockSize(&nDstChunkXSize,
645✔
5404
                                                        &nDstChunkYSize);
5405

5406
        constexpr int PIXEL_MARGIN = 2;
645✔
5407
        // Try to extend the chunk size so that the memory needed to acquire
5408
        // source pixels goes up to 10 MB.
5409
        // This can help for drivers that support multi-threaded reading
5410
        const int nFullResYChunk = static_cast<int>(std::min<double>(
645✔
5411
            nSrcHeight, PIXEL_MARGIN + nDstChunkYSize * dfYRatioDstToSrc));
645✔
5412
        const int nFullResYChunkQueried = static_cast<int>(std::min<int64_t>(
645✔
5413
            nSrcHeight,
1,290✔
5414
            nFullResYChunk + static_cast<int64_t>(RADIUS_TO_DIAMETER) *
1,290✔
5415
                                 nKernelRadius * nOvrFactor));
645✔
5416
        while (nDstChunkXSize < nDstWidth)
881✔
5417
        {
5418
            constexpr int INCREASE_FACTOR = 2;
255✔
5419

5420
            const int nFullResXChunk = static_cast<int>(std::min<double>(
255✔
5421
                nSrcWidth, PIXEL_MARGIN + INCREASE_FACTOR * nDstChunkXSize *
510✔
5422
                                              dfXRatioDstToSrc));
255✔
5423

5424
            const int nFullResXChunkQueried =
5425
                static_cast<int>(std::min<int64_t>(
255✔
5426
                    nSrcWidth,
510✔
5427
                    nFullResXChunk + static_cast<int64_t>(RADIUS_TO_DIAMETER) *
510✔
5428
                                         nKernelRadius * nOvrFactor));
255✔
5429

5430
            if (nBands > nChunkMaxSize / nFullResXChunkQueried /
255✔
5431
                             nFullResYChunkQueried / nWrkDataTypeSize)
255✔
5432
            {
5433
                break;
19✔
5434
            }
5435

5436
            nDstChunkXSize *= INCREASE_FACTOR;
236✔
5437
        }
5438
        nDstChunkXSize = std::min(nDstChunkXSize, nDstWidth);
645✔
5439

5440
        const int nFullResXChunk = static_cast<int>(std::min<double>(
645✔
5441
            nSrcWidth, PIXEL_MARGIN + nDstChunkXSize * dfXRatioDstToSrc));
645✔
5442
        const int nFullResXChunkQueried = static_cast<int>(std::min<int64_t>(
645✔
5443
            nSrcWidth,
1,290✔
5444
            nFullResXChunk + static_cast<int64_t>(RADIUS_TO_DIAMETER) *
1,290✔
5445
                                 nKernelRadius * nOvrFactor));
645✔
5446

5447
        // Make sure that the RAM requirements to acquire the source data does
5448
        // not exceed nChunkMaxSizeForTempFile
5449
        // If so, reduce the destination chunk size, generate overviews in a
5450
        // temporary dataset, and copy that temporary dataset over the target
5451
        // overview bands (to avoid issues with lossy compression)
5452
        const bool bOverflowFullResXChunkYChunkQueried =
5453
            nBands > std::numeric_limits<int64_t>::max() /
645✔
5454
                         nFullResXChunkQueried / nFullResYChunkQueried /
645✔
5455
                         nWrkDataTypeSize;
645✔
5456

5457
        const auto nMemRequirement =
645✔
5458
            bOverflowFullResXChunkYChunkQueried
5459
                ? 0
645✔
5460
                : static_cast<GIntBig>(nFullResXChunkQueried) *
641✔
5461
                      nFullResYChunkQueried * nBands * nWrkDataTypeSize;
641✔
5462
        // Use a temporary dataset with a smaller destination chunk size
5463
        const auto nOverShootFactor =
645✔
5464
            nMemRequirement / nChunkMaxSizeForTempFile;
5465

5466
        constexpr int MIN_OVERSHOOT_FACTOR = 4;
645✔
5467
        const auto nSqrtOverShootFactor = std::max<GIntBig>(
5468
            MIN_OVERSHOOT_FACTOR, static_cast<GIntBig>(std::ceil(std::sqrt(
1,290✔
5469
                                      static_cast<double>(nOverShootFactor)))));
645✔
5470
        constexpr int DEFAULT_CHUNK_SIZE = 256;
645✔
5471
        constexpr int GTIFF_BLOCK_SIZE_MULTIPLE = 16;
645✔
5472
        const int nReducedDstChunkXSize =
5473
            bOverflowFullResXChunkYChunkQueried
5474
                ? DEFAULT_CHUNK_SIZE
1,286✔
5475
                : std::max(1, static_cast<int>(nDstChunkXSize /
1,286✔
5476
                                               nSqrtOverShootFactor) &
1,286✔
5477
                                  ~(GTIFF_BLOCK_SIZE_MULTIPLE - 1));
641✔
5478
        const int nReducedDstChunkYSize =
5479
            bOverflowFullResXChunkYChunkQueried
5480
                ? DEFAULT_CHUNK_SIZE
1,286✔
5481
                : std::max(1, static_cast<int>(nDstChunkYSize /
1,286✔
5482
                                               nSqrtOverShootFactor) &
1,286✔
5483
                                  ~(GTIFF_BLOCK_SIZE_MULTIPLE - 1));
641✔
5484

5485
        if (bOverflowFullResXChunkYChunkQueried ||
645✔
5486
            nMemRequirement > nChunkMaxSizeForTempFile)
5487
        {
5488
            const auto nDTSize =
5489
                std::max(1, GDALGetDataTypeSizeBytes(eDataType));
43✔
5490
            const bool bTmpDSMemRequirementOverflow =
5491
                nBands > std::numeric_limits<int64_t>::max() / nDstWidth /
43✔
5492
                             nDstHeight / nDTSize;
43✔
5493
            const auto nTmpDSMemRequirement =
43✔
5494
                bTmpDSMemRequirementOverflow
5495
                    ? 0
43✔
5496
                    : static_cast<GIntBig>(nDstWidth) * nDstHeight * nBands *
41✔
5497
                          nDTSize;
41✔
5498

5499
            // make sure that one band buffer doesn't overflow size_t
5500
            const bool bChunkSizeOverflow =
5501
                static_cast<size_t>(nDTSize) >
43✔
5502
                std::numeric_limits<size_t>::max() / nDstWidth / nDstHeight;
43✔
5503
            const size_t nChunkSize =
43✔
5504
                bChunkSizeOverflow
5505
                    ? 0
43✔
5506
                    : static_cast<size_t>(nDstWidth) * nDstHeight * nDTSize;
41✔
5507

5508
            const auto CreateVRT =
5509
                [nBands, nSrcWidth, nSrcHeight, nDstTotalWidth, nDstTotalHeight,
41✔
5510
                 pszResampling, eWrkDataType, papoSrcBands, papapoOverviewBands,
5511
                 iSrcOverview, &abHasNoData,
5512
                 &adfNoDataValue](int nVRTBlockXSize, int nVRTBlockYSize)
393,585✔
5513
            {
5514
                auto poVRTDS = std::make_unique<VRTDataset>(
5515
                    nDstTotalWidth, nDstTotalHeight, nVRTBlockXSize,
41✔
5516
                    nVRTBlockYSize);
41✔
5517

5518
                for (int iBand = 0; iBand < nBands; ++iBand)
65,620✔
5519
                {
5520
                    auto poVRTSrc = std::make_unique<VRTSimpleSource>();
131,158✔
5521
                    poVRTSrc->SetResampling(pszResampling);
65,579✔
5522
                    poVRTDS->AddBand(eWrkDataType);
65,579✔
5523
                    auto poVRTBand = static_cast<VRTSourcedRasterBand *>(
5524
                        poVRTDS->GetRasterBand(iBand + 1));
65,579✔
5525

5526
                    auto poSrcBand = papoSrcBands[iBand];
65,579✔
5527
                    if (iSrcOverview != -1)
65,579✔
5528
                        poSrcBand = papapoOverviewBands[iBand][iSrcOverview];
24✔
5529
                    poVRTBand->ConfigureSource(
65,579✔
5530
                        poVRTSrc.get(), poSrcBand, false, 0, 0, nSrcWidth,
5531
                        nSrcHeight, 0, 0, nDstTotalWidth, nDstTotalHeight);
5532
                    // Add the source to the band
5533
                    poVRTBand->AddSource(poVRTSrc.release());
65,579✔
5534
                    if (abHasNoData[iBand])
65,579✔
5535
                        poVRTBand->SetNoDataValue(adfNoDataValue[iBand]);
3✔
5536
                }
5537

5538
                if (papoSrcBands[0]->GetMaskFlags() == GMF_PER_DATASET &&
42✔
5539
                    poVRTDS->CreateMaskBand(GMF_PER_DATASET) == CE_None)
1✔
5540
                {
5541
                    VRTSourcedRasterBand *poMaskVRTBand =
5542
                        cpl::down_cast<VRTSourcedRasterBand *>(
1✔
5543
                            poVRTDS->GetRasterBand(1)->GetMaskBand());
1✔
5544
                    auto poSrcBand = papoSrcBands[0];
1✔
5545
                    if (iSrcOverview != -1)
1✔
5546
                        poSrcBand = papapoOverviewBands[0][iSrcOverview];
×
5547
                    poMaskVRTBand->AddMaskBandSource(
1✔
5548
                        poSrcBand->GetMaskBand(), 0, 0, nSrcWidth, nSrcHeight,
1✔
5549
                        0, 0, nDstTotalWidth, nDstTotalHeight);
5550
                }
5551

5552
                return poVRTDS;
41✔
5553
            };
43✔
5554

5555
            // If the overview accommodates chunking, do so and recurse
5556
            // to avoid generating full size temporary files
5557
            if (!bOverflowFullResXChunkYChunkQueried &&
43✔
5558
                !bTmpDSMemRequirementOverflow && !bChunkSizeOverflow &&
39✔
5559
                (nDstChunkXSize < nDstWidth || nDstChunkYSize < nDstHeight))
39✔
5560
            {
5561
                // Create a VRT with the smaller chunk to do the scaling
5562
                auto poVRTDS =
5563
                    CreateVRT(nReducedDstChunkXSize, nReducedDstChunkYSize);
13✔
5564

5565
                std::vector<GDALRasterBand *> apoVRTBand(nBands);
13✔
5566
                std::vector<GDALRasterBand *> apoDstBand(nBands);
13✔
5567
                for (int iBand = 0; iBand < nBands; ++iBand)
65,560✔
5568
                {
5569
                    apoDstBand[iBand] = papapoOverviewBands[iBand][iOverview];
65,547✔
5570
                    apoVRTBand[iBand] = poVRTDS->GetRasterBand(iBand + 1);
65,547✔
5571
                }
5572

5573
                // Use a flag to avoid reading from the overview being built
5574
                GDALRasterIOExtraArg sExtraArg;
5575
                INIT_RASTERIO_EXTRA_ARG(sExtraArg);
13✔
5576
                if (iSrcOverview == -1)
13✔
5577
                    sExtraArg.bUseOnlyThisScale = true;
13✔
5578

5579
                // A single band buffer for data transfer to the overview
5580
                std::vector<GByte> abyChunk;
13✔
5581
                try
5582
                {
5583
                    abyChunk.resize(nChunkSize);
13✔
5584
                }
5585
                catch (const std::exception &)
×
5586
                {
5587
                    CPLError(CE_Failure, CPLE_OutOfMemory,
×
5588
                             "Out of memory allocating temporary buffer");
5589
                    return CE_Failure;
×
5590
                }
5591

5592
                // Loop over output height, in chunks
5593
                for (int nDstYOff = nDstYOffStart;
13✔
5594
                     nDstYOff < nDstYOffEnd && eErr == CE_None;
38✔
5595
                     /* */)
5596
                {
5597
                    const int nDstYCount =
5598
                        std::min(nDstChunkYSize, nDstYOffEnd - nDstYOff);
25✔
5599
                    // Loop over output width, in output chunks
5600
                    for (int nDstXOff = nDstXOffStart;
25✔
5601
                         nDstXOff < nDstXOffEnd && eErr == CE_None;
74✔
5602
                         /* */)
5603
                    {
5604
                        const int nDstXCount =
5605
                            std::min(nDstChunkXSize, nDstXOffEnd - nDstXOff);
49✔
5606
                        // Read and transfer the chunk to the overview
5607
                        for (int iBand = 0; iBand < nBands && eErr == CE_None;
98✔
5608
                             ++iBand)
5609
                        {
5610
                            eErr = apoVRTBand[iBand]->RasterIO(
98✔
5611
                                GF_Read, nDstXOff, nDstYOff, nDstXCount,
5612
                                nDstYCount, abyChunk.data(), nDstXCount,
49✔
5613
                                nDstYCount, eDataType, 0, 0, &sExtraArg);
5614
                            if (eErr == CE_None)
49✔
5615
                            {
5616
                                eErr = apoDstBand[iBand]->RasterIO(
96✔
5617
                                    GF_Write, nDstXOff, nDstYOff, nDstXCount,
5618
                                    nDstYCount, abyChunk.data(), nDstXCount,
48✔
5619
                                    nDstYCount, eDataType, 0, 0, nullptr);
5620
                            }
5621
                        }
5622

5623
                        dfCurPixelCount +=
49✔
5624
                            static_cast<double>(nDstXCount) * nDstYCount;
49✔
5625

5626
                        nDstXOff += nDstXCount;
49✔
5627
                    }  // width
5628

5629
                    if (!pfnProgress(dfCurPixelCount / dfTotalPixelCount,
25✔
5630
                                     nullptr, pProgressData))
5631
                    {
5632
                        CPLError(CE_Failure, CPLE_UserInterrupt,
×
5633
                                 "User terminated");
5634
                        eErr = CE_Failure;
×
5635
                    }
5636

5637
                    nDstYOff += nDstYCount;
25✔
5638
                }  // height
5639

5640
                if (CE_None != eErr)
13✔
5641
                {
5642
                    CPLError(CE_Failure, CPLE_AppDefined,
1✔
5643
                             "Error while writing overview");
5644
                    return CE_Failure;
1✔
5645
                }
5646

5647
                pfnProgress(1.0, nullptr, pProgressData);
12✔
5648
                // Flush the overviews we just generated
5649
                for (int iBand = 0; iBand < nBands; ++iBand)
24✔
5650
                    apoDstBand[iBand]->FlushCache(false);
12✔
5651

5652
                continue;  // Next overview
12✔
5653
            }              // chunking via temporary dataset
5654

5655
            std::unique_ptr<GDALDataset> poTmpDS;
×
5656
            // Config option mostly/only for autotest purposes
5657
            const char *pszGDAL_OVR_TEMP_DRIVER =
5658
                CPLGetConfigOption("GDAL_OVR_TEMP_DRIVER", "");
30✔
5659
            if ((!bTmpDSMemRequirementOverflow &&
30✔
5660
                 nTmpDSMemRequirement <= nChunkMaxSizeForTempFile &&
4✔
5661
                 !EQUAL(pszGDAL_OVR_TEMP_DRIVER, "GTIFF")) ||
4✔
5662
                EQUAL(pszGDAL_OVR_TEMP_DRIVER, "MEM"))
26✔
5663
            {
5664
                auto poTmpDrv = GetGDALDriverManager()->GetDriverByName("MEM");
10✔
5665
                if (!poTmpDrv)
10✔
5666
                {
5667
                    eErr = CE_Failure;
×
5668
                    break;
×
5669
                }
5670
                poTmpDS.reset(poTmpDrv->Create("", nDstTotalWidth,
10✔
5671
                                               nDstTotalHeight, nBands,
5672
                                               eDataType, nullptr));
10✔
5673
            }
5674
            else
5675
            {
5676
                // Create a temporary file for the overview
5677
                auto poTmpDrv =
5678
                    GetGDALDriverManager()->GetDriverByName("GTiff");
20✔
5679
                if (!poTmpDrv)
20✔
5680
                {
5681
                    eErr = CE_Failure;
×
5682
                    break;
×
5683
                }
5684
                std::string osTmpFilename;
40✔
5685
                auto poDstDS = papapoOverviewBands[0][0]->GetDataset();
20✔
5686
                if (poDstDS)
20✔
5687
                {
5688
                    osTmpFilename = poDstDS->GetDescription();
20✔
5689
                    VSIStatBufL sStatBuf;
5690
                    if (!osTmpFilename.empty() &&
20✔
5691
                        VSIStatL(osTmpFilename.c_str(), &sStatBuf) == 0)
×
5692
                        osTmpFilename += "_tmp_ovr.tif";
×
5693
                }
5694
                if (osTmpFilename.empty())
20✔
5695
                {
5696
                    osTmpFilename = CPLGenerateTempFilenameSafe(nullptr);
20✔
5697
                    osTmpFilename += ".tif";
20✔
5698
                }
5699
                CPLDebug("GDAL", "Creating temporary file %s of %d x %d x %d",
20✔
5700
                         osTmpFilename.c_str(), nDstWidth, nDstHeight, nBands);
5701
                CPLStringList aosCO;
40✔
5702
                if (0 == ((nReducedDstChunkXSize % GTIFF_BLOCK_SIZE_MULTIPLE) |
20✔
5703
                          (nReducedDstChunkYSize % GTIFF_BLOCK_SIZE_MULTIPLE)))
20✔
5704
                {
5705
                    aosCO.SetNameValue("TILED", "YES");
14✔
5706
                    aosCO.SetNameValue("BLOCKXSIZE",
5707
                                       CPLSPrintf("%d", nReducedDstChunkXSize));
14✔
5708
                    aosCO.SetNameValue("BLOCKYSIZE",
5709
                                       CPLSPrintf("%d", nReducedDstChunkYSize));
14✔
5710
                }
5711
                if (const char *pszCOList =
20✔
5712
                        poTmpDrv->GetMetadataItem(GDAL_DMD_CREATIONOPTIONLIST))
20✔
5713
                {
5714
                    aosCO.SetNameValue(
5715
                        "COMPRESS", strstr(pszCOList, "ZSTD") ? "ZSTD" : "LZW");
20✔
5716
                }
5717
                poTmpDS.reset(poTmpDrv->Create(osTmpFilename.c_str(), nDstWidth,
20✔
5718
                                               nDstHeight, nBands, eDataType,
5719
                                               aosCO.List()));
20✔
5720
                if (poTmpDS)
20✔
5721
                {
5722
                    poTmpDS->MarkSuppressOnClose();
18✔
5723
                    VSIUnlink(osTmpFilename.c_str());
18✔
5724
                }
5725
            }
5726
            if (!poTmpDS)
30✔
5727
            {
5728
                eErr = CE_Failure;
2✔
5729
                break;
2✔
5730
            }
5731

5732
            // Create a full size VRT to do the resampling without edge effects
5733
            auto poVRTDS =
5734
                CreateVRT(nReducedDstChunkXSize, nReducedDstChunkYSize);
28✔
5735

5736
            // Allocate a band buffer with the overview chunk size
5737
            std::unique_ptr<void, VSIFreeReleaser> pDstBuffer(
5738
                VSI_MALLOC3_VERBOSE(size_t(nWrkDataTypeSize), nDstChunkXSize,
5739
                                    nDstChunkYSize));
28✔
5740
            if (pDstBuffer == nullptr)
28✔
5741
            {
5742
                eErr = CE_Failure;
×
5743
                break;
×
5744
            }
5745

5746
            // Use a flag to avoid reading the overview being built
5747
            GDALRasterIOExtraArg sExtraArg;
5748
            INIT_RASTERIO_EXTRA_ARG(sExtraArg);
28✔
5749
            if (iSrcOverview == -1)
28✔
5750
                sExtraArg.bUseOnlyThisScale = true;
4✔
5751

5752
            // Scale and copy data from the VRT to the temp file
5753
            for (int nDstYOff = nDstYOffStart;
28✔
5754
                 nDstYOff < nDstYOffEnd && eErr == CE_None;
914✔
5755
                 /* */)
5756
            {
5757
                const int nDstYCount =
5758
                    std::min(nReducedDstChunkYSize, nDstYOffEnd - nDstYOff);
886✔
5759
                for (int nDstXOff = nDstXOffStart;
886✔
5760
                     nDstXOff < nDstXOffEnd && eErr == CE_None;
201,218✔
5761
                     /* */)
5762
                {
5763
                    const int nDstXCount =
5764
                        std::min(nReducedDstChunkXSize, nDstXOffEnd - nDstXOff);
200,332✔
5765
                    for (int iBand = 0; iBand < nBands && eErr == CE_None;
400,668✔
5766
                         ++iBand)
5767
                    {
5768
                        auto poSrcBand = poVRTDS->GetRasterBand(iBand + 1);
200,336✔
5769
                        eErr = poSrcBand->RasterIO(
200,336✔
5770
                            GF_Read, nDstXOff, nDstYOff, nDstXCount, nDstYCount,
5771
                            pDstBuffer.get(), nDstXCount, nDstYCount,
5772
                            eWrkDataType, 0, 0, &sExtraArg);
5773
                        if (eErr == CE_None)
200,336✔
5774
                        {
5775
                            // Write to the temporary dataset, shifted
5776
                            auto poOvrBand = poTmpDS->GetRasterBand(iBand + 1);
200,334✔
5777
                            eErr = poOvrBand->RasterIO(
200,334✔
5778
                                GF_Write, nDstXOff - nDstXOffStart,
5779
                                nDstYOff - nDstYOffStart, nDstXCount,
5780
                                nDstYCount, pDstBuffer.get(), nDstXCount,
5781
                                nDstYCount, eWrkDataType, 0, 0, nullptr);
5782
                        }
5783
                    }
5784
                    nDstXOff += nDstXCount;
200,332✔
5785
                }
5786
                nDstYOff += nDstYCount;
886✔
5787
            }
5788

5789
            // Copy from the temporary to the overview
5790
            for (int nDstYOff = nDstYOffStart;
28✔
5791
                 nDstYOff < nDstYOffEnd && eErr == CE_None;
54✔
5792
                 /* */)
5793
            {
5794
                const int nDstYCount =
5795
                    std::min(nDstChunkYSize, nDstYOffEnd - nDstYOff);
26✔
5796
                for (int nDstXOff = nDstXOffStart;
26✔
5797
                     nDstXOff < nDstXOffEnd && eErr == CE_None;
52✔
5798
                     /* */)
5799
                {
5800
                    const int nDstXCount =
5801
                        std::min(nDstChunkXSize, nDstXOffEnd - nDstXOff);
26✔
5802
                    for (int iBand = 0; iBand < nBands && eErr == CE_None;
56✔
5803
                         ++iBand)
5804
                    {
5805
                        auto poSrcBand = poTmpDS->GetRasterBand(iBand + 1);
30✔
5806
                        eErr = poSrcBand->RasterIO(
30✔
5807
                            GF_Read, nDstXOff - nDstXOffStart,
5808
                            nDstYOff - nDstYOffStart, nDstXCount, nDstYCount,
5809
                            pDstBuffer.get(), nDstXCount, nDstYCount,
5810
                            eWrkDataType, 0, 0, nullptr);
5811
                        if (eErr == CE_None)
30✔
5812
                        {
5813
                            // Write to the destination overview bands
5814
                            auto poOvrBand =
30✔
5815
                                papapoOverviewBands[iBand][iOverview];
30✔
5816
                            eErr = poOvrBand->RasterIO(
30✔
5817
                                GF_Write, nDstXOff, nDstYOff, nDstXCount,
5818
                                nDstYCount, pDstBuffer.get(), nDstXCount,
5819
                                nDstYCount, eWrkDataType, 0, 0, nullptr);
5820
                        }
5821
                    }
5822
                    nDstXOff += nDstXCount;
26✔
5823
                }
5824
                nDstYOff += nDstYCount;
26✔
5825
            }
5826

5827
            if (eErr != CE_None)
28✔
5828
            {
5829
                CPLError(CE_Failure, CPLE_AppDefined,
2✔
5830
                         "Failed to write overview %d", iOverview);
5831
                return eErr;
2✔
5832
            }
5833

5834
            // Flush the data to overviews.
5835
            for (int iBand = 0; iBand < nBands; ++iBand)
56✔
5836
                papapoOverviewBands[iBand][iOverview]->FlushCache(false);
30✔
5837

5838
            continue;
26✔
5839
        }
5840

5841
        // Structure describing a resampling job
5842
        struct OvrJob
5843
        {
5844
            // Buffers to free when job is finished
5845
            std::unique_ptr<PointerHolder> oSrcMaskBufferHolder{};
5846
            std::unique_ptr<PointerHolder> oSrcBufferHolder{};
5847
            std::unique_ptr<PointerHolder> oDstBufferHolder{};
5848

5849
            GDALRasterBand *poDstBand = nullptr;
5850

5851
            // Input parameters of pfnResampleFn
5852
            GDALResampleFunction pfnResampleFn = nullptr;
5853
            GDALOverviewResampleArgs args{};
5854
            const void *pChunk = nullptr;
5855

5856
            // Output values of resampling function
5857
            CPLErr eErr = CE_Failure;
5858
            void *pDstBuffer = nullptr;
5859
            GDALDataType eDstBufferDataType = GDT_Unknown;
5860

5861
            void NotifyFinished()
3,310✔
5862
            {
5863
                std::lock_guard guard(mutex);
6,620✔
5864
                bFinished = true;
3,310✔
5865
                cv.notify_one();
3,310✔
5866
            }
3,310✔
5867

5868
            bool IsFinished()
2✔
5869
            {
5870
                std::lock_guard guard(mutex);
2✔
5871
                return bFinished;
4✔
5872
            }
5873

5874
            void WaitFinished()
16✔
5875
            {
5876
                std::unique_lock oGuard(mutex);
32✔
5877
                while (!bFinished)
23✔
5878
                {
5879
                    cv.wait(oGuard);
7✔
5880
                }
5881
            }
16✔
5882

5883
          private:
5884
            // Synchronization
5885
            bool bFinished = false;
5886
            std::mutex mutex{};
5887
            std::condition_variable cv{};
5888
        };
5889

5890
        // Thread function to resample
5891
        const auto JobResampleFunc = [](void *pData)
3,310✔
5892
        {
5893
            OvrJob *poJob = static_cast<OvrJob *>(pData);
3,310✔
5894

5895
            poJob->eErr = poJob->pfnResampleFn(poJob->args, poJob->pChunk,
3,310✔
5896
                                               &(poJob->pDstBuffer),
5897
                                               &(poJob->eDstBufferDataType));
5898

5899
            poJob->oDstBufferHolder.reset(new PointerHolder(poJob->pDstBuffer));
3,310✔
5900

5901
            poJob->NotifyFinished();
3,310✔
5902
        };
3,310✔
5903

5904
        // Function to write resample data to target band
5905
        // Writes the output buffer of a job into its destination band, over
        // the window [nDstXOff, nDstXOff2) x [nDstYOff, nDstYOff2) recorded
        // in the job arguments. Returns the status of the RasterIO() call.
        const auto WriteJobData = [](const OvrJob *poJob)
        {
            const auto &sArgs = poJob->args;
            const int nWinXSize = sArgs.nDstXOff2 - sArgs.nDstXOff;
            const int nWinYSize = sArgs.nDstYOff2 - sArgs.nDstYOff;
            return poJob->poDstBand->RasterIO(
                GF_Write, sArgs.nDstXOff, sArgs.nDstYOff, nWinXSize,
                nWinYSize, poJob->pDstBuffer, nWinXSize, nWinYSize,
                poJob->eDstBufferDataType, 0, 0, nullptr);
        };
5915

5916
        // Wait for completion of oldest job and serialize it
5917
        const auto WaitAndFinalizeOldestJob =
5918
            [WriteJobData](std::list<std::unique_ptr<OvrJob>> &jobList)
16✔
5919
        {
5920
            auto poOldestJob = jobList.front().get();
16✔
5921
            poOldestJob->WaitFinished();
16✔
5922
            CPLErr l_eErr = poOldestJob->eErr;
16✔
5923
            if (l_eErr == CE_None)
16✔
5924
            {
5925
                l_eErr = WriteJobData(poOldestJob);
16✔
5926
            }
5927

5928
            jobList.pop_front();
16✔
5929
            return l_eErr;
16✔
5930
        };
5931

5932
        // Queue of jobs
5933
        std::list<std::unique_ptr<OvrJob>> jobList;
1,204✔
5934

5935
        std::vector<std::unique_ptr<void, VSIFreeReleaser>> apaChunk(nBands);
1,204✔
5936
        std::vector<std::unique_ptr<GByte, VSIFreeReleaser>>
5937
            apabyChunkNoDataMask(nBands);
1,204✔
5938

5939
        // Iterate on destination overview, block by block.
5940
        for (int nDstYOff = nDstYOffStart;
602✔
5941
             nDstYOff < nDstYOffEnd && eErr == CE_None;
2,111✔
5942
             nDstYOff += nDstChunkYSize)
1,509✔
5943
        {
5944
            int nDstYCount;
5945
            if (nDstYOff + nDstChunkYSize <= nDstYOffEnd)
1,509✔
5946
                nDstYCount = nDstChunkYSize;
1,099✔
5947
            else
5948
                nDstYCount = nDstYOffEnd - nDstYOff;
410✔
5949

5950
            int nChunkYOff = static_cast<int>(nDstYOff * dfYRatioDstToSrc);
1,509✔
5951
            int nChunkYOff2 = static_cast<int>(
1,509✔
5952
                ceil((nDstYOff + nDstYCount) * dfYRatioDstToSrc));
1,509✔
5953
            if (nChunkYOff2 > nSrcHeight ||
1,509✔
5954
                nDstYOff + nDstYCount == nDstTotalHeight)
1,509✔
5955
                nChunkYOff2 = nSrcHeight;
595✔
5956
            int nYCount = nChunkYOff2 - nChunkYOff;
1,509✔
5957
            CPLAssert(nYCount <= nFullResYChunk);
1,509✔
5958

5959
            int nChunkYOffQueried = nChunkYOff - nKernelRadius * nOvrFactor;
1,509✔
5960
            int nChunkYSizeQueried =
1,509✔
5961
                nYCount + RADIUS_TO_DIAMETER * nKernelRadius * nOvrFactor;
1,509✔
5962
            if (nChunkYOffQueried < 0)
1,509✔
5963
            {
5964
                nChunkYSizeQueried += nChunkYOffQueried;
148✔
5965
                nChunkYOffQueried = 0;
148✔
5966
            }
5967
            if (nChunkYSizeQueried + nChunkYOffQueried > nSrcHeight)
1,509✔
5968
                nChunkYSizeQueried = nSrcHeight - nChunkYOffQueried;
147✔
5969
            CPLAssert(nChunkYSizeQueried <= nFullResYChunkQueried);
1,509✔
5970

5971
            if (!pfnProgress(std::min(1.0, dfCurPixelCount / dfTotalPixelCount),
1,509✔
5972
                             nullptr, pProgressData))
5973
            {
5974
                CPLError(CE_Failure, CPLE_UserInterrupt, "User terminated");
1✔
5975
                eErr = CE_Failure;
1✔
5976
            }
5977

5978
            // Iterate on destination overview, block by block.
5979
            for (int nDstXOff = nDstXOffStart;
1,509✔
5980
                 nDstXOff < nDstXOffEnd && eErr == CE_None;
3,057✔
5981
                 nDstXOff += nDstChunkXSize)
1,548✔
5982
            {
5983
                int nDstXCount = 0;
1,548✔
5984
                if (nDstXOff + nDstChunkXSize <= nDstXOffEnd)
1,548✔
5985
                    nDstXCount = nDstChunkXSize;
1,531✔
5986
                else
5987
                    nDstXCount = nDstXOffEnd - nDstXOff;
17✔
5988

5989
                dfCurPixelCount += static_cast<double>(nDstXCount) * nDstYCount;
1,548✔
5990

5991
                int nChunkXOff = static_cast<int>(nDstXOff * dfXRatioDstToSrc);
1,548✔
5992
                int nChunkXOff2 = static_cast<int>(
1,548✔
5993
                    ceil((nDstXOff + nDstXCount) * dfXRatioDstToSrc));
1,548✔
5994
                if (nChunkXOff2 > nSrcWidth ||
1,548✔
5995
                    nDstXOff + nDstXCount == nDstTotalWidth)
1,548✔
5996
                    nChunkXOff2 = nSrcWidth;
1,473✔
5997
                const int nXCount = nChunkXOff2 - nChunkXOff;
1,548✔
5998
                CPLAssert(nXCount <= nFullResXChunk);
1,548✔
5999

6000
                int nChunkXOffQueried = nChunkXOff - nKernelRadius * nOvrFactor;
1,548✔
6001
                int nChunkXSizeQueried =
1,548✔
6002
                    nXCount + RADIUS_TO_DIAMETER * nKernelRadius * nOvrFactor;
1,548✔
6003
                if (nChunkXOffQueried < 0)
1,548✔
6004
                {
6005
                    nChunkXSizeQueried += nChunkXOffQueried;
208✔
6006
                    nChunkXOffQueried = 0;
208✔
6007
                }
6008
                if (nChunkXSizeQueried + nChunkXOffQueried > nSrcWidth)
1,548✔
6009
                    nChunkXSizeQueried = nSrcWidth - nChunkXOffQueried;
217✔
6010
                CPLAssert(nChunkXSizeQueried <= nFullResXChunkQueried);
1,548✔
6011
#if DEBUG_VERBOSE
6012
                CPLDebug("GDAL",
6013
                         "Reading (%dx%d -> %dx%d) for output (%dx%d -> %dx%d)",
6014
                         nChunkXOffQueried, nChunkYOffQueried,
6015
                         nChunkXSizeQueried, nChunkYSizeQueried, nDstXOff,
6016
                         nDstYOff, nDstXCount, nDstYCount);
6017
#endif
6018

6019
                // Avoid accumulating too many tasks and exhaust RAM
6020

6021
                // Try to complete already finished jobs
6022
                while (eErr == CE_None && !jobList.empty())
1,548✔
6023
                {
6024
                    auto poOldestJob = jobList.front().get();
2✔
6025
                    if (!poOldestJob->IsFinished())
2✔
6026
                        break;
2✔
UNCOV
6027
                    eErr = poOldestJob->eErr;
×
UNCOV
6028
                    if (eErr == CE_None)
×
6029
                    {
UNCOV
6030
                        eErr = WriteJobData(poOldestJob);
×
6031
                    }
6032

UNCOV
6033
                    jobList.pop_front();
×
6034
                }
6035

6036
                // And in case we have saturated the number of threads,
6037
                // wait for completion of tasks to go below the threshold.
6038
                while (eErr == CE_None &&
3,096✔
6039
                       jobList.size() >= static_cast<size_t>(nThreads))
1,548✔
6040
                {
6041
                    eErr = WaitAndFinalizeOldestJob(jobList);
×
6042
                }
6043

6044
                // Read the source buffers for all the bands.
6045
                for (int iBand = 0; iBand < nBands && eErr == CE_None; ++iBand)
4,859✔
6046
                {
6047
                    // (Re)allocate buffers if needed
6048
                    if (apaChunk[iBand] == nullptr)
3,311✔
6049
                    {
6050
                        apaChunk[iBand].reset(VSI_MALLOC3_VERBOSE(
1,179✔
6051
                            nFullResXChunkQueried, nFullResYChunkQueried,
6052
                            nWrkDataTypeSize));
6053
                        if (apaChunk[iBand] == nullptr)
1,179✔
6054
                        {
6055
                            eErr = CE_Failure;
×
6056
                        }
6057
                    }
6058
                    if (bUseNoDataMask &&
3,652✔
6059
                        apabyChunkNoDataMask[iBand] == nullptr)
341✔
6060
                    {
6061
                        apabyChunkNoDataMask[iBand].reset(
282✔
6062
                            static_cast<GByte *>(VSI_MALLOC2_VERBOSE(
282✔
6063
                                nFullResXChunkQueried, nFullResYChunkQueried)));
6064
                        if (apabyChunkNoDataMask[iBand] == nullptr)
282✔
6065
                        {
6066
                            eErr = CE_Failure;
×
6067
                        }
6068
                    }
6069

6070
                    if (eErr == CE_None)
3,311✔
6071
                    {
6072
                        GDALRasterBand *poSrcBand = nullptr;
3,311✔
6073
                        if (iSrcOverview == -1)
3,311✔
6074
                            poSrcBand = papoSrcBands[iBand];
2,409✔
6075
                        else
6076
                            poSrcBand =
902✔
6077
                                papapoOverviewBands[iBand][iSrcOverview];
902✔
6078
                        eErr = poSrcBand->RasterIO(
3,311✔
6079
                            GF_Read, nChunkXOffQueried, nChunkYOffQueried,
6080
                            nChunkXSizeQueried, nChunkYSizeQueried,
6081
                            apaChunk[iBand].get(), nChunkXSizeQueried,
3,311✔
6082
                            nChunkYSizeQueried, eWrkDataType, 0, 0, nullptr);
6083

6084
                        if (bUseNoDataMask && eErr == CE_None)
3,311✔
6085
                        {
6086
                            auto poMaskBand = poSrcBand->IsMaskBand()
341✔
6087
                                                  ? poSrcBand
341✔
6088
                                                  : poSrcBand->GetMaskBand();
262✔
6089
                            eErr = poMaskBand->RasterIO(
341✔
6090
                                GF_Read, nChunkXOffQueried, nChunkYOffQueried,
6091
                                nChunkXSizeQueried, nChunkYSizeQueried,
6092
                                apabyChunkNoDataMask[iBand].get(),
341✔
6093
                                nChunkXSizeQueried, nChunkYSizeQueried,
6094
                                GDT_Byte, 0, 0, nullptr);
6095
                        }
6096
                    }
6097
                }
6098

6099
                // Compute the resulting overview block.
6100
                for (int iBand = 0; iBand < nBands && eErr == CE_None; ++iBand)
4,858✔
6101
                {
6102
                    auto poJob = std::make_unique<OvrJob>();
6,620✔
6103
                    poJob->pfnResampleFn = pfnResampleFn;
3,310✔
6104
                    poJob->poDstBand = papapoOverviewBands[iBand][iOverview];
3,310✔
6105
                    poJob->args.eOvrDataType =
6,620✔
6106
                        poJob->poDstBand->GetRasterDataType();
3,310✔
6107
                    poJob->args.nOvrXSize = poJob->poDstBand->GetXSize();
3,310✔
6108
                    poJob->args.nOvrYSize = poJob->poDstBand->GetYSize();
3,310✔
6109
                    const char *pszNBITS = poJob->poDstBand->GetMetadataItem(
3,310✔
6110
                        "NBITS", "IMAGE_STRUCTURE");
3,310✔
6111
                    poJob->args.nOvrNBITS = pszNBITS ? atoi(pszNBITS) : 0;
3,310✔
6112
                    poJob->args.dfXRatioDstToSrc = dfXRatioDstToSrc;
3,310✔
6113
                    poJob->args.dfYRatioDstToSrc = dfYRatioDstToSrc;
3,310✔
6114
                    poJob->args.eWrkDataType = eWrkDataType;
3,310✔
6115
                    poJob->pChunk = apaChunk[iBand].get();
3,310✔
6116
                    poJob->args.pabyChunkNodataMask =
3,310✔
6117
                        apabyChunkNoDataMask[iBand].get();
3,310✔
6118
                    poJob->args.nChunkXOff = nChunkXOffQueried;
3,310✔
6119
                    poJob->args.nChunkXSize = nChunkXSizeQueried;
3,310✔
6120
                    poJob->args.nChunkYOff = nChunkYOffQueried;
3,310✔
6121
                    poJob->args.nChunkYSize = nChunkYSizeQueried;
3,310✔
6122
                    poJob->args.nDstXOff = nDstXOff;
3,310✔
6123
                    poJob->args.nDstXOff2 = nDstXOff + nDstXCount;
3,310✔
6124
                    poJob->args.nDstYOff = nDstYOff;
3,310✔
6125
                    poJob->args.nDstYOff2 = nDstYOff + nDstYCount;
3,310✔
6126
                    poJob->args.pszResampling = pszResampling;
3,310✔
6127
                    poJob->args.bHasNoData = abHasNoData[iBand];
3,310✔
6128
                    poJob->args.dfNoDataValue = adfNoDataValue[iBand];
3,310✔
6129
                    poJob->args.eSrcDataType = eDataType;
3,310✔
6130
                    poJob->args.bPropagateNoData = bPropagateNoData;
3,310✔
6131

6132
                    if (poJobQueue)
3,310✔
6133
                    {
6134
                        poJob->oSrcMaskBufferHolder.reset(new PointerHolder(
32✔
6135
                            apabyChunkNoDataMask[iBand].release()));
16✔
6136

6137
                        poJob->oSrcBufferHolder.reset(
32✔
6138
                            new PointerHolder(apaChunk[iBand].release()));
16✔
6139

6140
                        poJobQueue->SubmitJob(JobResampleFunc, poJob.get());
16✔
6141
                        jobList.emplace_back(std::move(poJob));
16✔
6142
                    }
6143
                    else
6144
                    {
6145
                        JobResampleFunc(poJob.get());
3,294✔
6146
                        eErr = poJob->eErr;
3,294✔
6147
                        if (eErr == CE_None)
3,294✔
6148
                        {
6149
                            eErr = WriteJobData(poJob.get());
3,294✔
6150
                        }
6151
                    }
6152
                }
6153
            }
6154
        }
6155

6156
        // Wait for all pending jobs to complete
6157
        while (!jobList.empty())
618✔
6158
        {
6159
            const auto l_eErr = WaitAndFinalizeOldestJob(jobList);
16✔
6160
            if (l_eErr != CE_None && eErr == CE_None)
16✔
6161
                eErr = l_eErr;
×
6162
        }
6163

6164
        // Flush the data to overviews.
6165
        for (int iBand = 0; iBand < nBands; ++iBand)
1,779✔
6166
        {
6167
            if (papapoOverviewBands[iBand][iOverview]->FlushCache(false) !=
1,177✔
6168
                CE_None)
6169
                eErr = CE_Failure;
×
6170
        }
6171
    }
6172

6173
    if (eErr == CE_None)
382✔
6174
        pfnProgress(1.0, nullptr, pProgressData);
378✔
6175

6176
    return eErr;
382✔
6177
}
6178

6179
/************************************************************************/
6180
/*            GDALRegenerateOverviewsMultiBand()                        */
6181
/************************************************************************/
6182

6183
/**
6184
 * \brief Variant of GDALRegenerateOverviews, specially dedicated for generating
6185
 * compressed pixel-interleaved overviews (JPEG-IN-TIFF for example)
6186
 *
6187
 * This function will generate one or more overview images from a base
6188
 * image using the requested downsampling algorithm.  Its primary use
6189
 * is for generating overviews via GDALDataset::BuildOverviews(), but it
6190
 * can also be used to generate downsampled images in one file from another
6191
 * outside the overview architecture.
6192
 *
6193
 * The output bands need to exist in advance and share the same characteristics
6194
 * (type, dimensions)
6195
 *
6196
 * The resampling algorithms supported for the moment are "NEAREST", "AVERAGE",
6197
 * "RMS", "GAUSS", "CUBIC", "CUBICSPLINE", "LANCZOS" and "BILINEAR"
6198
 *
6199
 * It does not support color tables or complex data types.
6200
 *
6201
 * The pseudo-algorithm used by the function is :
6202
 *    for each overview
6203
 *       iterate on lines of the source by a step of deltay
6204
 *           iterate on columns of the source  by a step of deltax
6205
 *               read the source data of size deltax * deltay for all the bands
6206
 *               generate the corresponding overview block for all the bands
6207
 *
6208
 * This function will honour properly NODATA_VALUES tuples (special dataset
6209
 * metadata) so that only a given RGB triplet (in case of a RGB image) will be
6210
 * considered as the nodata value and not each value of the triplet
6211
 * independently per band.
6212
 *
6213
 * The GDAL_NUM_THREADS configuration option can be set
6214
 * to "ALL_CPUS" or a integer value to specify the number of threads to use for
6215
 * overview computation.
6216
 *
6217
 * @param apoSrcBands the list of source bands to downsample
6218
 * @param aapoOverviewBands bidimension array of bands. First dimension is
6219
 *                          indexed by bands. Second dimension is indexed by
6220
 *                          overview levels. All aapoOverviewBands[i] arrays
6221
 *                          must have the same size (i.e. same number of
6222
 *                          overviews)
6223
 * @param pszResampling Resampling algorithm ("NEAREST", "AVERAGE", "RMS",
6224
 * "GAUSS", "CUBIC", "CUBICSPLINE", "LANCZOS" or "BILINEAR").
6225
 * @param pfnProgress progress report function.
6226
 * @param pProgressData progress function callback data.
6227
 * @param papszOptions NULL terminated list of options as
6228
 *                     key=value pairs, or NULL
6229
 *                     The XOFF, YOFF, XSIZE and YSIZE
6230
 *                     options can be specified to express that overviews should
6231
 *                     be regenerated only in the specified subset of the source
6232
 *                     dataset.
6233
 * @return CE_None on success or CE_Failure on failure.
6234
 * @since 3.10
6235
 */
6236

6237
CPLErr GDALRegenerateOverviewsMultiBand(
    const std::vector<GDALRasterBand *> &apoSrcBands,
    const std::vector<std::vector<GDALRasterBand *>> &aapoOverviewBands,
    const char *pszResampling, GDALProgressFunc pfnProgress,
    void *pProgressData, CSLConstList papszOptions)
{
    // Preconditions, only verified when CPLAssert() is active: one list of
    // overviews per source band, and every list has the same length.
    CPLAssert(apoSrcBands.size() == aapoOverviewBands.size());
    for (size_t iBand = 1; iBand < aapoOverviewBands.size(); ++iBand)
    {
        CPLAssert(aapoOverviewBands[iBand].size() ==
                  aapoOverviewBands[0].size());
    }

    // Nothing to regenerate.
    if (aapoOverviewBands.empty())
        return CE_None;

    // The array-based variant works on arrays of band pointers: build a
    // temporary CPLMalloc()'ed copy of each per-band overview list.
    std::vector<GDALRasterBand **> apapoOverviewBands;
    apapoOverviewBands.reserve(aapoOverviewBands.size());
    for (const auto &apoBandOverviews : aapoOverviewBands)
    {
        GDALRasterBand **papoCopy = static_cast<GDALRasterBand **>(
            CPLMalloc(apoBandOverviews.size() * sizeof(GDALRasterBand *)));
        std::copy(apoBandOverviews.begin(), apoBandOverviews.end(), papoCopy);
        apapoOverviewBands.push_back(papoCopy);
    }

    const int nSrcBandCount = static_cast<int>(apoSrcBands.size());
    const int nOverviewCount = static_cast<int>(aapoOverviewBands[0].size());
    const CPLErr eErr = GDALRegenerateOverviewsMultiBand(
        nSrcBandCount, apoSrcBands.data(), nOverviewCount,
        apapoOverviewBands.data(), pszResampling, pfnProgress, pProgressData,
        papszOptions);

    // Release the temporary arrays (the bands themselves are not owned).
    for (GDALRasterBand **papoCopy : apapoOverviewBands)
        CPLFree(papoCopy);

    return eErr;
}
6272

6273
/************************************************************************/
6274
/*                        GDALComputeBandStats()                        */
6275
/************************************************************************/
6276

6277
/** Undocumented
6278
 * @param hSrcBand undocumented.
6279
 * @param nSampleStep Step between scanlines used to compute statistics.
6280
 *                    When nSampleStep is equal to 1, all scanlines will
6281
 *                    be processed.
6282
 * @param pdfMean undocumented.
6283
 * @param pdfStdDev undocumented.
6284
 * @param pfnProgress undocumented.
6285
 * @param pProgressData undocumented.
6286
 * @return undocumented
6287
 */
6288
CPLErr CPL_STDCALL GDALComputeBandStats(GDALRasterBandH hSrcBand,
18✔
6289
                                        int nSampleStep, double *pdfMean,
6290
                                        double *pdfStdDev,
6291
                                        GDALProgressFunc pfnProgress,
6292
                                        void *pProgressData)
6293

6294
{
6295
    VALIDATE_POINTER1(hSrcBand, "GDALComputeBandStats", CE_Failure);
18✔
6296

6297
    GDALRasterBand *poSrcBand = GDALRasterBand::FromHandle(hSrcBand);
18✔
6298

6299
    if (pfnProgress == nullptr)
18✔
6300
        pfnProgress = GDALDummyProgress;
18✔
6301

6302
    const int nWidth = poSrcBand->GetXSize();
18✔
6303
    const int nHeight = poSrcBand->GetYSize();
18✔
6304

6305
    if (nSampleStep >= nHeight || nSampleStep < 1)
18✔
6306
        nSampleStep = 1;
5✔
6307

6308
    GDALDataType eWrkType = GDT_Unknown;
18✔
6309
    float *pafData = nullptr;
18✔
6310
    GDALDataType eType = poSrcBand->GetRasterDataType();
18✔
6311
    const bool bComplex = CPL_TO_BOOL(GDALDataTypeIsComplex(eType));
18✔
6312
    if (bComplex)
18✔
6313
    {
6314
        pafData = static_cast<float *>(
6315
            VSI_MALLOC2_VERBOSE(nWidth, 2 * sizeof(float)));
×
6316
        eWrkType = GDT_CFloat32;
×
6317
    }
6318
    else
6319
    {
6320
        pafData =
6321
            static_cast<float *>(VSI_MALLOC2_VERBOSE(nWidth, sizeof(float)));
18✔
6322
        eWrkType = GDT_Float32;
18✔
6323
    }
6324

6325
    if (nWidth == 0 || pafData == nullptr)
18✔
6326
    {
6327
        VSIFree(pafData);
×
6328
        return CE_Failure;
×
6329
    }
6330

6331
    /* -------------------------------------------------------------------- */
6332
    /*      Loop over all sample lines.                                     */
6333
    /* -------------------------------------------------------------------- */
6334
    double dfSum = 0.0;
18✔
6335
    double dfSum2 = 0.0;
18✔
6336
    int iLine = 0;
18✔
6337
    GIntBig nSamples = 0;
18✔
6338

6339
    do
2,143✔
6340
    {
6341
        if (!pfnProgress(iLine / static_cast<double>(nHeight), nullptr,
2,161✔
6342
                         pProgressData))
6343
        {
6344
            CPLError(CE_Failure, CPLE_UserInterrupt, "User terminated");
×
6345
            CPLFree(pafData);
×
6346
            return CE_Failure;
×
6347
        }
6348

6349
        const CPLErr eErr =
6350
            poSrcBand->RasterIO(GF_Read, 0, iLine, nWidth, 1, pafData, nWidth,
2,161✔
6351
                                1, eWrkType, 0, 0, nullptr);
6352
        if (eErr != CE_None)
2,161✔
6353
        {
6354
            CPLFree(pafData);
1✔
6355
            return eErr;
1✔
6356
        }
6357

6358
        for (int iPixel = 0; iPixel < nWidth; ++iPixel)
725,208✔
6359
        {
6360
            float fValue = 0.0f;
723,048✔
6361

6362
            if (bComplex)
723,048✔
6363
            {
6364
                // Compute the magnitude of the complex value.
6365
                fValue =
6366
                    std::hypot(pafData[static_cast<size_t>(iPixel) * 2],
×
6367
                               pafData[static_cast<size_t>(iPixel) * 2 + 1]);
×
6368
            }
6369
            else
6370
            {
6371
                fValue = pafData[iPixel];
723,048✔
6372
            }
6373

6374
            dfSum += fValue;
723,048✔
6375
            dfSum2 += static_cast<double>(fValue) * fValue;
723,048✔
6376
        }
6377

6378
        nSamples += nWidth;
2,160✔
6379
        iLine += nSampleStep;
2,160✔
6380
    } while (iLine < nHeight);
2,160✔
6381

6382
    if (!pfnProgress(1.0, nullptr, pProgressData))
17✔
6383
    {
6384
        CPLError(CE_Failure, CPLE_UserInterrupt, "User terminated");
×
6385
        CPLFree(pafData);
×
6386
        return CE_Failure;
×
6387
    }
6388

6389
    /* -------------------------------------------------------------------- */
6390
    /*      Produce the result values.                                      */
6391
    /* -------------------------------------------------------------------- */
6392
    if (pdfMean != nullptr)
17✔
6393
        *pdfMean = dfSum / nSamples;
17✔
6394

6395
    if (pdfStdDev != nullptr)
17✔
6396
    {
6397
        const double dfMean = dfSum / nSamples;
17✔
6398

6399
        *pdfStdDev = sqrt((dfSum2 / nSamples) - (dfMean * dfMean));
17✔
6400
    }
6401

6402
    CPLFree(pafData);
17✔
6403

6404
    return CE_None;
17✔
6405
}
6406

6407
/************************************************************************/
6408
/*                  GDALOverviewMagnitudeCorrection()                   */
6409
/*                                                                      */
6410
/*      Correct the mean and standard deviation of the overviews of     */
6411
/*      the given band to match the base layer approximately.           */
6412
/************************************************************************/
6413

6414
/** Undocumented
6415
 * @param hBaseBand undocumented.
6416
 * @param nOverviewCount undocumented.
6417
 * @param pahOverviews undocumented.
6418
 * @param pfnProgress undocumented.
6419
 * @param pProgressData undocumented.
6420
 * @return undocumented
6421
 */
6422
CPLErr GDALOverviewMagnitudeCorrection(GDALRasterBandH hBaseBand,
×
6423
                                       int nOverviewCount,
6424
                                       GDALRasterBandH *pahOverviews,
6425
                                       GDALProgressFunc pfnProgress,
6426
                                       void *pProgressData)
6427

6428
{
6429
    VALIDATE_POINTER1(hBaseBand, "GDALOverviewMagnitudeCorrection", CE_Failure);
×
6430

6431
    /* -------------------------------------------------------------------- */
6432
    /*      Compute mean/stddev for source raster.                          */
6433
    /* -------------------------------------------------------------------- */
6434
    double dfOrigMean = 0.0;
×
6435
    double dfOrigStdDev = 0.0;
×
6436
    {
6437
        const CPLErr eErr =
6438
            GDALComputeBandStats(hBaseBand, 2, &dfOrigMean, &dfOrigStdDev,
×
6439
                                 pfnProgress, pProgressData);
6440

6441
        if (eErr != CE_None)
×
6442
            return eErr;
×
6443
    }
6444

6445
    /* -------------------------------------------------------------------- */
6446
    /*      Loop on overview bands.                                         */
6447
    /* -------------------------------------------------------------------- */
6448
    for (int iOverview = 0; iOverview < nOverviewCount; ++iOverview)
×
6449
    {
6450
        GDALRasterBand *poOverview =
6451
            GDALRasterBand::FromHandle(pahOverviews[iOverview]);
×
6452
        double dfOverviewMean, dfOverviewStdDev;
6453

6454
        const CPLErr eErr =
6455
            GDALComputeBandStats(pahOverviews[iOverview], 1, &dfOverviewMean,
×
6456
                                 &dfOverviewStdDev, pfnProgress, pProgressData);
6457

6458
        if (eErr != CE_None)
×
6459
            return eErr;
×
6460

6461
        double dfGain = 1.0;
×
6462
        if (dfOrigStdDev >= 0.0001)
×
6463
            dfGain = dfOrigStdDev / dfOverviewStdDev;
×
6464

6465
        /* --------------------------------------------------------------------
6466
         */
6467
        /*      Apply gain and offset. */
6468
        /* --------------------------------------------------------------------
6469
         */
6470
        const int nWidth = poOverview->GetXSize();
×
6471
        const int nHeight = poOverview->GetYSize();
×
6472

6473
        GDALDataType eWrkType = GDT_Unknown;
×
6474
        float *pafData = nullptr;
×
6475
        const GDALDataType eType = poOverview->GetRasterDataType();
×
6476
        const bool bComplex = CPL_TO_BOOL(GDALDataTypeIsComplex(eType));
×
6477
        if (bComplex)
×
6478
        {
6479
            pafData = static_cast<float *>(
6480
                VSI_MALLOC2_VERBOSE(nWidth, 2 * sizeof(float)));
×
6481
            eWrkType = GDT_CFloat32;
×
6482
        }
6483
        else
6484
        {
6485
            pafData = static_cast<float *>(
6486
                VSI_MALLOC2_VERBOSE(nWidth, sizeof(float)));
×
6487
            eWrkType = GDT_Float32;
×
6488
        }
6489

6490
        if (pafData == nullptr)
×
6491
        {
6492
            return CE_Failure;
×
6493
        }
6494

6495
        for (int iLine = 0; iLine < nHeight; ++iLine)
×
6496
        {
6497
            if (!pfnProgress(iLine / static_cast<double>(nHeight), nullptr,
×
6498
                             pProgressData))
6499
            {
6500
                CPLError(CE_Failure, CPLE_UserInterrupt, "User terminated");
×
6501
                CPLFree(pafData);
×
6502
                return CE_Failure;
×
6503
            }
6504

6505
            if (poOverview->RasterIO(GF_Read, 0, iLine, nWidth, 1, pafData,
×
6506
                                     nWidth, 1, eWrkType, 0, 0,
6507
                                     nullptr) != CE_None)
×
6508
            {
6509
                CPLFree(pafData);
×
6510
                return CE_Failure;
×
6511
            }
6512

6513
            for (int iPixel = 0; iPixel < nWidth; ++iPixel)
×
6514
            {
6515
                if (bComplex)
×
6516
                {
6517
                    pafData[static_cast<size_t>(iPixel) * 2] *=
×
6518
                        static_cast<float>(dfGain);
×
6519
                    pafData[static_cast<size_t>(iPixel) * 2 + 1] *=
×
6520
                        static_cast<float>(dfGain);
×
6521
                }
6522
                else
6523
                {
6524
                    pafData[iPixel] = static_cast<float>(
×
6525
                        (pafData[iPixel] - dfOverviewMean) * dfGain +
×
6526
                        dfOrigMean);
6527
                }
6528
            }
6529

6530
            if (poOverview->RasterIO(GF_Write, 0, iLine, nWidth, 1, pafData,
×
6531
                                     nWidth, 1, eWrkType, 0, 0,
6532
                                     nullptr) != CE_None)
×
6533
            {
6534
                CPLFree(pafData);
×
6535
                return CE_Failure;
×
6536
            }
6537
        }
6538

6539
        if (!pfnProgress(1.0, nullptr, pProgressData))
×
6540
        {
6541
            CPLError(CE_Failure, CPLE_UserInterrupt, "User terminated");
×
6542
            CPLFree(pafData);
×
6543
            return CE_Failure;
×
6544
        }
6545

6546
        CPLFree(pafData);
×
6547
    }
6548

6549
    return CE_None;
×
6550
}
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2025 Coveralls, Inc