• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

OSGeo / gdal / 12706066811

10 Jan 2025 08:38AM UTC coverage: 70.084% (-2.5%) from 72.549%
12706066811

Pull #11629

github

web-flow
Merge 9418dc48f into 0df468c56
Pull Request #11629: add uv documentation for python package

563296 of 803749 relevant lines covered (70.08%)

223434.74 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

86.89
/gcore/overview.cpp
1

2
/******************************************************************************
3
 *
4
 * Project:  GDAL Core
5
 * Purpose:  Helper code to implement overview support in different drivers.
6
 * Author:   Frank Warmerdam, warmerdam@pobox.com
7
 *
8
 ******************************************************************************
9
 * Copyright (c) 2000, Frank Warmerdam
10
 * Copyright (c) 2007-2010, Even Rouault <even dot rouault at spatialys.com>
11
 *
12
 * SPDX-License-Identifier: MIT
13
 ****************************************************************************/
14

15
#include "cpl_port.h"
16
#include "gdal_priv.h"
17

18
#include <cmath>
19
#include <cstddef>
20
#include <cstdlib>
21

22
#include <algorithm>
23
#include <complex>
24
#include <condition_variable>
25
#include <limits>
26
#include <list>
27
#include <memory>
28
#include <mutex>
29
#include <vector>
30

31
#include "cpl_conv.h"
32
#include "cpl_error.h"
33
#include "cpl_progress.h"
34
#include "cpl_vsi.h"
35
#include "gdal.h"
36
#include "gdal_thread_pool.h"
37
#include "gdalwarper.h"
38

39
#ifdef USE_NEON_OPTIMIZATIONS
40
#include "include_sse2neon.h"
41
#define USE_SSE2
42

43
#include "gdalsse_priv.h"
44

45
// Restrict to 64bit processors because they are guaranteed to have SSE2,
46
// or if __AVX2__ is defined.
47
#elif defined(__x86_64) || defined(_M_X64) || defined(__AVX2__)
48
#define USE_SSE2
49

50
#include "gdalsse_priv.h"
51

52
#ifdef __SSE3__
53
#include <pmmintrin.h>
54
#endif
55
#ifdef __SSSE3__
56
#include <tmmintrin.h>
57
#endif
58
#ifdef __SSE4_1__
59
#include <smmintrin.h>
60
#endif
61
#ifdef __AVX2__
62
#include <immintrin.h>
63
#endif
64

65
#endif
66

67
// To be included after above USE_SSE2 and include gdalsse_priv.h
68
// to avoid build issue on Windows x86
69
#include "gdal_priv_templates.hpp"
70

71
/************************************************************************/
72
/*                      GDALResampleChunk_Near()                        */
73
/************************************************************************/
74

75
template <class T>
76
static CPLErr GDALResampleChunk_NearT(const GDALOverviewResampleArgs &args,
6,053✔
77
                                      const T *pChunk, T **ppDstBuffer)
78

79
{
80
    const double dfXRatioDstToSrc = args.dfXRatioDstToSrc;
6,053✔
81
    const double dfYRatioDstToSrc = args.dfYRatioDstToSrc;
6,053✔
82
    const GDALDataType eWrkDataType = args.eWrkDataType;
6,053✔
83
    const int nChunkXOff = args.nChunkXOff;
6,053✔
84
    const int nChunkXSize = args.nChunkXSize;
6,053✔
85
    const int nChunkYOff = args.nChunkYOff;
6,053✔
86
    const int nDstXOff = args.nDstXOff;
6,053✔
87
    const int nDstXOff2 = args.nDstXOff2;
6,053✔
88
    const int nDstYOff = args.nDstYOff;
6,053✔
89
    const int nDstYOff2 = args.nDstYOff2;
6,053✔
90
    const int nDstXWidth = nDstXOff2 - nDstXOff;
6,053✔
91

92
    /* -------------------------------------------------------------------- */
93
    /*      Allocate buffers.                                               */
94
    /* -------------------------------------------------------------------- */
95
    *ppDstBuffer = static_cast<T *>(
6,053✔
96
        VSI_MALLOC3_VERBOSE(nDstXWidth, nDstYOff2 - nDstYOff,
6,053✔
97
                            GDALGetDataTypeSizeBytes(eWrkDataType)));
98
    if (*ppDstBuffer == nullptr)
6,053✔
99
    {
100
        return CE_Failure;
×
101
    }
102
    T *const pDstBuffer = *ppDstBuffer;
6,053✔
103

104
    int *panSrcXOff =
105
        static_cast<int *>(VSI_MALLOC_VERBOSE(nDstXWidth * sizeof(int)));
6,053✔
106

107
    if (panSrcXOff == nullptr)
6,053✔
108
    {
109
        VSIFree(panSrcXOff);
×
110
        return CE_Failure;
×
111
    }
112

113
    /* ==================================================================== */
114
    /*      Precompute inner loop constants.                                */
115
    /* ==================================================================== */
116
    for (int iDstPixel = nDstXOff; iDstPixel < nDstXOff2; ++iDstPixel)
572,820✔
117
    {
118
        int nSrcXOff = static_cast<int>(0.5 + iDstPixel * dfXRatioDstToSrc);
566,767✔
119
        if (nSrcXOff < nChunkXOff)
566,767✔
120
            nSrcXOff = nChunkXOff;
×
121

122
        panSrcXOff[iDstPixel - nDstXOff] = nSrcXOff;
566,767✔
123
    }
124

125
    /* ==================================================================== */
126
    /*      Loop over destination scanlines.                                */
127
    /* ==================================================================== */
128
    for (int iDstLine = nDstYOff; iDstLine < nDstYOff2; ++iDstLine)
210,831✔
129
    {
130
        int nSrcYOff = static_cast<int>(0.5 + iDstLine * dfYRatioDstToSrc);
204,778✔
131
        if (nSrcYOff < nChunkYOff)
204,778✔
132
            nSrcYOff = nChunkYOff;
×
133

134
        const T *const pSrcScanline =
204,778✔
135
            pChunk +
136
            (static_cast<GPtrDiff_t>(nSrcYOff - nChunkYOff) * nChunkXSize) -
204,778✔
137
            nChunkXOff;
202,308✔
138

139
        /* --------------------------------------------------------------------
140
         */
141
        /*      Loop over destination pixels */
142
        /* --------------------------------------------------------------------
143
         */
144
        T *pDstScanline = pDstBuffer + (iDstLine - nDstYOff) * nDstXWidth;
204,778✔
145
        for (int iDstPixel = 0; iDstPixel < nDstXWidth; ++iDstPixel)
116,303,034✔
146
        {
147
            pDstScanline[iDstPixel] = pSrcScanline[panSrcXOff[iDstPixel]];
116,097,564✔
148
        }
149
    }
150

151
    CPLFree(panSrcXOff);
6,053✔
152

153
    return CE_None;
6,053✔
154
}
155

156
/**
 * Untyped entry point for nearest-neighbour chunk resampling.
 *
 * Reports args.eWrkDataType through peDstBufferDataType, then forwards to the
 * GDALResampleChunk_NearT() instantiation whose element type has the same
 * byte size as the working data type.
 *
 * @return the result of GDALResampleChunk_NearT(), or CE_Failure (after a
 *         CPLAssert) for GDT_Unknown / GDT_TypeCount.
 */
static CPLErr GDALResampleChunk_Near(const GDALOverviewResampleArgs &args,
                                     const void *pChunk, void **ppDstBuffer,
                                     GDALDataType *peDstBufferDataType)
{
    const GDALDataType eType = args.eWrkDataType;
    *peDstBufferDataType = eType;

    // Pixels are copied verbatim, never computed on, so data types are
    // grouped by element size only.
    switch (eType)
    {
        // 1-byte elements
        case GDT_Byte:
        case GDT_Int8:
        {
            CPLAssert(GDALGetDataTypeSizeBytes(eType) == 1);
            const uint8_t *const pSrc = static_cast<const uint8_t *>(pChunk);
            uint8_t **const ppDst = reinterpret_cast<uint8_t **>(ppDstBuffer);
            return GDALResampleChunk_NearT(args, pSrc, ppDst);
        }

        // 2-byte elements
        case GDT_Int16:
        case GDT_UInt16:
        {
            CPLAssert(GDALGetDataTypeSizeBytes(eType) == 2);
            const uint16_t *const pSrc = static_cast<const uint16_t *>(pChunk);
            uint16_t **const ppDst =
                reinterpret_cast<uint16_t **>(ppDstBuffer);
            return GDALResampleChunk_NearT(args, pSrc, ppDst);
        }

        // 4-byte elements
        case GDT_CInt16:
        case GDT_Int32:
        case GDT_UInt32:
        case GDT_Float32:
        {
            CPLAssert(GDALGetDataTypeSizeBytes(eType) == 4);
            const uint32_t *const pSrc = static_cast<const uint32_t *>(pChunk);
            uint32_t **const ppDst =
                reinterpret_cast<uint32_t **>(ppDstBuffer);
            return GDALResampleChunk_NearT(args, pSrc, ppDst);
        }

        // 8-byte elements
        case GDT_CInt32:
        case GDT_CFloat32:
        case GDT_Int64:
        case GDT_UInt64:
        case GDT_Float64:
        {
            CPLAssert(GDALGetDataTypeSizeBytes(eType) == 8);
            const uint64_t *const pSrc = static_cast<const uint64_t *>(pChunk);
            uint64_t **const ppDst =
                reinterpret_cast<uint64_t **>(ppDstBuffer);
            return GDALResampleChunk_NearT(args, pSrc, ppDst);
        }

        // 16-byte elements
        case GDT_CFloat64:
        {
            using Complex64 = std::complex<double>;
            const Complex64 *const pSrc =
                static_cast<const Complex64 *>(pChunk);
            Complex64 **const ppDst =
                reinterpret_cast<Complex64 **>(ppDstBuffer);
            return GDALResampleChunk_NearT(args, pSrc, ppDst);
        }

        // Not actual pixel types
        case GDT_Unknown:
        case GDT_TypeCount:
            break;
    }

    CPLAssert(false);
    return CE_Failure;
}
220

221
namespace
222
{
223

224
// Find in the color table the entry whose RGB value is the closest
225
// (using quadratic distance) to the test color, ignoring transparent entries.
226
int BestColorEntry(const std::vector<GDALColorEntry> &entries,
3,837✔
227
                   const GDALColorEntry &test)
228
{
229
    int nMinDist = std::numeric_limits<int>::max();
3,837✔
230
    size_t bestEntry = 0;
3,837✔
231
    for (size_t i = 0; i < entries.size(); ++i)
986,109✔
232
    {
233
        const GDALColorEntry &entry = entries[i];
982,272✔
234
        // Ignore transparent entries
235
        if (entry.c4 == 0)
982,272✔
236
            continue;
3,237✔
237

238
        int nDist = ((test.c1 - entry.c1) * (test.c1 - entry.c1)) +
979,035✔
239
                    ((test.c2 - entry.c2) * (test.c2 - entry.c2)) +
979,035✔
240
                    ((test.c3 - entry.c3) * (test.c3 - entry.c3));
979,035✔
241
        if (nDist < nMinDist)
979,035✔
242
        {
243
            nMinDist = nDist;
15,847✔
244
            bestEntry = i;
15,847✔
245
        }
246
    }
247
    return static_cast<int>(bestEntry);
3,837✔
248
}
249

250
std::vector<GDALColorEntry> ReadColorTable(const GDALColorTable &table,
7✔
251
                                           int &transparentIdx)
252
{
253
    std::vector<GDALColorEntry> entries(table.GetColorEntryCount());
7✔
254

255
    transparentIdx = -1;
7✔
256
    int i = 0;
7✔
257
    for (auto &entry : entries)
1,799✔
258
    {
259
        table.GetColorEntryAsRGB(i, &entry);
1,792✔
260
        if (transparentIdx < 0 && entry.c4 == 0)
1,792✔
261
            transparentIdx = i;
1✔
262
        ++i;
1,792✔
263
    }
264
    return entries;
7✔
265
}
266

267
}  // unnamed  namespace
268

269
/************************************************************************/
270
/*                             SQUARE()                                 */
271
/************************************************************************/
272

273
// Return val * val. The left operand is converted to Tsquare before the
// multiplication, so a wider Tsquare can be requested to hold the product.
template <class T, class Tsquare = T> inline Tsquare SQUARE(T val)
{
    const Tsquare widened = static_cast<Tsquare>(val);
    return widened * val;
}
277

278
/************************************************************************/
279
/*                          ComputeIntegerRMS()                         */
280
/************************************************************************/
281
// Compute rms = sqrt(sumSquares / weight) in such a way that it is the
282
// integer that minimizes abs(rms**2 - sumSquares / weight)
283
// Integer root-mean-square: returns the integer r, chosen between
// floor(sqrt(sumSquares / weight)) and that value + 1, whose square is
// closest to sumSquares / weight.
//
// T is the integer result type. Twork is the integer type in which
// 2 * r * (r + 1) + 1 is evaluated, so it must be wide enough for that value.
template <class T, class Twork>
inline T ComputeIntegerRMS(double sumSquares, double weight)
{
    const double meanSquare = sumSquares / weight;
    const T flooredRoot = static_cast<T>(std::sqrt(meanSquare));

    // (r+1)^2 is strictly closer to meanSquare than r^2 when
    //   (r+1)^2 - meanSquare < meanSquare - r^2
    // i.e. when 2 * r * (r + 1) + 1 < 2 * meanSquare
    const double roundUpThreshold = static_cast<double>(
        static_cast<Twork>(2) * flooredRoot * (flooredRoot + 1) + 1);

    return roundUpThreshold < 2 * meanSquare
               ? static_cast<T>(flooredRoot + 1)
               : flooredRoot;
}
297

298
template <class T, class Tsum> inline T ComputeIntegerRMS_4values(Tsum)
×
299
{
300
    CPLAssert(false);
×
301
    return 0;
302
}
303

304
// Integer RMS for a weight of 4: returns the value r, chosen between the
// truncated square root and that value + 1, such that r^2 is closest to
// sumSquares / 4.
template <> inline GByte ComputeIntegerRMS_4values<GByte, int>(int sumSquares)
{
    // Per the original author's verification, and given the rounding
    // correction below, sqrt((float)((sumSquares + 1) / 4)) and
    // sqrt((float)sumSquares * 0.25f) give the same result. The integer
    // quotient is preferred since the correction needs it as well.
    const int quarterRounded = (sumSquares + 1) / 4;
    const GByte flooredRoot =
        static_cast<GByte>(std::sqrt(static_cast<float>(quarterRounded)));

    // Generic criterion to prefer r + 1 over r:
    //   weight * (r+1)^2 - sumSquares < sumSquares - weight * r^2
    // which, for integers and weight == 4, reduces to the test below.
    const bool bRoundUp =
        static_cast<int>(flooredRoot) * (flooredRoot + 1) < quarterRounded;

    return bRoundUp ? static_cast<GByte>(flooredRoot + 1) : flooredRoot;
}
321

322
template <>
323
inline GUInt16 ComputeIntegerRMS_4values<GUInt16, double>(double sumSquares)
20✔
324
{
325
    const double sumDivWeight = sumSquares * 0.25;
20✔
326
    GUInt16 rms = static_cast<GUInt16>(std::sqrt(sumDivWeight));
20✔
327

328
    // Is rms**2 or (rms+1)**2 closest to sumSquares / weight ?
329
    // Naive version:
330
    // if( weight * (rms+1)**2 - sumSquares < sumSquares - weight * rms**2 )
331
    // Optimized version for integer case and weight == 4
332
    if (static_cast<GUInt32>(rms) * (rms + 1) <
20✔
333
        static_cast<GUInt32>(sumDivWeight + 0.25))
20✔
334
        rms += 1;
4✔
335
    return rms;
20✔
336
}
337

338
#ifdef USE_SSE2
339

340
/************************************************************************/
341
/*                   QuadraticMeanByteSSE2OrAVX2()                      */
342
/************************************************************************/
343

344
#if defined(__SSE4_1__) || defined(__AVX__) || defined(USE_NEON_OPTIMIZATIONS)
345
#define sse2_packus_epi32 _mm_packus_epi32
346
#else
347
inline __m128i sse2_packus_epi32(__m128i a, __m128i b)
516,119✔
348
{
349
    const auto minus32768_32 = _mm_set1_epi32(-32768);
516,119✔
350
    const auto minus32768_16 = _mm_set1_epi16(-32768);
516,119✔
351
    a = _mm_add_epi32(a, minus32768_32);
516,119✔
352
    b = _mm_add_epi32(b, minus32768_32);
516,119✔
353
    a = _mm_packs_epi32(a, b);
516,119✔
354
    a = _mm_sub_epi16(a, minus32768_16);
516,119✔
355
    return a;
516,119✔
356
}
357
#endif
358

359
#if defined(__SSSE3__) || defined(USE_NEON_OPTIMIZATIONS)
360
#define sse2_hadd_epi16 _mm_hadd_epi16
361
#else
362
inline __m128i sse2_hadd_epi16(__m128i a, __m128i b)
4,660,800✔
363
{
364
    // Horizontal addition of adjacent pairs
365
    const auto mask = _mm_set1_epi32(0xFFFF);
4,660,800✔
366
    const auto horizLo =
367
        _mm_add_epi32(_mm_and_si128(a, mask), _mm_srli_epi32(a, 16));
13,982,400✔
368
    const auto horizHi =
369
        _mm_add_epi32(_mm_and_si128(b, mask), _mm_srli_epi32(b, 16));
13,982,400✔
370

371
    // Recombine low and high parts
372
    return _mm_packs_epi32(horizLo, horizHi);
4,660,800✔
373
}
374
#endif
375

376
// Width-agnostic aliases for the integer/float SIMD intrinsics: each name maps
// to the 256-bit (_mm256_*) intrinsic when __AVX2__ is defined, and to the
// 128-bit (_mm_*) one otherwise. DEST_ELTS is the step, in destination
// pixels, of the vectorized loops written with these aliases.
#ifdef __AVX2__
377

378
// 256-bit flavour
#define DEST_ELTS 16
379
#define set1_epi16 _mm256_set1_epi16
380
#define set1_epi32 _mm256_set1_epi32
381
#define setzero _mm256_setzero_si256
382
#define set1_ps _mm256_set1_ps
383
#define loadu_int(x) _mm256_loadu_si256(reinterpret_cast<__m256i const *>(x))
384
#define unpacklo_epi8 _mm256_unpacklo_epi8
385
#define unpackhi_epi8 _mm256_unpackhi_epi8
386
#define madd_epi16 _mm256_madd_epi16
387
#define add_epi32 _mm256_add_epi32
388
#define mul_ps _mm256_mul_ps
389
#define cvtepi32_ps _mm256_cvtepi32_ps
390
#define sqrt_ps _mm256_sqrt_ps
391
#define cvttps_epi32 _mm256_cvttps_epi32
392
#define packs_epi32 _mm256_packs_epi32
393
#define packus_epi32 _mm256_packus_epi32
394
#define srli_epi32 _mm256_srli_epi32
395
#define mullo_epi16 _mm256_mullo_epi16
396
#define srli_epi16 _mm256_srli_epi16
397
#define cmpgt_epi16 _mm256_cmpgt_epi16
398
#define add_epi16 _mm256_add_epi16
399
#define sub_epi16 _mm256_sub_epi16
400
#define packus_epi16 _mm256_packus_epi16
401
/* AVX2 operates on 2 separate 128-bit lanes, so we have to do shuffling */
402
/* to get the lower 128-bit bits of what would be a true 256-bit vector register
403
 */
// store_lo(x, y): writes 128 bits at x, made of 64-bit elements 0 and 2 of y
// (i.e. the low half of each 128-bit lane).
404
#define store_lo(x, y)                                                         \
405
    _mm_storeu_si128(reinterpret_cast<__m128i *>(x),                           \
406
                     _mm256_extracti128_si256(                                 \
407
                         _mm256_permute4x64_epi64((y), 0 | (2 << 2)), 0))
408
#define hadd_epi16 _mm256_hadd_epi16
409
#define zeroupper() _mm256_zeroupper()
410
#else
411
// 128-bit flavour. packus_epi32 and hadd_epi16 go through the sse2_* wrappers
// rather than directly to an intrinsic.
#define DEST_ELTS 8
412
#define set1_epi16 _mm_set1_epi16
413
#define set1_epi32 _mm_set1_epi32
414
#define setzero _mm_setzero_si128
415
#define set1_ps _mm_set1_ps
416
#define loadu_int(x) _mm_loadu_si128(reinterpret_cast<__m128i const *>(x))
417
#define unpacklo_epi8 _mm_unpacklo_epi8
418
#define unpackhi_epi8 _mm_unpackhi_epi8
419
#define madd_epi16 _mm_madd_epi16
420
#define add_epi32 _mm_add_epi32
421
#define mul_ps _mm_mul_ps
422
#define cvtepi32_ps _mm_cvtepi32_ps
423
#define sqrt_ps _mm_sqrt_ps
424
#define cvttps_epi32 _mm_cvttps_epi32
425
#define packs_epi32 _mm_packs_epi32
426
#define packus_epi32 sse2_packus_epi32
427
#define srli_epi32 _mm_srli_epi32
428
#define mullo_epi16 _mm_mullo_epi16
429
#define srli_epi16 _mm_srli_epi16
430
#define cmpgt_epi16 _mm_cmpgt_epi16
431
#define add_epi16 _mm_add_epi16
432
#define sub_epi16 _mm_sub_epi16
433
#define packus_epi16 _mm_packus_epi16
434
// store_lo(x, y): writes the low 64 bits of y at x.
#define store_lo(x, y) _mm_storel_epi64(reinterpret_cast<__m128i *>(x), (y))
435
#define hadd_epi16 sse2_hadd_epi16
436
// Nothing to clear when no 256-bit register is used.
#define zeroupper() (void)0
437
#endif
438

439
// NOINLINE: expands to __attribute__((noinline)) for GNU-compatible compilers
// building with AVX2, and to nothing otherwise.
#if defined(__GNUC__) && defined(__AVX2__)
440
// Disabling inlining works around a bug with gcc 9.3 (Ubuntu 20.04) in
441
// -O2 -mavx2 mode in QuadraticMeanFloatSSE2(),
442
// where the register that contains minus_zero is correctly
443
// loaded the first time the function is called (looking at the disassembly,
444
// one sees it is loaded much earlier than the function), but gets corrupted
445
// (zeroed) in following iterations.
446
// It appears the bug is due to the explicit zeroupper() call at the end of
447
// the function.
448
// The bug is at least solved in gcc 10.2.
449
// Inlining doesn't bring much here to performance.
450
// This is also needed with gcc 9.3 on QuadraticMeanByteSSE2OrAVX2() in
451
// -O3 -mavx2 mode
452
#define NOINLINE __attribute__((noinline))
453
#else
454
#define NOINLINE
455
#endif
456

457
template <class T>
458
static int NOINLINE
459
QuadraticMeanByteSSE2OrAVX2(int nDstXWidth, int nChunkXSize,
5,385✔
460
                            const T *&CPL_RESTRICT pSrcScanlineShiftedInOut,
461
                            T *CPL_RESTRICT pDstScanline)
462
{
463
    // Optimized implementation for RMS on Byte by
464
    // processing by group of 8 output pixels, so as to use
465
    // a single _mm_sqrt_ps() call for 4 output pixels
466
    const T *CPL_RESTRICT pSrcScanlineShifted = pSrcScanlineShiftedInOut;
5,385✔
467

468
    int iDstPixel = 0;
5,385✔
469
    const auto one16 = set1_epi16(1);
5,385✔
470
    const auto one32 = set1_epi32(1);
5,385✔
471
    const auto zero = setzero();
5,385✔
472
    const auto minus32768 = set1_epi16(-32768);
5,385✔
473

474
    for (; iDstPixel < nDstXWidth - (DEST_ELTS - 1); iDstPixel += DEST_ELTS)
521,496✔
475
    {
476
        // Load 2 * DEST_ELTS bytes from each line
477
        auto firstLine = loadu_int(pSrcScanlineShifted);
516,111✔
478
        auto secondLine = loadu_int(pSrcScanlineShifted + nChunkXSize);
1,032,220✔
479
        // Extend those Bytes as UInt16s
480
        auto firstLineLo = unpacklo_epi8(firstLine, zero);
516,111✔
481
        auto firstLineHi = unpackhi_epi8(firstLine, zero);
516,111✔
482
        auto secondLineLo = unpacklo_epi8(secondLine, zero);
516,111✔
483
        auto secondLineHi = unpackhi_epi8(secondLine, zero);
516,111✔
484

485
        // Multiplication of 16 bit values and horizontal
486
        // addition of 32 bit results
487
        // [ src[2*i+0]^2 + src[2*i+1]^2 for i in range(4) ]
488
        firstLineLo = madd_epi16(firstLineLo, firstLineLo);
516,111✔
489
        firstLineHi = madd_epi16(firstLineHi, firstLineHi);
516,111✔
490
        secondLineLo = madd_epi16(secondLineLo, secondLineLo);
516,111✔
491
        secondLineHi = madd_epi16(secondLineHi, secondLineHi);
516,111✔
492

493
        // Vertical addition
494
        const auto sumSquaresLo = add_epi32(firstLineLo, secondLineLo);
516,111✔
495
        const auto sumSquaresHi = add_epi32(firstLineHi, secondLineHi);
516,111✔
496

497
        const auto sumSquaresPlusOneDiv4Lo =
498
            srli_epi32(add_epi32(sumSquaresLo, one32), 2);
1,032,220✔
499
        const auto sumSquaresPlusOneDiv4Hi =
500
            srli_epi32(add_epi32(sumSquaresHi, one32), 2);
1,032,220✔
501

502
        // Take square root and truncate/floor to int32
503
        const auto rmsLo =
504
            cvttps_epi32(sqrt_ps(cvtepi32_ps(sumSquaresPlusOneDiv4Lo)));
1,548,330✔
505
        const auto rmsHi =
506
            cvttps_epi32(sqrt_ps(cvtepi32_ps(sumSquaresPlusOneDiv4Hi)));
1,548,330✔
507

508
        // Merge back low and high registers with each RMS value
509
        // as a 16 bit value.
510
        auto rms = packs_epi32(rmsLo, rmsHi);
516,111✔
511

512
        // Round to upper value if it minimizes the
513
        // error |rms^2 - sumSquares/4|
514
        // if( 2 * (2 * rms * (rms + 1) + 1) < sumSquares )
515
        //    rms += 1;
516
        // which is equivalent to:
517
        // if( rms * (rms + 1) < (sumSquares+1) / 4 )
518
        //    rms += 1;
519
        // And both left and right parts fit on 16 (unsigned) bits
520
        const auto sumSquaresPlusOneDiv4 =
521
            packus_epi32(sumSquaresPlusOneDiv4Lo, sumSquaresPlusOneDiv4Hi);
516,111✔
522
        // cmpgt_epi16 operates on signed int16, but here
523
        // we have unsigned values, so shift them by -32768 before
524
        auto mask = cmpgt_epi16(
2,580,560✔
525
            add_epi16(sumSquaresPlusOneDiv4, minus32768),
526
            add_epi16(mullo_epi16(rms, add_epi16(rms, one16)), minus32768));
527
        // The value of the mask will be -1 when the correction needs to be
528
        // applied
529
        rms = sub_epi16(rms, mask);
516,111✔
530

531
        // Pack each 16 bit RMS value to 8 bits
532
        rms = packus_epi16(rms, rms /* could be anything */);
516,111✔
533
        store_lo(&pDstScanline[iDstPixel], rms);
516,111✔
534
        pSrcScanlineShifted += 2 * DEST_ELTS;
516,111✔
535
    }
536
    zeroupper();
537

538
    pSrcScanlineShiftedInOut = pSrcScanlineShifted;
5,385✔
539
    return iDstPixel;
5,385✔
540
}
541

542
/************************************************************************/
543
/*                      AverageByteSSE2OrAVX2()                         */
544
/************************************************************************/
545

546
template <class T>
547
static int
548
AverageByteSSE2OrAVX2(int nDstXWidth, int nChunkXSize,
110,996✔
549
                      const T *&CPL_RESTRICT pSrcScanlineShiftedInOut,
550
                      T *CPL_RESTRICT pDstScanline)
551
{
552
    // Optimized implementation for average on Byte by
553
    // processing by group of 8 output pixels.
554

555
    const auto zero = setzero();
110,996✔
556
    const auto two16 = set1_epi16(2);
110,996✔
557
    const T *CPL_RESTRICT pSrcScanlineShifted = pSrcScanlineShiftedInOut;
110,996✔
558

559
    int iDstPixel = 0;
110,996✔
560
    for (; iDstPixel < nDstXWidth - (DEST_ELTS - 1); iDstPixel += DEST_ELTS)
4,771,800✔
561
    {
562
        // Load 2 * DEST_ELTS bytes from each line
563
        const auto firstLine = loadu_int(pSrcScanlineShifted);
4,660,800✔
564
        const auto secondLine = loadu_int(pSrcScanlineShifted + nChunkXSize);
9,321,610✔
565
        // Extend those Bytes as UInt16s
566
        const auto firstLineLo = unpacklo_epi8(firstLine, zero);
4,660,800✔
567
        const auto firstLineHi = unpackhi_epi8(firstLine, zero);
4,660,800✔
568
        const auto secondLineLo = unpacklo_epi8(secondLine, zero);
4,660,800✔
569
        const auto secondLineHi = unpackhi_epi8(secondLine, zero);
4,660,800✔
570

571
        // Vertical addition
572
        const auto sumLo = add_epi16(firstLineLo, secondLineLo);
4,660,800✔
573
        const auto sumHi = add_epi16(firstLineHi, secondLineHi);
4,660,800✔
574

575
        // Horizontal addition of adjacent pairs, and recombine low and high
576
        // parts
577
        const auto sum = hadd_epi16(sumLo, sumHi);
4,660,800✔
578

579
        // average = (sum + 2) / 4
580
        auto average = srli_epi16(add_epi16(sum, two16), 2);
9,321,610✔
581

582
        // Pack each 16 bit average value to 8 bits
583
        average = packus_epi16(average, average /* could be anything */);
4,660,800✔
584
        store_lo(&pDstScanline[iDstPixel], average);
4,660,800✔
585
        pSrcScanlineShifted += 2 * DEST_ELTS;
4,660,800✔
586
    }
587
    zeroupper();
588

589
    pSrcScanlineShiftedInOut = pSrcScanlineShifted;
110,996✔
590
    return iDstPixel;
110,996✔
591
}
592

593
/************************************************************************/
594
/*                     QuadraticMeanUInt16SSE2()                        */
595
/************************************************************************/
596

597
#ifdef __SSE3__
598
#define sse2_hadd_pd _mm_hadd_pd
599
#else
600
inline __m128d sse2_hadd_pd(__m128d a, __m128d b)
8✔
601
{
602
    auto aLo_bLo =
603
        _mm_castps_pd(_mm_movelh_ps(_mm_castpd_ps(a), _mm_castpd_ps(b)));
32✔
604
    auto aHi_bHi =
605
        _mm_castps_pd(_mm_movehl_ps(_mm_castpd_ps(b), _mm_castpd_ps(a)));
32✔
606
    return _mm_add_pd(aLo_bLo, aHi_bHi);  // (aLo + aHi, bLo + bHi)
8✔
607
}
608
#endif
609

610
// Element-wise square of the two doubles held by x.
inline __m128d SQUARE(__m128d x)
{
    const __m128d squared = _mm_mul_pd(x, x);
    return squared;
}
614

615
#ifdef __AVX2__

// Element-wise square of the four doubles held by x.
inline __m256d SQUARE(__m256d x)
{
    const __m256d squared = _mm256_mul_pd(x, x);
    return squared;
}

// Exchange the two middle 64-bit elements of x:
// (x0, x1, x2, x3) becomes (x0, x2, x1, x3).
inline __m256d FIXUP_LANES(__m256d x)
{
    const __m256d reordered =
        _mm256_permute4x64_pd(x, _MM_SHUFFLE(3, 1, 2, 0));
    return reordered;
}

// Same exchange of 64-bit elements, for a register of single precision
// floats (which hence move by pairs).
inline __m256 FIXUP_LANES(__m256 x)
{
    const __m256d asDoubles = _mm256_castps_pd(x);
    return _mm256_castpd_ps(FIXUP_LANES(asDoubles));
}

#endif
633

634
template <class T>
635
static int
636
QuadraticMeanUInt16SSE2(int nDstXWidth, int nChunkXSize,
10✔
637
                        const T *&CPL_RESTRICT pSrcScanlineShiftedInOut,
638
                        T *CPL_RESTRICT pDstScanline)
639
{
640
    // Optimized implementation for RMS on UInt16 by
641
    // processing by group of 4 output pixels.
642
    const T *CPL_RESTRICT pSrcScanlineShifted = pSrcScanlineShiftedInOut;
10✔
643

644
    int iDstPixel = 0;
10✔
645
    const auto zero = _mm_setzero_si128();
10✔
646

647
#ifdef __AVX2__
648
    const auto zeroDot25 = _mm256_set1_pd(0.25);
649
    const auto zeroDot5 = _mm256_set1_pd(0.5);
650

651
    // The first four 0's could be anything, as we only take the bottom
652
    // 128 bits.
653
    const auto permutation = _mm256_set_epi32(0, 0, 0, 0, 6, 4, 2, 0);
654
#else
655
    const auto zeroDot25 = _mm_set1_pd(0.25);
10✔
656
    const auto zeroDot5 = _mm_set1_pd(0.5);
10✔
657
#endif
658

659
    for (; iDstPixel < nDstXWidth - 3; iDstPixel += 4)
40✔
660
    {
661
        // Load 8 UInt16 from each line
662
        const auto firstLine = _mm_loadu_si128(
30✔
663
            reinterpret_cast<__m128i const *>(pSrcScanlineShifted));
664
        const auto secondLine =
665
            _mm_loadu_si128(reinterpret_cast<__m128i const *>(
30✔
666
                pSrcScanlineShifted + nChunkXSize));
30✔
667

668
        // Detect if all of the source values fit in 14 bits.
669
        // because if x < 2^14, then 4 * x^2 < 2^30 which fits in a signed int32
670
        // and we can do a much faster implementation.
671
        const auto maskTmp =
672
            _mm_srli_epi16(_mm_or_si128(firstLine, secondLine), 14);
60✔
673
#if defined(__i386__) || defined(_M_IX86)
674
        uint64_t nMaskFitsIn14Bits = 0;
675
        _mm_storel_epi64(
676
            reinterpret_cast<__m128i *>(&nMaskFitsIn14Bits),
677
            _mm_packus_epi16(maskTmp, maskTmp /* could be anything */));
678
#else
679
        const auto nMaskFitsIn14Bits = _mm_cvtsi128_si64(
30✔
680
            _mm_packus_epi16(maskTmp, maskTmp /* could be anything */));
681
#endif
682
        if (nMaskFitsIn14Bits == 0)
30✔
683
        {
684
            // Multiplication of 16 bit values and horizontal
685
            // addition of 32 bit results
686
            const auto firstLineHSumSquare =
687
                _mm_madd_epi16(firstLine, firstLine);
26✔
688
            const auto secondLineHSumSquare =
689
                _mm_madd_epi16(secondLine, secondLine);
26✔
690
            // Vertical addition
691
            const auto sumSquares =
692
                _mm_add_epi32(firstLineHSumSquare, secondLineHSumSquare);
26✔
693
            // In theory we should take sqrt(sumSquares * 0.25f)
694
            // but given the rounding we do, this is equivalent to
695
            // sqrt((sumSquares + 1)/4). This has been verified exhaustively for
696
            // sumSquares <= 4 * 16383^2
697
            const auto one32 = _mm_set1_epi32(1);
26✔
698
            const auto sumSquaresPlusOneDiv4 =
699
                _mm_srli_epi32(_mm_add_epi32(sumSquares, one32), 2);
52✔
700
            // Take square root and truncate/floor to int32
701
            auto rms = _mm_cvttps_epi32(
78✔
702
                _mm_sqrt_ps(_mm_cvtepi32_ps(sumSquaresPlusOneDiv4)));
703

704
            // Round to upper value if it minimizes the
705
            // error |rms^2 - sumSquares/4|
706
            // if( 2 * (2 * rms * (rms + 1) + 1) < sumSquares )
707
            //    rms += 1;
708
            // which is equivalent to:
709
            // if( rms * rms + rms < (sumSquares+1) / 4 )
710
            //    rms += 1;
711
            auto mask =
712
                _mm_cmpgt_epi32(sumSquaresPlusOneDiv4,
78✔
713
                                _mm_add_epi32(_mm_madd_epi16(rms, rms), rms));
714
            rms = _mm_sub_epi32(rms, mask);
26✔
715
            // Pack each 32 bit RMS value to 16 bits
716
            rms = _mm_packs_epi32(rms, rms /* could be anything */);
26✔
717
            _mm_storel_epi64(
718
                reinterpret_cast<__m128i *>(&pDstScanline[iDstPixel]), rms);
26✔
719
            pSrcScanlineShifted += 8;
26✔
720
            continue;
26✔
721
        }
722

723
        // An approach using _mm_mullo_epi16, _mm_mulhi_epu16 before extending
724
        // to 32 bit would result in 4 multiplications instead of 8, but
725
        // mullo/mulhi have a worse throughput than mul_pd.
726

727
        // Extend those UInt16s as UInt32s
728
        const auto firstLineLo = _mm_unpacklo_epi16(firstLine, zero);
4✔
729
        const auto firstLineHi = _mm_unpackhi_epi16(firstLine, zero);
4✔
730
        const auto secondLineLo = _mm_unpacklo_epi16(secondLine, zero);
4✔
731
        const auto secondLineHi = _mm_unpackhi_epi16(secondLine, zero);
4✔
732

733
#ifdef __AVX2__
734
        // Multiplication of 32 bit values previously converted to 64 bit double
735
        const auto firstLineLoDbl = SQUARE(_mm256_cvtepi32_pd(firstLineLo));
736
        const auto firstLineHiDbl = SQUARE(_mm256_cvtepi32_pd(firstLineHi));
737
        const auto secondLineLoDbl = SQUARE(_mm256_cvtepi32_pd(secondLineLo));
738
        const auto secondLineHiDbl = SQUARE(_mm256_cvtepi32_pd(secondLineHi));
739

740
        // Vertical addition of squares
741
        const auto sumSquaresLo =
742
            _mm256_add_pd(firstLineLoDbl, secondLineLoDbl);
743
        const auto sumSquaresHi =
744
            _mm256_add_pd(firstLineHiDbl, secondLineHiDbl);
745

746
        // Horizontal addition of squares
747
        const auto sumSquares =
748
            FIXUP_LANES(_mm256_hadd_pd(sumSquaresLo, sumSquaresHi));
749

750
        const auto sumDivWeight = _mm256_mul_pd(sumSquares, zeroDot25);
751

752
        // Take square root and truncate/floor to int32
753
        auto rms = _mm256_cvttpd_epi32(_mm256_sqrt_pd(sumDivWeight));
754
        const auto rmsDouble = _mm256_cvtepi32_pd(rms);
755
        const auto right = _mm256_sub_pd(
756
            sumDivWeight, _mm256_add_pd(SQUARE(rmsDouble), rmsDouble));
757

758
        auto mask =
759
            _mm256_castpd_ps(_mm256_cmp_pd(zeroDot5, right, _CMP_LT_OS));
760
        // Extract 32-bit from each of the 4 64-bit masks
761
        // mask = FIXUP_LANES(_mm256_shuffle_ps(mask, mask,
762
        // _MM_SHUFFLE(2,0,2,0)));
763
        mask = _mm256_permutevar8x32_ps(mask, permutation);
764
        const auto maskI = _mm_castps_si128(_mm256_extractf128_ps(mask, 0));
765

766
        // Apply the correction
767
        rms = _mm_sub_epi32(rms, maskI);
768

769
        // Pack each 32 bit RMS value to 16 bits
770
        rms = _mm_packus_epi32(rms, rms /* could be anything */);
771
#else
772
        // Multiplication of 32 bit values previously converted to 64 bit double
773
        const auto firstLineLoLo = SQUARE(_mm_cvtepi32_pd(firstLineLo));
4✔
774
        const auto firstLineLoHi =
775
            SQUARE(_mm_cvtepi32_pd(_mm_srli_si128(firstLineLo, 8)));
8✔
776
        const auto firstLineHiLo = SQUARE(_mm_cvtepi32_pd(firstLineHi));
4✔
777
        const auto firstLineHiHi =
778
            SQUARE(_mm_cvtepi32_pd(_mm_srli_si128(firstLineHi, 8)));
8✔
779

780
        const auto secondLineLoLo = SQUARE(_mm_cvtepi32_pd(secondLineLo));
4✔
781
        const auto secondLineLoHi =
782
            SQUARE(_mm_cvtepi32_pd(_mm_srli_si128(secondLineLo, 8)));
8✔
783
        const auto secondLineHiLo = SQUARE(_mm_cvtepi32_pd(secondLineHi));
4✔
784
        const auto secondLineHiHi =
785
            SQUARE(_mm_cvtepi32_pd(_mm_srli_si128(secondLineHi, 8)));
8✔
786

787
        // Vertical addition of squares
788
        const auto sumSquaresLoLo = _mm_add_pd(firstLineLoLo, secondLineLoLo);
4✔
789
        const auto sumSquaresLoHi = _mm_add_pd(firstLineLoHi, secondLineLoHi);
4✔
790
        const auto sumSquaresHiLo = _mm_add_pd(firstLineHiLo, secondLineHiLo);
4✔
791
        const auto sumSquaresHiHi = _mm_add_pd(firstLineHiHi, secondLineHiHi);
4✔
792

793
        // Horizontal addition of squares
794
        const auto sumSquaresLo = sse2_hadd_pd(sumSquaresLoLo, sumSquaresLoHi);
4✔
795
        const auto sumSquaresHi = sse2_hadd_pd(sumSquaresHiLo, sumSquaresHiHi);
4✔
796

797
        const auto sumDivWeightLo = _mm_mul_pd(sumSquaresLo, zeroDot25);
4✔
798
        const auto sumDivWeightHi = _mm_mul_pd(sumSquaresHi, zeroDot25);
4✔
799
        // Take square root and truncate/floor to int32
800
        const auto rmsLo = _mm_cvttpd_epi32(_mm_sqrt_pd(sumDivWeightLo));
8✔
801
        const auto rmsHi = _mm_cvttpd_epi32(_mm_sqrt_pd(sumDivWeightHi));
8✔
802

803
        // Correctly round rms to minimize | rms^2 - sumSquares / 4 |
804
        // if( 0.5 < sumDivWeight - (rms * rms + rms) )
805
        //     rms += 1;
806
        const auto rmsLoDouble = _mm_cvtepi32_pd(rmsLo);
4✔
807
        const auto rmsHiDouble = _mm_cvtepi32_pd(rmsHi);
4✔
808
        const auto rightLo = _mm_sub_pd(
8✔
809
            sumDivWeightLo, _mm_add_pd(SQUARE(rmsLoDouble), rmsLoDouble));
810
        const auto rightHi = _mm_sub_pd(
12✔
811
            sumDivWeightHi, _mm_add_pd(SQUARE(rmsHiDouble), rmsHiDouble));
812

813
        const auto maskLo = _mm_castpd_ps(_mm_cmplt_pd(zeroDot5, rightLo));
8✔
814
        const auto maskHi = _mm_castpd_ps(_mm_cmplt_pd(zeroDot5, rightHi));
4✔
815
        // The value of the mask will be -1 when the correction needs to be
816
        // applied
817
        const auto mask = _mm_castps_si128(_mm_shuffle_ps(
8✔
818
            maskLo, maskHi, (0 << 0) | (2 << 2) | (0 << 4) | (2 << 6)));
819

820
        auto rms = _mm_castps_si128(
16✔
821
            _mm_movelh_ps(_mm_castsi128_ps(rmsLo), _mm_castsi128_ps(rmsHi)));
822
        // Apply the correction
823
        rms = _mm_sub_epi32(rms, mask);
4✔
824

825
        // Pack each 32 bit RMS value to 16 bits
826
        rms = sse2_packus_epi32(rms, rms /* could be anything */);
4✔
827
#endif
828

829
        _mm_storel_epi64(reinterpret_cast<__m128i *>(&pDstScanline[iDstPixel]),
4✔
830
                         rms);
831
        pSrcScanlineShifted += 8;
4✔
832
    }
833

834
    zeroupper();
835

836
    pSrcScanlineShiftedInOut = pSrcScanlineShifted;
10✔
837
    return iDstPixel;
10✔
838
}
839

840
/************************************************************************/
841
/*                         AverageUInt16SSE2()                          */
842
/************************************************************************/
843

844
template <class T>
845
static int AverageUInt16SSE2(int nDstXWidth, int nChunkXSize,
9✔
846
                             const T *&CPL_RESTRICT pSrcScanlineShiftedInOut,
847
                             T *CPL_RESTRICT pDstScanline)
848
{
849
    // Optimized implementation for average on UInt16 by
850
    // processing by group of 8 output pixels.
851

852
    const auto mask = _mm_set1_epi32(0xFFFF);
9✔
853
    const auto two = _mm_set1_epi32(2);
9✔
854
    const T *CPL_RESTRICT pSrcScanlineShifted = pSrcScanlineShiftedInOut;
9✔
855

856
    int iDstPixel = 0;
9✔
857
    for (; iDstPixel < nDstXWidth - 7; iDstPixel += 8)
13✔
858
    {
859
        __m128i averageLow;
860
        // Load 8 UInt16 from each line
861
        {
862
            const auto firstLine = _mm_loadu_si128(
4✔
863
                reinterpret_cast<__m128i const *>(pSrcScanlineShifted));
864
            const auto secondLine =
865
                _mm_loadu_si128(reinterpret_cast<__m128i const *>(
4✔
866
                    pSrcScanlineShifted + nChunkXSize));
4✔
867

868
            // Horizontal addition and extension to 32 bit
869
            const auto horizAddFirstLine = _mm_add_epi32(
12✔
870
                _mm_and_si128(firstLine, mask), _mm_srli_epi32(firstLine, 16));
871
            const auto horizAddSecondLine =
872
                _mm_add_epi32(_mm_and_si128(secondLine, mask),
12✔
873
                              _mm_srli_epi32(secondLine, 16));
874

875
            // Vertical addition and average computation
876
            // average = (sum + 2) >> 2
877
            const auto sum = _mm_add_epi32(
8✔
878
                _mm_add_epi32(horizAddFirstLine, horizAddSecondLine), two);
879
            averageLow = _mm_srli_epi32(sum, 2);
4✔
880
        }
881
        // Load 8 UInt16 from each line
882
        __m128i averageHigh;
883
        {
884
            const auto firstLine = _mm_loadu_si128(
4✔
885
                reinterpret_cast<__m128i const *>(pSrcScanlineShifted + 8));
4✔
886
            const auto secondLine =
887
                _mm_loadu_si128(reinterpret_cast<__m128i const *>(
4✔
888
                    pSrcScanlineShifted + 8 + nChunkXSize));
4✔
889

890
            // Horizontal addition and extension to 32 bit
891
            const auto horizAddFirstLine = _mm_add_epi32(
12✔
892
                _mm_and_si128(firstLine, mask), _mm_srli_epi32(firstLine, 16));
893
            const auto horizAddSecondLine =
894
                _mm_add_epi32(_mm_and_si128(secondLine, mask),
12✔
895
                              _mm_srli_epi32(secondLine, 16));
896

897
            // Vertical addition and average computation
898
            // average = (sum + 2) >> 2
899
            const auto sum = _mm_add_epi32(
8✔
900
                _mm_add_epi32(horizAddFirstLine, horizAddSecondLine), two);
901
            averageHigh = _mm_srli_epi32(sum, 2);
4✔
902
        }
903

904
        // Pack each 32 bit average value to 16 bits
905
        auto average = sse2_packus_epi32(averageLow, averageHigh);
4✔
906
        _mm_storeu_si128(reinterpret_cast<__m128i *>(&pDstScanline[iDstPixel]),
4✔
907
                         average);
908
        pSrcScanlineShifted += 16;
4✔
909
    }
910

911
    pSrcScanlineShiftedInOut = pSrcScanlineShifted;
9✔
912
    return iDstPixel;
9✔
913
}
914

915
/************************************************************************/
916
/*                      QuadraticMeanFloatSSE2()                        */
917
/************************************************************************/
918

919
#ifdef __AVX2__
920
#define RMS_FLOAT_ELTS 8
921
#define set1_ps _mm256_set1_ps
922
#define loadu_ps _mm256_loadu_ps
923
#define andnot_ps _mm256_andnot_ps
924
#define and_ps _mm256_and_ps
925
#define max_ps _mm256_max_ps
926
#define shuffle_ps _mm256_shuffle_ps
927
#define div_ps _mm256_div_ps
928
#define cmpeq_ps(x, y) _mm256_cmp_ps(x, y, _CMP_EQ_OQ)
929
#define mul_ps _mm256_mul_ps
930
#define add_ps _mm256_add_ps
931
#define hadd_ps _mm256_hadd_ps
932
#define sqrt_ps _mm256_sqrt_ps
933
#define or_ps _mm256_or_ps
934
#define unpacklo_ps _mm256_unpacklo_ps
935
#define unpackhi_ps _mm256_unpackhi_ps
936
#define storeu_ps _mm256_storeu_ps
937

938
inline __m256 SQUARE(__m256 x)
939
{
940
    return _mm256_mul_ps(x, x);
941
}
942

943
#else
944

945
#ifdef __SSE3__
946
#define sse2_hadd_ps _mm_hadd_ps
947
#else
948
inline __m128 sse2_hadd_ps(__m128 a, __m128 b)
949
{
950
    auto aEven_bEven = _mm_shuffle_ps(a, b, _MM_SHUFFLE(2, 0, 2, 0));
951
    auto aOdd_bOdd = _mm_shuffle_ps(a, b, _MM_SHUFFLE(3, 1, 3, 1));
952
    return _mm_add_ps(aEven_bEven, aOdd_bOdd);  // (aEven + aOdd, bEven + bOdd)
953
}
954
#endif
955

956
#define RMS_FLOAT_ELTS 4
957
#define set1_ps _mm_set1_ps
958
#define loadu_ps _mm_loadu_ps
959
#define andnot_ps _mm_andnot_ps
960
#define and_ps _mm_and_ps
961
#define max_ps _mm_max_ps
962
#define shuffle_ps _mm_shuffle_ps
963
#define div_ps _mm_div_ps
964
#define cmpeq_ps _mm_cmpeq_ps
965
#define mul_ps _mm_mul_ps
966
#define add_ps _mm_add_ps
967
#define hadd_ps sse2_hadd_ps
968
#define sqrt_ps _mm_sqrt_ps
969
#define or_ps _mm_or_ps
970
#define unpacklo_ps _mm_unpacklo_ps
971
#define unpackhi_ps _mm_unpackhi_ps
972
#define storeu_ps _mm_storeu_ps
973

974
inline __m128 SQUARE(__m128 x)
272✔
975
{
976
    return _mm_mul_ps(x, x);
272✔
977
}
978

979
inline __m128 FIXUP_LANES(__m128 x)
68✔
980
{
981
    return x;
68✔
982
}
983

984
#endif
985

986
template <class T>
987
static int NOINLINE
988
QuadraticMeanFloatSSE2(int nDstXWidth, int nChunkXSize,
34✔
989
                       const T *&CPL_RESTRICT pSrcScanlineShiftedInOut,
990
                       T *CPL_RESTRICT pDstScanline)
991
{
992
    // Optimized implementation for RMS on Float32 by
993
    // processing by group of RMS_FLOAT_ELTS output pixels.
994
    const T *CPL_RESTRICT pSrcScanlineShifted = pSrcScanlineShiftedInOut;
34✔
995

996
    int iDstPixel = 0;
34✔
997
    const auto minus_zero = set1_ps(-0.0f);
34✔
998
    const auto zeroDot25 = set1_ps(0.25f);
34✔
999
    const auto one = set1_ps(1.0f);
34✔
1000
    const auto infv = set1_ps(std::numeric_limits<float>::infinity());
68✔
1001

1002
    for (; iDstPixel < nDstXWidth - (RMS_FLOAT_ELTS - 1);
102✔
1003
         iDstPixel += RMS_FLOAT_ELTS)
1004
    {
1005
        // Load 2*RMS_FLOAT_ELTS Float32 from each line
1006
        auto firstLineLo =
1007
            loadu_ps(reinterpret_cast<float const *>(pSrcScanlineShifted));
68✔
1008
        auto firstLineHi = loadu_ps(reinterpret_cast<float const *>(
68✔
1009
            pSrcScanlineShifted + RMS_FLOAT_ELTS));
68✔
1010
        auto secondLineLo = loadu_ps(
68✔
1011
            reinterpret_cast<float const *>(pSrcScanlineShifted + nChunkXSize));
68✔
1012
        auto secondLineHi = loadu_ps(reinterpret_cast<float const *>(
68✔
1013
            pSrcScanlineShifted + RMS_FLOAT_ELTS + nChunkXSize));
68✔
1014

1015
        // Take the absolute value
1016
        firstLineLo = andnot_ps(minus_zero, firstLineLo);
68✔
1017
        firstLineHi = andnot_ps(minus_zero, firstLineHi);
68✔
1018
        secondLineLo = andnot_ps(minus_zero, secondLineLo);
68✔
1019
        secondLineHi = andnot_ps(minus_zero, secondLineHi);
68✔
1020

1021
        auto firstLineEven =
1022
            shuffle_ps(firstLineLo, firstLineHi, _MM_SHUFFLE(2, 0, 2, 0));
68✔
1023
        auto firstLineOdd =
1024
            shuffle_ps(firstLineLo, firstLineHi, _MM_SHUFFLE(3, 1, 3, 1));
68✔
1025
        auto secondLineEven =
1026
            shuffle_ps(secondLineLo, secondLineHi, _MM_SHUFFLE(2, 0, 2, 0));
68✔
1027
        auto secondLineOdd =
1028
            shuffle_ps(secondLineLo, secondLineHi, _MM_SHUFFLE(3, 1, 3, 1));
68✔
1029

1030
        // Compute the maximum of each RMS_FLOAT_ELTS value to RMS-average
1031
        const auto maxV = max_ps(max_ps(firstLineEven, firstLineOdd),
204✔
1032
                                 max_ps(secondLineEven, secondLineEven));
1033

1034
        // Normalize each value by the maximum of the RMS_FLOAT_ELTS ones.
1035
        // This step is important to avoid that the square evaluates to infinity
1036
        // for sufficiently big input.
1037
        auto invMax = div_ps(one, maxV);
68✔
1038
        // Deal with 0 being the maximum to correct division by zero
1039
        // note: comparing to -0 leads to identical results as to comparing with
1040
        // 0
1041
        invMax = andnot_ps(cmpeq_ps(maxV, minus_zero), invMax);
136✔
1042

1043
        firstLineEven = mul_ps(firstLineEven, invMax);
68✔
1044
        firstLineOdd = mul_ps(firstLineOdd, invMax);
68✔
1045
        secondLineEven = mul_ps(secondLineEven, invMax);
68✔
1046
        secondLineOdd = mul_ps(secondLineOdd, invMax);
68✔
1047

1048
        // Compute squares
1049
        firstLineEven = SQUARE(firstLineEven);
68✔
1050
        firstLineOdd = SQUARE(firstLineOdd);
68✔
1051
        secondLineEven = SQUARE(secondLineEven);
68✔
1052
        secondLineOdd = SQUARE(secondLineOdd);
68✔
1053

1054
        const auto sumSquares = add_ps(add_ps(firstLineEven, firstLineOdd),
204✔
1055
                                       add_ps(secondLineEven, secondLineOdd));
1056

1057
        auto rms = mul_ps(maxV, sqrt_ps(mul_ps(sumSquares, zeroDot25)));
204✔
1058

1059
        // Deal with infinity being the maximum
1060
        const auto maskIsInf = cmpeq_ps(maxV, infv);
68✔
1061
        rms = or_ps(andnot_ps(maskIsInf, rms), and_ps(maskIsInf, infv));
136✔
1062

1063
        rms = FIXUP_LANES(rms);
68✔
1064

1065
        // coverity[incompatible_cast]
1066
        storeu_ps(reinterpret_cast<float *>(&pDstScanline[iDstPixel]), rms);
68✔
1067
        pSrcScanlineShifted += RMS_FLOAT_ELTS * 2;
68✔
1068
    }
1069

1070
    zeroupper();
1071

1072
    pSrcScanlineShiftedInOut = pSrcScanlineShifted;
34✔
1073
    return iDstPixel;
34✔
1074
}
1075

1076
/************************************************************************/
1077
/*                        AverageFloatSSE2()                            */
1078
/************************************************************************/
1079

1080
template <class T>
1081
static int AverageFloatSSE2(int nDstXWidth, int nChunkXSize,
14✔
1082
                            const T *&CPL_RESTRICT pSrcScanlineShiftedInOut,
1083
                            T *CPL_RESTRICT pDstScanline)
1084
{
1085
    // Optimized implementation for average on Float32 by
1086
    // processing by group of 4 output pixels.
1087
    const T *CPL_RESTRICT pSrcScanlineShifted = pSrcScanlineShiftedInOut;
14✔
1088

1089
    int iDstPixel = 0;
14✔
1090
    const auto zeroDot25 = _mm_set1_ps(0.25f);
14✔
1091

1092
    for (; iDstPixel < nDstXWidth - 3; iDstPixel += 4)
32✔
1093
    {
1094
        // Load 8 Float32 from each line
1095
        const auto firstLineLo =
1096
            _mm_loadu_ps(reinterpret_cast<float const *>(pSrcScanlineShifted));
18✔
1097
        const auto firstLineHi = _mm_loadu_ps(
18✔
1098
            reinterpret_cast<float const *>(pSrcScanlineShifted + 4));
18✔
1099
        const auto secondLineLo = _mm_loadu_ps(
18✔
1100
            reinterpret_cast<float const *>(pSrcScanlineShifted + nChunkXSize));
18✔
1101
        const auto secondLineHi = _mm_loadu_ps(reinterpret_cast<float const *>(
18✔
1102
            pSrcScanlineShifted + 4 + nChunkXSize));
18✔
1103

1104
        // Vertical addition
1105
        const auto sumLo = _mm_add_ps(firstLineLo, secondLineLo);
18✔
1106
        const auto sumHi = _mm_add_ps(firstLineHi, secondLineHi);
18✔
1107

1108
        // Horizontal addition
1109
        const auto A =
1110
            _mm_shuffle_ps(sumLo, sumHi, 0 | (2 << 2) | (0 << 4) | (2 << 6));
18✔
1111
        const auto B =
1112
            _mm_shuffle_ps(sumLo, sumHi, 1 | (3 << 2) | (1 << 4) | (3 << 6));
18✔
1113
        const auto sum = _mm_add_ps(A, B);
18✔
1114

1115
        const auto average = _mm_mul_ps(sum, zeroDot25);
18✔
1116

1117
        // coverity[incompatible_cast]
1118
        _mm_storeu_ps(reinterpret_cast<float *>(&pDstScanline[iDstPixel]),
18✔
1119
                      average);
1120
        pSrcScanlineShifted += 8;
18✔
1121
    }
1122

1123
    pSrcScanlineShiftedInOut = pSrcScanlineShifted;
14✔
1124
    return iDstPixel;
14✔
1125
}
1126

1127
#endif
1128

1129
/************************************************************************/
1130
/*                    GDALResampleChunk_AverageOrRMS()                  */
1131
/************************************************************************/
1132

1133
template <class T, class Tsum, GDALDataType eWrkDataType>
1134
static CPLErr
1135
GDALResampleChunk_AverageOrRMS_T(const GDALOverviewResampleArgs &args,
10,393✔
1136
                                 const T *pChunk, void **ppDstBuffer)
1137
{
1138
    const double dfXRatioDstToSrc = args.dfXRatioDstToSrc;
10,393✔
1139
    const double dfYRatioDstToSrc = args.dfYRatioDstToSrc;
10,393✔
1140
    const double dfSrcXDelta = args.dfSrcXDelta;
10,393✔
1141
    const double dfSrcYDelta = args.dfSrcYDelta;
10,393✔
1142
    const GByte *pabyChunkNodataMask = args.pabyChunkNodataMask;
10,393✔
1143
    const int nChunkXOff = args.nChunkXOff;
10,393✔
1144
    const int nChunkYOff = args.nChunkYOff;
10,393✔
1145
    const int nChunkXSize = args.nChunkXSize;
10,393✔
1146
    const int nChunkYSize = args.nChunkYSize;
10,393✔
1147
    const int nDstXOff = args.nDstXOff;
10,393✔
1148
    const int nDstXOff2 = args.nDstXOff2;
10,393✔
1149
    const int nDstYOff = args.nDstYOff;
10,393✔
1150
    const int nDstYOff2 = args.nDstYOff2;
10,393✔
1151
    const char *pszResampling = args.pszResampling;
10,393✔
1152
    bool bHasNoData = args.bHasNoData;
10,393✔
1153
    const double dfNoDataValue = args.dfNoDataValue;
10,393✔
1154
    const GDALColorTable *poColorTable = args.poColorTable;
10,393✔
1155
    const bool bPropagateNoData = args.bPropagateNoData;
10,393✔
1156

1157
    // AVERAGE_BIT2GRAYSCALE
1158
    const bool bBit2Grayscale =
1159
        CPL_TO_BOOL(STARTS_WITH_CI(pszResampling, "AVERAGE_BIT2G"));
10,393✔
1160
    const bool bQuadraticMean = CPL_TO_BOOL(EQUAL(pszResampling, "RMS"));
10,395✔
1161
    if (bBit2Grayscale)
10,397✔
1162
        poColorTable = nullptr;
9✔
1163

1164
    T tNoDataValue;
1165
    if (!bHasNoData)
10,397✔
1166
        tNoDataValue = 0;
10,346✔
1167
    else
1168
        tNoDataValue = static_cast<T>(dfNoDataValue);
51✔
1169
    const T tReplacementVal =
10,397✔
1170
        bHasNoData ? static_cast<T>(GDALGetNoDataReplacementValue(
107✔
1171
                         args.eOvrDataType, dfNoDataValue))
51✔
1172
                   : 0;
1173

1174
    int nChunkRightXOff = nChunkXOff + nChunkXSize;
10,397✔
1175
    int nChunkBottomYOff = nChunkYOff + nChunkYSize;
10,397✔
1176
    int nDstXWidth = nDstXOff2 - nDstXOff;
10,397✔
1177

1178
    /* -------------------------------------------------------------------- */
1179
    /*      Allocate buffers.                                               */
1180
    /* -------------------------------------------------------------------- */
1181
    *ppDstBuffer = static_cast<T *>(
10,397✔
1182
        VSI_MALLOC3_VERBOSE(nDstXWidth, nDstYOff2 - nDstYOff,
10,397✔
1183
                            GDALGetDataTypeSizeBytes(eWrkDataType)));
1184
    if (*ppDstBuffer == nullptr)
10,397✔
1185
    {
1186
        return CE_Failure;
×
1187
    }
1188
    T *const pDstBuffer = static_cast<T *>(*ppDstBuffer);
10,397✔
1189

1190
    struct PrecomputedXValue
1191
    {
1192
        int nLeftXOffShifted;
1193
        int nRightXOffShifted;
1194
        double dfLeftWeight;
1195
        double dfRightWeight;
1196
        double dfTotalWeightFullLine;
1197
    };
1198

1199
    PrecomputedXValue *pasSrcX = static_cast<PrecomputedXValue *>(
1200
        VSI_MALLOC_VERBOSE(nDstXWidth * sizeof(PrecomputedXValue)));
10,397✔
1201

1202
    if (pasSrcX == nullptr)
10,396✔
1203
    {
1204
        VSIFree(pasSrcX);
×
1205
        return CE_Failure;
×
1206
    }
1207

1208
    int nTransparentIdx = -1;
10,396✔
1209
    std::vector<GDALColorEntry> colorEntries;
10,396✔
1210
    if (poColorTable)
10,394✔
1211
        colorEntries = ReadColorTable(*poColorTable, nTransparentIdx);
5✔
1212

1213
    // Force c4 of nodata entry to 0 so that GDALFindBestEntry() identifies
1214
    // it as nodata value
1215
    if (bHasNoData && dfNoDataValue >= 0.0f &&
10,423✔
1216
        tNoDataValue < colorEntries.size())
27✔
1217
        colorEntries[static_cast<int>(tNoDataValue)].c4 = 0;
1✔
1218

1219
    // Or if we have no explicit nodata, but a color table entry that is
1220
    // transparent, consider it as the nodata value
1221
    else if (!bHasNoData && nTransparentIdx >= 0)
10,395✔
1222
    {
1223
        bHasNoData = true;
×
1224
        tNoDataValue = static_cast<T>(nTransparentIdx);
×
1225
    }
1226

1227
    /* ==================================================================== */
1228
    /*      Precompute inner loop constants.                                */
1229
    /* ==================================================================== */
1230
    bool bSrcXSpacingIsTwo = true;
10,396✔
1231
    int nLastSrcXOff2 = -1;
10,396✔
1232
    for (int iDstPixel = nDstXOff; iDstPixel < nDstXOff2; ++iDstPixel)
867,068✔
1233
    {
1234
        double dfSrcXOff = dfSrcXDelta + iDstPixel * dfXRatioDstToSrc;
856,672✔
1235
        // Apply some epsilon to avoid numerical precision issues
1236
        int nSrcXOff = static_cast<int>(dfSrcXOff + 1e-8);
856,672✔
1237
        double dfSrcXOff2 = dfSrcXDelta + (iDstPixel + 1) * dfXRatioDstToSrc;
856,672✔
1238
        int nSrcXOff2 = static_cast<int>(ceil(dfSrcXOff2 - 1e-8));
856,672✔
1239

1240
        if (nSrcXOff < nChunkXOff)
856,672✔
1241
            nSrcXOff = nChunkXOff;
×
1242
        if (nSrcXOff2 == nSrcXOff)
856,672✔
1243
            nSrcXOff2++;
×
1244
        if (nSrcXOff2 > nChunkRightXOff)
856,672✔
1245
            nSrcXOff2 = nChunkRightXOff;
1✔
1246

1247
        pasSrcX[iDstPixel - nDstXOff].nLeftXOffShifted = nSrcXOff - nChunkXOff;
856,672✔
1248
        pasSrcX[iDstPixel - nDstXOff].nRightXOffShifted =
856,672✔
1249
            nSrcXOff2 - nChunkXOff;
856,672✔
1250
        pasSrcX[iDstPixel - nDstXOff].dfLeftWeight =
20✔
1251
            (nSrcXOff2 == nSrcXOff + 1) ? 1.0 : 1 - (dfSrcXOff - nSrcXOff);
856,672✔
1252
        pasSrcX[iDstPixel - nDstXOff].dfRightWeight =
856,672✔
1253
            1 - (nSrcXOff2 - dfSrcXOff2);
856,672✔
1254
        pasSrcX[iDstPixel - nDstXOff].dfTotalWeightFullLine =
856,672✔
1255
            pasSrcX[iDstPixel - nDstXOff].dfLeftWeight;
856,672✔
1256
        if (nSrcXOff + 1 < nSrcXOff2)
856,672✔
1257
        {
1258
            pasSrcX[iDstPixel - nDstXOff].dfTotalWeightFullLine +=
856,651✔
1259
                nSrcXOff2 - nSrcXOff - 2;
856,651✔
1260
            pasSrcX[iDstPixel - nDstXOff].dfTotalWeightFullLine +=
856,651✔
1261
                pasSrcX[iDstPixel - nDstXOff].dfRightWeight;
856,651✔
1262
        }
1263

1264
        if (nSrcXOff2 - nSrcXOff != 2 ||
856,672✔
1265
            (nLastSrcXOff2 >= 0 && nLastSrcXOff2 != nSrcXOff))
727,185✔
1266
        {
1267
            bSrcXSpacingIsTwo = false;
120,592✔
1268
        }
1269
        nLastSrcXOff2 = nSrcXOff2;
856,672✔
1270
    }
1271

1272
    /* ==================================================================== */
1273
    /*      Loop over destination scanlines.                                */
1274
    /* ==================================================================== */
1275
    for (int iDstLine = nDstYOff; iDstLine < nDstYOff2; ++iDstLine)
752,829✔
1276
    {
1277
        double dfSrcYOff = dfSrcYDelta + iDstLine * dfYRatioDstToSrc;
742,432✔
1278
        int nSrcYOff = static_cast<int>(dfSrcYOff + 1e-8);
742,432✔
1279
        if (nSrcYOff < nChunkYOff)
742,432✔
1280
            nSrcYOff = nChunkYOff;
×
1281

1282
        double dfSrcYOff2 = dfSrcYDelta + (iDstLine + 1) * dfYRatioDstToSrc;
742,432✔
1283
        int nSrcYOff2 = static_cast<int>(ceil(dfSrcYOff2 - 1e-8));
742,432✔
1284
        if (nSrcYOff2 == nSrcYOff)
742,432✔
1285
            ++nSrcYOff2;
×
1286
        if (nSrcYOff2 > nChunkBottomYOff)
742,432✔
1287
            nSrcYOff2 = nChunkBottomYOff;
3✔
1288

1289
        T *const pDstScanline = pDstBuffer + (iDstLine - nDstYOff) * nDstXWidth;
742,432✔
1290

1291
        /* --------------------------------------------------------------------
1292
         */
1293
        /*      Loop over destination pixels */
1294
        /* --------------------------------------------------------------------
1295
         */
1296
        if (poColorTable == nullptr)
742,432✔
1297
        {
1298
            if (bSrcXSpacingIsTwo && nSrcYOff2 == nSrcYOff + 2 &&
742,317✔
1299
                pabyChunkNodataMask == nullptr)
1300
            {
1301
                if (eWrkDataType == GDT_Byte || eWrkDataType == GDT_UInt16)
1302
                {
1303
                    // Optimized case : no nodata, overview by a factor of 2 and
1304
                    // regular x and y src spacing.
1305
                    const T *pSrcScanlineShifted =
116,400✔
1306
                        pChunk + pasSrcX[0].nLeftXOffShifted +
116,400✔
1307
                        static_cast<GPtrDiff_t>(nSrcYOff - nChunkYOff) *
116,400✔
1308
                            nChunkXSize;
116,400✔
1309
                    int iDstPixel = 0;
116,400✔
1310
#ifdef USE_SSE2
1311
                    if (bQuadraticMean && eWrkDataType == GDT_Byte)
116,381✔
1312
                    {
1313
                        iDstPixel = QuadraticMeanByteSSE2OrAVX2(
5,385✔
1314
                            nDstXWidth, nChunkXSize, pSrcScanlineShifted,
1315
                            pDstScanline);
1316
                    }
1317
                    else if (bQuadraticMean /* && eWrkDataType == GDT_UInt16 */)
111,015✔
1318
                    {
1319
                        iDstPixel = QuadraticMeanUInt16SSE2(
10✔
1320
                            nDstXWidth, nChunkXSize, pSrcScanlineShifted,
1321
                            pDstScanline);
1322
                    }
1323
                    else if (/* !bQuadraticMean && */ eWrkDataType == GDT_Byte)
1324
                    {
1325
                        iDstPixel = AverageByteSSE2OrAVX2(
110,996✔
1326
                            nDstXWidth, nChunkXSize, pSrcScanlineShifted,
1327
                            pDstScanline);
1328
                    }
1329
                    else /* if( !bQuadraticMean && eWrkDataType == GDT_UInt16 )
1330
                          */
1331
                    {
1332
                        iDstPixel = AverageUInt16SSE2(nDstXWidth, nChunkXSize,
9✔
1333
                                                      pSrcScanlineShifted,
1334
                                                      pDstScanline);
1335
                    }
1336
#endif
1337
                    for (; iDstPixel < nDstXWidth; ++iDstPixel)
278,721✔
1338
                    {
1339
                        Tsum nTotal = 0;
162,321✔
1340
                        T nVal;
1341
                        if (bQuadraticMean)
162,321✔
1342
                            nTotal =
44✔
1343
                                SQUARE<Tsum>(pSrcScanlineShifted[0]) +
44✔
1344
                                SQUARE<Tsum>(pSrcScanlineShifted[1]) +
44✔
1345
                                SQUARE<Tsum>(pSrcScanlineShifted[nChunkXSize]) +
44✔
1346
                                SQUARE<Tsum>(
44✔
1347
                                    pSrcScanlineShifted[1 + nChunkXSize]);
44✔
1348
                        else
1349
                            nTotal = pSrcScanlineShifted[0] +
162,277✔
1350
                                     pSrcScanlineShifted[1] +
162,277✔
1351
                                     pSrcScanlineShifted[nChunkXSize] +
162,277✔
1352
                                     pSrcScanlineShifted[1 + nChunkXSize];
162,277✔
1353

1354
                        constexpr int nTotalWeight = 4;
162,321✔
1355
                        if (bQuadraticMean)
162,321✔
1356
                            nVal = ComputeIntegerRMS_4values<T>(nTotal);
44✔
1357
                        else
1358
                            nVal = static_cast<T>((nTotal + nTotalWeight / 2) /
162,277✔
1359
                                                  nTotalWeight);
1360

1361
                        // No need to compare nVal against tNoDataValue as we
1362
                        // are in a case where pabyChunkNodataMask == nullptr
1363
                        // implies the absence of nodata value.
1364
                        pDstScanline[iDstPixel] = nVal;
162,321✔
1365
                        pSrcScanlineShifted += 2;
162,321✔
1366
                    }
1367
                }
1368
                else
1369
                {
1370
                    CPLAssert(eWrkDataType == GDT_Float32 ||
1371
                              eWrkDataType == GDT_Float64);
1372
                    const T *pSrcScanlineShifted =
70✔
1373
                        pChunk + pasSrcX[0].nLeftXOffShifted +
70✔
1374
                        static_cast<GPtrDiff_t>(nSrcYOff - nChunkYOff) *
70✔
1375
                            nChunkXSize;
70✔
1376
                    int iDstPixel = 0;
70✔
1377
#ifdef USE_SSE2
1378
                    if (eWrkDataType == GDT_Float32)
1379
                    {
1380
                        if (bQuadraticMean)
48✔
1381
                        {
1382
                            iDstPixel = QuadraticMeanFloatSSE2(
34✔
1383
                                nDstXWidth, nChunkXSize, pSrcScanlineShifted,
1384
                                pDstScanline);
1385
                        }
1386
                        else
1387
                        {
1388
                            iDstPixel = AverageFloatSSE2(
14✔
1389
                                nDstXWidth, nChunkXSize, pSrcScanlineShifted,
1390
                                pDstScanline);
1391
                        }
1392
                    }
1393
#endif
1394

1395
                    for (; iDstPixel < nDstXWidth; ++iDstPixel)
268✔
1396
                    {
1397
                        T nVal;
1398
                        if (bQuadraticMean)
198✔
1399
                        {
1400
                            // Cast to double to avoid overflows
1401
                            // (using std::hypot() is much slower)
1402
                            nVal = static_cast<T>(std::sqrt(
100✔
1403
                                0.25 *
1404
                                (SQUARE<double>(pSrcScanlineShifted[0]) +
100✔
1405
                                 SQUARE<double>(pSrcScanlineShifted[1]) +
100✔
1406
                                 SQUARE<double>(
100✔
1407
                                     pSrcScanlineShifted[nChunkXSize]) +
200✔
1408
                                 SQUARE<double>(
100✔
1409
                                     pSrcScanlineShifted[1 + nChunkXSize]))));
100✔
1410
                        }
1411
                        else
1412
                        {
1413
                            nVal = static_cast<T>(
98✔
1414
                                0.25f * (pSrcScanlineShifted[0] +
98✔
1415
                                         pSrcScanlineShifted[1] +
98✔
1416
                                         pSrcScanlineShifted[nChunkXSize] +
98✔
1417
                                         pSrcScanlineShifted[1 + nChunkXSize]));
98✔
1418
                        }
1419

1420
                        // No need to compare nVal against tNoDataValue as we
1421
                        // are in a case where pabyChunkNodataMask == nullptr
1422
                        // implies the absence of nodata value.
1423
                        pDstScanline[iDstPixel] = nVal;
198✔
1424
                        pSrcScanlineShifted += 2;
198✔
1425
                    }
1426
                }
116,470✔
1427
            }
1428
            else
1429
            {
1430
                const double dfBottomWeight =
13✔
1431
                    (nSrcYOff + 1 == nSrcYOff2) ? 1.0
625,847✔
1432
                                                : 1.0 - (dfSrcYOff - nSrcYOff);
625,834✔
1433
                const double dfTopWeight = 1.0 - (nSrcYOff2 - dfSrcYOff2);
625,847✔
1434
                nSrcYOff -= nChunkYOff;
625,847✔
1435
                nSrcYOff2 -= nChunkYOff;
625,847✔
1436

1437
                double dfTotalWeightFullColumn = dfBottomWeight;
625,847✔
1438
                if (nSrcYOff + 1 < nSrcYOff2)
625,847✔
1439
                {
1440
                    dfTotalWeightFullColumn += nSrcYOff2 - nSrcYOff - 2;
625,837✔
1441
                    dfTotalWeightFullColumn += dfTopWeight;
625,837✔
1442
                }
1443

1444
                for (int iDstPixel = 0; iDstPixel < nDstXWidth; ++iDstPixel)
18,585,156✔
1445
                {
1446
                    const int nSrcXOff = pasSrcX[iDstPixel].nLeftXOffShifted;
17,958,381✔
1447
                    const int nSrcXOff2 = pasSrcX[iDstPixel].nRightXOffShifted;
17,958,381✔
1448

1449
                    double dfTotal = 0;
17,958,381✔
1450
                    double dfTotalWeight = 0;
17,958,381✔
1451
                    if (pabyChunkNodataMask == nullptr)
17,958,381✔
1452
                    {
1453
                        auto pChunkShifted =
1,746,435✔
1454
                            pChunk +
115✔
1455
                            static_cast<GPtrDiff_t>(nSrcYOff) * nChunkXSize;
1,746,435✔
1456
                        int nCounterY = nSrcYOff2 - nSrcYOff - 1;
1,746,435✔
1457
                        double dfWeightY = dfBottomWeight;
1,746,435✔
1458
                        while (true)
3,493,427✔
1459
                        {
1460
                            double dfTotalLine;
1461
                            if (bQuadraticMean)
5,239,852✔
1462
                            {
1463
                                // Left pixel
1464
                                {
1465
                                    const T val = pChunkShifted[nSrcXOff];
104✔
1466
                                    dfTotalLine =
104✔
1467
                                        SQUARE<double>(val) *
104✔
1468
                                        pasSrcX[iDstPixel].dfLeftWeight;
104✔
1469
                                }
1470

1471
                                if (nSrcXOff + 1 < nSrcXOff2)
104✔
1472
                                {
1473
                                    // Middle pixels
1474
                                    for (int iX = nSrcXOff + 1;
104✔
1475
                                         iX + 1 < nSrcXOff2; ++iX)
424✔
1476
                                    {
1477
                                        const T val = pChunkShifted[iX];
320✔
1478
                                        dfTotalLine += SQUARE<double>(val);
320✔
1479
                                    }
1480

1481
                                    // Right pixel
1482
                                    {
1483
                                        const T val =
104✔
1484
                                            pChunkShifted[nSrcXOff2 - 1];
104✔
1485
                                        dfTotalLine +=
104✔
1486
                                            SQUARE<double>(val) *
104✔
1487
                                            pasSrcX[iDstPixel].dfRightWeight;
104✔
1488
                                    }
1489
                                }
1490
                            }
1491
                            else
1492
                            {
1493
                                // Left pixel
1494
                                {
1495
                                    const T val = pChunkShifted[nSrcXOff];
5,239,756✔
1496
                                    dfTotalLine =
5,239,756✔
1497
                                        val * pasSrcX[iDstPixel].dfLeftWeight;
5,239,756✔
1498
                                }
1499

1500
                                if (nSrcXOff + 1 < nSrcXOff2)
5,239,756✔
1501
                                {
1502
                                    // Middle pixels
1503
                                    for (int iX = nSrcXOff + 1;
4,239,330✔
1504
                                         iX + 1 < nSrcXOff2; ++iX)
64,183,126✔
1505
                                    {
1506
                                        const T val = pChunkShifted[iX];
59,943,836✔
1507
                                        dfTotalLine += val;
59,943,836✔
1508
                                    }
1509

1510
                                    // Right pixel
1511
                                    {
1512
                                        const T val =
4,239,330✔
1513
                                            pChunkShifted[nSrcXOff2 - 1];
4,239,330✔
1514
                                        dfTotalLine +=
4,239,330✔
1515
                                            val *
4,239,330✔
1516
                                            pasSrcX[iDstPixel].dfRightWeight;
4,239,330✔
1517
                                    }
1518
                                }
1519
                            }
1520

1521
                            dfTotal += dfTotalLine * dfWeightY;
5,239,852✔
1522
                            --nCounterY;
5,239,852✔
1523
                            if (nCounterY < 0)
5,239,852✔
1524
                                break;
1,746,435✔
1525
                            pChunkShifted += nChunkXSize;
3,493,427✔
1526
                            dfWeightY = (nCounterY == 0) ? dfTopWeight : 1.0;
3,493,427✔
1527
                        }
1528

1529
                        dfTotalWeight =
1,746,435✔
1530
                            pasSrcX[iDstPixel].dfTotalWeightFullLine *
1,746,435✔
1531
                            dfTotalWeightFullColumn;
1532
                    }
1533
                    else
1534
                    {
1535
                        GPtrDiff_t nCount = 0;
16,211,966✔
1536
                        for (int iY = nSrcYOff; iY < nSrcYOff2; ++iY)
71,185,998✔
1537
                        {
1538
                            const auto pChunkShifted =
54,973,732✔
1539
                                pChunk +
132✔
1540
                                static_cast<GPtrDiff_t>(iY) * nChunkXSize;
54,973,732✔
1541

1542
                            double dfTotalLine = 0;
54,973,732✔
1543
                            double dfTotalWeightLine = 0;
54,973,732✔
1544
                            // Left pixel
1545
                            {
1546
                                const int iX = nSrcXOff;
54,973,732✔
1547
                                const T val = pChunkShifted[iX];
54,973,732✔
1548
                                if (pabyChunkNodataMask[iX + iY * nChunkXSize])
54,973,732✔
1549
                                {
1550
                                    nCount++;
23,417,781✔
1551
                                    const double dfWeightX =
23,417,781✔
1552
                                        pasSrcX[iDstPixel].dfLeftWeight;
23,417,781✔
1553
                                    dfTotalWeightLine = dfWeightX;
23,417,781✔
1554
                                    if (bQuadraticMean)
23,417,781✔
1555
                                        dfTotalLine =
60✔
1556
                                            SQUARE<double>(val) * dfWeightX;
60✔
1557
                                    else
1558
                                        dfTotalLine = val * dfWeightX;
23,417,681✔
1559
                                }
1560
                            }
1561

1562
                            if (nSrcXOff + 1 < nSrcXOff2)
54,973,732✔
1563
                            {
1564
                                // Middle pixels
1565
                                for (int iX = nSrcXOff + 1; iX + 1 < nSrcXOff2;
145,163,132✔
1566
                                     ++iX)
1567
                                {
1568
                                    const T val = pChunkShifted[iX];
90,193,500✔
1569
                                    if (pabyChunkNodataMask[iX +
90,193,500✔
1570
                                                            iY * nChunkXSize])
90,193,500✔
1571
                                    {
1572
                                        nCount++;
39,727,800✔
1573
                                        dfTotalWeightLine += 1;
39,727,800✔
1574
                                        if (bQuadraticMean)
39,727,800✔
1575
                                            dfTotalLine += SQUARE<double>(val);
×
1576
                                        else
1577
                                            dfTotalLine += val;
39,727,800✔
1578
                                    }
1579
                                }
1580

1581
                                // Right pixel
1582
                                {
1583
                                    const int iX = nSrcXOff2 - 1;
54,969,932✔
1584
                                    const T val = pChunkShifted[iX];
54,969,932✔
1585
                                    if (pabyChunkNodataMask[iX +
54,969,932✔
1586
                                                            iY * nChunkXSize])
54,969,932✔
1587
                                    {
1588
                                        nCount++;
23,417,247✔
1589
                                        const double dfWeightX =
23,417,247✔
1590
                                            pasSrcX[iDstPixel].dfRightWeight;
23,417,247✔
1591
                                        dfTotalWeightLine += dfWeightX;
23,417,247✔
1592
                                        if (bQuadraticMean)
23,417,247✔
1593
                                            dfTotalLine +=
4,477✔
1594
                                                SQUARE<double>(val) * dfWeightX;
61✔
1595
                                        else
1596
                                            dfTotalLine += val * dfWeightX;
23,417,246✔
1597
                                    }
1598
                                }
1599
                            }
1600

1601
                            const double dfWeightY =
93,736,098✔
1602
                                (iY == nSrcYOff)        ? dfBottomWeight
1603
                                : (iY + 1 == nSrcYOff2) ? dfTopWeight
38,762,066✔
1604
                                                        : 1.0;
1605
                            dfTotal += dfTotalLine * dfWeightY;
54,974,032✔
1606
                            dfTotalWeight += dfTotalWeightLine * dfWeightY;
54,974,032✔
1607
                        }
1608

1609
                        if (nCount == 0 ||
16,212,366✔
1610
                            (bPropagateNoData &&
8✔
1611
                             nCount <
1612
                                 static_cast<GPtrDiff_t>(nSrcYOff2 - nSrcYOff) *
8✔
1613
                                     (nSrcXOff2 - nSrcXOff)))
8✔
1614
                        {
1615
                            pDstScanline[iDstPixel] = tNoDataValue;
9,461,612✔
1616
                            continue;
9,461,612✔
1617
                        }
1618
                    }
1619
                    if (eWrkDataType == GDT_Byte)
1620
                    {
1621
                        T nVal;
1622
                        if (bQuadraticMean)
8,496,990✔
1623
                            nVal = ComputeIntegerRMS<T, int>(dfTotal,
38✔
1624
                                                             dfTotalWeight);
1625
                        else
1626
                            nVal =
8,496,950✔
1627
                                static_cast<T>(dfTotal / dfTotalWeight + 0.5);
8,496,950✔
1628
                        if (bHasNoData && nVal == tNoDataValue)
8,497,530✔
1629
                            nVal = tReplacementVal;
×
1630
                        pDstScanline[iDstPixel] = nVal;
8,497,530✔
1631
                    }
1632
                    else if (eWrkDataType == GDT_UInt16)
1633
                    {
1634
                        T nVal;
1635
                        if (bQuadraticMean)
8✔
1636
                            nVal = ComputeIntegerRMS<T, uint64_t>(
4✔
1637
                                dfTotal, dfTotalWeight);
1638
                        else
1639
                            nVal =
4✔
1640
                                static_cast<T>(dfTotal / dfTotalWeight + 0.5);
4✔
1641
                        if (bHasNoData && nVal == tNoDataValue)
8✔
1642
                            nVal = tReplacementVal;
×
1643
                        pDstScanline[iDstPixel] = nVal;
8✔
1644
                    }
1645
                    else
1646
                    {
1647
                        T nVal;
1648
                        if (bQuadraticMean)
151✔
1649
                            nVal =
20✔
1650
                                static_cast<T>(sqrt(dfTotal / dfTotalWeight));
25✔
1651
                        else
1652
                            nVal = static_cast<T>(dfTotal / dfTotalWeight);
126✔
1653
                        if (bHasNoData && nVal == tNoDataValue)
151✔
1654
                            nVal = tReplacementVal;
2✔
1655
                        pDstScanline[iDstPixel] = nVal;
151✔
1656
                    }
1657
                }
1658
            }
1659
        }
1660
        else
1661
        {
1662
            nSrcYOff -= nChunkYOff;
115✔
1663
            nSrcYOff2 -= nChunkYOff;
115✔
1664

1665
            for (int iDstPixel = 0; iDstPixel < nDstXWidth; ++iDstPixel)
5,661✔
1666
            {
1667
                const int nSrcXOff = pasSrcX[iDstPixel].nLeftXOffShifted;
6,475✔
1668
                const int nSrcXOff2 = pasSrcX[iDstPixel].nRightXOffShifted;
6,475✔
1669

1670
                GPtrDiff_t nTotalR = 0;
6,475✔
1671
                GPtrDiff_t nTotalG = 0;
6,475✔
1672
                GPtrDiff_t nTotalB = 0;
6,475✔
1673
                GPtrDiff_t nCount = 0;
6,475✔
1674

1675
                for (int iY = nSrcYOff; iY < nSrcYOff2; ++iY)
19,425✔
1676
                {
1677
                    for (int iX = nSrcXOff; iX < nSrcXOff2; ++iX)
38,850✔
1678
                    {
1679
                        const T val = pChunk[iX + static_cast<GPtrDiff_t>(iY) *
25,900✔
1680
                                                      nChunkXSize];
25,900✔
1681
                        // cppcheck-suppress unsignedLessThanZero
1682
                        if (val < 0 || val >= colorEntries.size())
25,900✔
1683
                            continue;
×
1684
                        size_t idx = static_cast<size_t>(val);
25,900✔
1685
                        const auto &entry = colorEntries[idx];
25,900✔
1686
                        if (entry.c4)
25,900✔
1687
                        {
1688
                            if (bQuadraticMean)
14,128✔
1689
                            {
1690
                                nTotalR += SQUARE<int>(entry.c1);
800✔
1691
                                nTotalG += SQUARE<int>(entry.c2);
800✔
1692
                                nTotalB += SQUARE<int>(entry.c3);
800✔
1693
                                ++nCount;
800✔
1694
                            }
1695
                            else
1696
                            {
1697
                                nTotalR += entry.c1;
13,328✔
1698
                                nTotalG += entry.c2;
13,328✔
1699
                                nTotalB += entry.c3;
13,328✔
1700
                                ++nCount;
13,328✔
1701
                            }
1702
                        }
1703
                    }
1704
                }
1705

1706
                if (nCount == 0 ||
6,475✔
1707
                    (bPropagateNoData &&
×
1708
                     nCount < static_cast<GPtrDiff_t>(nSrcYOff2 - nSrcYOff) *
×
1709
                                  (nSrcXOff2 - nSrcXOff)))
×
1710
                {
1711
                    pDstScanline[iDstPixel] = tNoDataValue;
2,838✔
1712
                }
1713
                else
1714
                {
1715
                    GDALColorEntry color;
1716
                    if (bQuadraticMean)
3,637✔
1717
                    {
1718
                        color.c1 =
200✔
1719
                            static_cast<short>(sqrt(nTotalR / nCount) + 0.5);
200✔
1720
                        color.c2 =
200✔
1721
                            static_cast<short>(sqrt(nTotalG / nCount) + 0.5);
200✔
1722
                        color.c3 =
200✔
1723
                            static_cast<short>(sqrt(nTotalB / nCount) + 0.5);
200✔
1724
                    }
1725
                    else
1726
                    {
1727
                        color.c1 =
3,437✔
1728
                            static_cast<short>((nTotalR + nCount / 2) / nCount);
3,437✔
1729
                        color.c2 =
3,437✔
1730
                            static_cast<short>((nTotalG + nCount / 2) / nCount);
3,437✔
1731
                        color.c3 =
3,437✔
1732
                            static_cast<short>((nTotalB + nCount / 2) / nCount);
3,437✔
1733
                    }
1734
                    pDstScanline[iDstPixel] =
2,708✔
1735
                        static_cast<T>(BestColorEntry(colorEntries, color));
3,637✔
1736
                }
1737
            }
1738
        }
1739
    }
1740

1741
    CPLFree(pasSrcX);
10,397✔
1742

1743
    return CE_None;
10,397✔
1744
}
1745

1746
/** Dispatch average / RMS resampling of one chunk on the working data type.
 *
 * Sets *peDstBufferDataType to args.eWrkDataType, then forwards to the
 * GDALResampleChunk_AverageOrRMS_T instantiation matching that type
 * (GDT_Byte, GDT_UInt16, GDT_Float32 or GDT_Float64).
 *
 * @param args Resampling parameters; eWrkDataType and pszResampling are
 *             read here, the rest is forwarded to the typed implementation.
 * @param pChunk Source chunk, reinterpreted as an array of the C type
 *               matching args.eWrkDataType.
 * @param ppDstBuffer Forwarded to the typed implementation.
 * @param peDstBufferDataType Output: always set to args.eWrkDataType.
 * @return The result of the typed implementation, or CE_Failure (with an
 *         error emitted) when the working data type is not handled.
 */
static CPLErr
GDALResampleChunk_AverageOrRMS(const GDALOverviewResampleArgs &args,
                               const void *pChunk, void **ppDstBuffer,
                               GDALDataType *peDstBufferDataType)
{
    *peDstBufferDataType = args.eWrkDataType;
    switch (args.eWrkDataType)
    {
        case GDT_Byte:
        {
            return GDALResampleChunk_AverageOrRMS_T<GByte, int, GDT_Byte>(
                args, static_cast<const GByte *>(pChunk), ppDstBuffer);
        }

        case GDT_UInt16:
        {
            if (EQUAL(args.pszResampling, "RMS"))
            {
                // Use double as accumulation type, because UInt32 could overflow
                return GDALResampleChunk_AverageOrRMS_T<GUInt16, double,
                                                        GDT_UInt16>(
                    args, static_cast<const GUInt16 *>(pChunk), ppDstBuffer);
            }
            else
            {
                return GDALResampleChunk_AverageOrRMS_T<GUInt16, GUInt32,
                                                        GDT_UInt16>(
                    args, static_cast<const GUInt16 *>(pChunk), ppDstBuffer);
            }
        }

        case GDT_Float32:
        {
            return GDALResampleChunk_AverageOrRMS_T<float, double, GDT_Float32>(
                args, static_cast<const float *>(pChunk), ppDstBuffer);
        }

        case GDT_Float64:
        {
            return GDALResampleChunk_AverageOrRMS_T<double, double,
                                                    GDT_Float64>(
                args, static_cast<const double *>(pChunk), ppDstBuffer);
        }

        default:
            break;
    }

    // Not expected to be reached. CPLAssert() is compiled out of release
    // builds, so also emit an error there rather than failing silently.
    CPLAssert(false);
    CPLError(CE_Failure, CPLE_NotSupported,
             "GDALResampleChunk_AverageOrRMS(): unsupported working "
             "data type %d",
             static_cast<int>(args.eWrkDataType));
    return CE_Failure;
}
1797

1798
/************************************************************************/
1799
/*                     GDALResampleChunk_Gauss()                        */
1800
/************************************************************************/
1801

1802
static CPLErr GDALResampleChunk_Gauss(const GDALOverviewResampleArgs &args,
86✔
1803
                                      const void *pChunk, void **ppDstBuffer,
1804
                                      GDALDataType *peDstBufferDataType)
1805

1806
{
1807
    const double dfXRatioDstToSrc = args.dfXRatioDstToSrc;
86✔
1808
    const double dfYRatioDstToSrc = args.dfYRatioDstToSrc;
86✔
1809
    const GByte *pabyChunkNodataMask = args.pabyChunkNodataMask;
86✔
1810
    const int nChunkXOff = args.nChunkXOff;
86✔
1811
    const int nChunkXSize = args.nChunkXSize;
86✔
1812
    const int nChunkYOff = args.nChunkYOff;
86✔
1813
    const int nChunkYSize = args.nChunkYSize;
86✔
1814
    const int nDstXOff = args.nDstXOff;
86✔
1815
    const int nDstXOff2 = args.nDstXOff2;
86✔
1816
    const int nDstYOff = args.nDstYOff;
86✔
1817
    const int nDstYOff2 = args.nDstYOff2;
86✔
1818
    const bool bHasNoData = args.bHasNoData;
86✔
1819
    double dfNoDataValue = args.dfNoDataValue;
86✔
1820
    const GDALColorTable *poColorTable = args.poColorTable;
86✔
1821

1822
    const double *const padfChunk = static_cast<const double *>(pChunk);
86✔
1823

1824
    *ppDstBuffer =
86✔
1825
        VSI_MALLOC3_VERBOSE(nDstXOff2 - nDstXOff, nDstYOff2 - nDstYOff,
86✔
1826
                            GDALGetDataTypeSizeBytes(GDT_Float64));
1827
    if (*ppDstBuffer == nullptr)
86✔
1828
    {
1829
        return CE_Failure;
×
1830
    }
1831
    *peDstBufferDataType = GDT_Float64;
86✔
1832
    double *const padfDstBuffer = static_cast<double *>(*ppDstBuffer);
86✔
1833

1834
    /* -------------------------------------------------------------------- */
1835
    /*      Create the filter kernel and allocate scanline buffer.          */
1836
    /* -------------------------------------------------------------------- */
1837
    int nGaussMatrixDim = 3;
86✔
1838
    const int *panGaussMatrix;
1839
    constexpr int anGaussMatrix3x3[] = {1, 2, 1, 2, 4, 2, 1, 2, 1};
86✔
1840
    constexpr int anGaussMatrix5x5[] = {1,  4, 6,  4,  1,  4, 16, 24, 16,
86✔
1841
                                        4,  6, 24, 36, 24, 6, 4,  16, 24,
1842
                                        16, 4, 1,  4,  6,  4, 1};
1843
    constexpr int anGaussMatrix7x7[] = {
86✔
1844
        1,   6,  15, 20,  15,  6,   1,   6,  36, 90,  120, 90,  36,
1845
        6,   15, 90, 225, 300, 225, 90,  15, 20, 120, 300, 400, 300,
1846
        120, 20, 15, 90,  225, 300, 225, 90, 15, 6,   36,  90,  120,
1847
        90,  36, 6,  1,   6,   15,  20,  15, 6,  1};
1848

1849
    const int nOXSize = args.nOvrXSize;
86✔
1850
    const int nOYSize = args.nOvrYSize;
86✔
1851
    const int nResYFactor = static_cast<int>(0.5 + dfYRatioDstToSrc);
86✔
1852

1853
    // matrix for gauss filter
1854
    if (nResYFactor <= 2)
86✔
1855
    {
1856
        panGaussMatrix = anGaussMatrix3x3;
85✔
1857
        nGaussMatrixDim = 3;
85✔
1858
    }
1859
    else if (nResYFactor <= 4)
1✔
1860
    {
1861
        panGaussMatrix = anGaussMatrix5x5;
×
1862
        nGaussMatrixDim = 5;
×
1863
    }
1864
    else
1865
    {
1866
        panGaussMatrix = anGaussMatrix7x7;
1✔
1867
        nGaussMatrixDim = 7;
1✔
1868
    }
1869

1870
#ifdef DEBUG_OUT_OF_BOUND_ACCESS
1871
    int *panGaussMatrixDup = static_cast<int *>(
1872
        CPLMalloc(sizeof(int) * nGaussMatrixDim * nGaussMatrixDim));
1873
    memcpy(panGaussMatrixDup, panGaussMatrix,
1874
           sizeof(int) * nGaussMatrixDim * nGaussMatrixDim);
1875
    panGaussMatrix = panGaussMatrixDup;
1876
#endif
1877

1878
    if (!bHasNoData)
86✔
1879
        dfNoDataValue = 0.0;
79✔
1880

1881
    std::vector<GDALColorEntry> colorEntries;
86✔
1882
    int nTransparentIdx = -1;
86✔
1883
    if (poColorTable)
86✔
1884
        colorEntries = ReadColorTable(*poColorTable, nTransparentIdx);
2✔
1885

1886
    // Force c4 of nodata entry to 0 so that GDALFindBestEntry() identifies
1887
    // it as nodata value.
1888
    if (bHasNoData && dfNoDataValue >= 0.0f &&
92✔
1889
        dfNoDataValue < colorEntries.size())
6✔
1890
        colorEntries[static_cast<int>(dfNoDataValue)].c4 = 0;
×
1891

1892
    // Or if we have no explicit nodata, but a color table entry that is
1893
    // transparent, consider it as the nodata value.
1894
    else if (!bHasNoData && nTransparentIdx >= 0)
86✔
1895
    {
1896
        dfNoDataValue = nTransparentIdx;
×
1897
    }
1898

1899
    const int nChunkRightXOff = nChunkXOff + nChunkXSize;
86✔
1900
    const int nChunkBottomYOff = nChunkYOff + nChunkYSize;
86✔
1901
    const int nDstXWidth = nDstXOff2 - nDstXOff;
86✔
1902

1903
    /* ==================================================================== */
1904
    /*      Loop over destination scanlines.                                */
1905
    /* ==================================================================== */
1906
    for (int iDstLine = nDstYOff; iDstLine < nDstYOff2; ++iDstLine)
16,488✔
1907
    {
1908
        int nSrcYOff = static_cast<int>(0.5 + iDstLine * dfYRatioDstToSrc);
16,402✔
1909
        int nSrcYOff2 =
16,402✔
1910
            static_cast<int>(0.5 + (iDstLine + 1) * dfYRatioDstToSrc) + 1;
16,402✔
1911

1912
        if (nSrcYOff < nChunkYOff)
16,402✔
1913
        {
1914
            nSrcYOff = nChunkYOff;
×
1915
            nSrcYOff2++;
×
1916
        }
1917

1918
        const int iSizeY = nSrcYOff2 - nSrcYOff;
16,402✔
1919
        nSrcYOff = nSrcYOff + iSizeY / 2 - nGaussMatrixDim / 2;
16,402✔
1920
        nSrcYOff2 = nSrcYOff + nGaussMatrixDim;
16,402✔
1921

1922
        if (nSrcYOff2 > nChunkBottomYOff ||
16,402✔
1923
            (dfYRatioDstToSrc > 1 && iDstLine == nOYSize - 1))
16,359✔
1924
        {
1925
            nSrcYOff2 = std::min(nChunkBottomYOff, nSrcYOff + nGaussMatrixDim);
44✔
1926
        }
1927

1928
        int nYShiftGaussMatrix = 0;
16,402✔
1929
        if (nSrcYOff < nChunkYOff)
16,402✔
1930
        {
1931
            nYShiftGaussMatrix = -(nSrcYOff - nChunkYOff);
×
1932
            nSrcYOff = nChunkYOff;
×
1933
        }
1934

1935
        const double *const padfSrcScanline =
16,402✔
1936
            padfChunk + ((nSrcYOff - nChunkYOff) * nChunkXSize);
16,402✔
1937
        const GByte *pabySrcScanlineNodataMask = nullptr;
16,402✔
1938
        if (pabyChunkNodataMask != nullptr)
16,402✔
1939
            pabySrcScanlineNodataMask =
152✔
1940
                pabyChunkNodataMask + ((nSrcYOff - nChunkYOff) * nChunkXSize);
152✔
1941

1942
        /* --------------------------------------------------------------------
1943
         */
1944
        /*      Loop over destination pixels */
1945
        /* --------------------------------------------------------------------
1946
         */
1947
        double *const padfDstScanline =
16,402✔
1948
            padfDstBuffer + (iDstLine - nDstYOff) * nDstXWidth;
16,402✔
1949
        for (int iDstPixel = nDstXOff; iDstPixel < nDstXOff2; ++iDstPixel)
4,149,980✔
1950
        {
1951
            int nSrcXOff = static_cast<int>(0.5 + iDstPixel * dfXRatioDstToSrc);
4,133,580✔
1952
            int nSrcXOff2 =
4,133,580✔
1953
                static_cast<int>(0.5 + (iDstPixel + 1) * dfXRatioDstToSrc) + 1;
4,133,580✔
1954

1955
            if (nSrcXOff < nChunkXOff)
4,133,580✔
1956
            {
1957
                nSrcXOff = nChunkXOff;
×
1958
                nSrcXOff2++;
×
1959
            }
1960

1961
            const int iSizeX = nSrcXOff2 - nSrcXOff;
4,133,580✔
1962
            nSrcXOff = nSrcXOff + iSizeX / 2 - nGaussMatrixDim / 2;
4,133,580✔
1963
            nSrcXOff2 = nSrcXOff + nGaussMatrixDim;
4,133,580✔
1964

1965
            if (nSrcXOff2 > nChunkRightXOff ||
4,133,580✔
1966
                (dfXRatioDstToSrc > 1 && iDstPixel == nOXSize - 1))
4,127,930✔
1967
            {
1968
                nSrcXOff2 =
5,650✔
1969
                    std::min(nChunkRightXOff, nSrcXOff + nGaussMatrixDim);
5,650✔
1970
            }
1971

1972
            int nXShiftGaussMatrix = 0;
4,133,580✔
1973
            if (nSrcXOff < nChunkXOff)
4,133,580✔
1974
            {
1975
                nXShiftGaussMatrix = -(nSrcXOff - nChunkXOff);
×
1976
                nSrcXOff = nChunkXOff;
×
1977
            }
1978

1979
            if (poColorTable == nullptr)
4,133,580✔
1980
            {
1981
                double dfTotal = 0.0;
4,133,380✔
1982
                GInt64 nCount = 0;
4,133,380✔
1983
                const int *panLineWeight =
4,133,380✔
1984
                    panGaussMatrix + nYShiftGaussMatrix * nGaussMatrixDim +
4,133,380✔
1985
                    nXShiftGaussMatrix;
1986

1987
                for (int j = 0, iY = nSrcYOff; iY < nSrcYOff2;
16,527,900✔
1988
                     ++iY, ++j, panLineWeight += nGaussMatrixDim)
12,394,500✔
1989
                {
1990
                    for (int i = 0, iX = nSrcXOff; iX < nSrcXOff2; ++iX, ++i)
49,561,300✔
1991
                    {
1992
                        const double val =
37,166,800✔
1993
                            padfSrcScanline[iX - nChunkXOff +
37,166,800✔
1994
                                            static_cast<GPtrDiff_t>(iY -
37,166,800✔
1995
                                                                    nSrcYOff) *
37,166,800✔
1996
                                                nChunkXSize];
37,166,800✔
1997
                        if (pabySrcScanlineNodataMask == nullptr ||
37,166,800✔
1998
                            pabySrcScanlineNodataMask[iX - nChunkXOff +
32,872✔
1999
                                                      static_cast<GPtrDiff_t>(
32,872✔
2000
                                                          iY - nSrcYOff) *
32,872✔
2001
                                                          nChunkXSize])
32,872✔
2002
                        {
2003
                            const int nWeight = panLineWeight[i];
37,146,100✔
2004
                            dfTotal += val * nWeight;
37,146,100✔
2005
                            nCount += nWeight;
37,146,100✔
2006
                        }
2007
                    }
2008
                }
2009

2010
                if (nCount == 0)
4,133,380✔
2011
                {
2012
                    padfDstScanline[iDstPixel - nDstXOff] = dfNoDataValue;
2,217✔
2013
                }
2014
                else
2015
                {
2016
                    padfDstScanline[iDstPixel - nDstXOff] = dfTotal / nCount;
4,131,160✔
2017
                }
2018
            }
2019
            else
2020
            {
2021
                GInt64 nTotalR = 0;
200✔
2022
                GInt64 nTotalG = 0;
200✔
2023
                GInt64 nTotalB = 0;
200✔
2024
                GInt64 nTotalWeight = 0;
200✔
2025
                const int *panLineWeight =
200✔
2026
                    panGaussMatrix + nYShiftGaussMatrix * nGaussMatrixDim +
200✔
2027
                    nXShiftGaussMatrix;
2028

2029
                for (int j = 0, iY = nSrcYOff; iY < nSrcYOff2;
780✔
2030
                     ++iY, ++j, panLineWeight += nGaussMatrixDim)
580✔
2031
                {
2032
                    for (int i = 0, iX = nSrcXOff; iX < nSrcXOff2; ++iX, ++i)
2,262✔
2033
                    {
2034
                        const double val =
1,682✔
2035
                            padfSrcScanline[iX - nChunkXOff +
1,682✔
2036
                                            static_cast<GPtrDiff_t>(iY -
1,682✔
2037
                                                                    nSrcYOff) *
1,682✔
2038
                                                nChunkXSize];
1,682✔
2039
                        if (val < 0 || val >= colorEntries.size())
1,682✔
2040
                            continue;
×
2041

2042
                        size_t idx = static_cast<size_t>(val);
1,682✔
2043
                        if (colorEntries[idx].c4)
1,682✔
2044
                        {
2045
                            const int nWeight = panLineWeight[i];
1,682✔
2046
                            nTotalR +=
1,682✔
2047
                                static_cast<GInt64>(colorEntries[idx].c1) *
1,682✔
2048
                                nWeight;
1,682✔
2049
                            nTotalG +=
1,682✔
2050
                                static_cast<GInt64>(colorEntries[idx].c2) *
1,682✔
2051
                                nWeight;
1,682✔
2052
                            nTotalB +=
1,682✔
2053
                                static_cast<GInt64>(colorEntries[idx].c3) *
1,682✔
2054
                                nWeight;
1,682✔
2055
                            nTotalWeight += nWeight;
1,682✔
2056
                        }
2057
                    }
2058
                }
2059

2060
                if (nTotalWeight == 0)
200✔
2061
                {
2062
                    padfDstScanline[iDstPixel - nDstXOff] = dfNoDataValue;
×
2063
                }
2064
                else
2065
                {
2066
                    GDALColorEntry color;
2067

2068
                    color.c1 = static_cast<short>((nTotalR + nTotalWeight / 2) /
200✔
2069
                                                  nTotalWeight);
2070
                    color.c2 = static_cast<short>((nTotalG + nTotalWeight / 2) /
200✔
2071
                                                  nTotalWeight);
2072
                    color.c3 = static_cast<short>((nTotalB + nTotalWeight / 2) /
200✔
2073
                                                  nTotalWeight);
2074
                    padfDstScanline[iDstPixel - nDstXOff] =
200✔
2075
                        BestColorEntry(colorEntries, color);
200✔
2076
                }
2077
            }
2078
        }
2079
    }
2080

2081
#ifdef DEBUG_OUT_OF_BOUND_ACCESS
2082
    CPLFree(panGaussMatrixDup);
2083
#endif
2084

2085
    return CE_None;
86✔
2086
}
2087

2088
/************************************************************************/
2089
/*                      GDALResampleChunk_Mode()                        */
2090
/************************************************************************/
2091

2092
/** Equality predicate used by mode resampling to decide whether two samples
 *  are the same value. Generic version: plain operator==.
 */
template <class T> static inline bool IsSame(T a, T b)
{
    const bool bEqual = (a == b);
    return bEqual;
}

/** float version: two NaN values are also considered the same. */
template <> bool IsSame<float>(float a, float b)
{
    if (a == b)
        return true;
    return std::isnan(a) && std::isnan(b);
}

/** double version: two NaN values are also considered the same. */
template <> bool IsSame<double>(double a, double b)
{
    if (a == b)
        return true;
    return std::isnan(a) && std::isnan(b);
}

/** complex<float> version: besides regular equality, two values whose real
 *  and imaginary parts are all NaN are considered the same.
 */
template <>
bool IsSame<std::complex<float>>(std::complex<float> a, std::complex<float> b)
{
    if (a == b)
        return true;
    const bool bFirstAllNaN = std::isnan(a.real()) && std::isnan(a.imag());
    const bool bSecondAllNaN = std::isnan(b.real()) && std::isnan(b.imag());
    return bFirstAllNaN && bSecondAllNaN;
}

/** complex<double> version: besides regular equality, two values whose real
 *  and imaginary parts are all NaN are considered the same.
 */
template <>
bool IsSame<std::complex<double>>(std::complex<double> a,
                                  std::complex<double> b)
{
    if (a == b)
        return true;
    const bool bFirstAllNaN = std::isnan(a.real()) && std::isnan(a.imag());
    const bool bSecondAllNaN = std::isnan(b.real()) && std::isnan(b.imag());
    return bFirstAllNaN && bSecondAllNaN;
}
2121

2122
template <class T>
2123
static CPLErr GDALResampleChunk_ModeT(const GDALOverviewResampleArgs &args,
136✔
2124
                                      const T *pChunk, T *const pDstBuffer)
2125

2126
{
2127
    const double dfXRatioDstToSrc = args.dfXRatioDstToSrc;
136✔
2128
    const double dfYRatioDstToSrc = args.dfYRatioDstToSrc;
136✔
2129
    const double dfSrcXDelta = args.dfSrcXDelta;
136✔
2130
    const double dfSrcYDelta = args.dfSrcYDelta;
136✔
2131
    const GByte *pabyChunkNodataMask = args.pabyChunkNodataMask;
136✔
2132
    const int nChunkXOff = args.nChunkXOff;
136✔
2133
    const int nChunkXSize = args.nChunkXSize;
136✔
2134
    const int nChunkYOff = args.nChunkYOff;
136✔
2135
    const int nChunkYSize = args.nChunkYSize;
136✔
2136
    const int nDstXOff = args.nDstXOff;
136✔
2137
    const int nDstXOff2 = args.nDstXOff2;
136✔
2138
    const int nDstYOff = args.nDstYOff;
136✔
2139
    const int nDstYOff2 = args.nDstYOff2;
136✔
2140
    const bool bHasNoData = args.bHasNoData;
136✔
2141
    const GDALColorTable *poColorTable = args.poColorTable;
136✔
2142
    const int nDstXSize = nDstXOff2 - nDstXOff;
136✔
2143

2144
    T tNoDataValue;
8✔
2145
    if constexpr (std::is_same<T, std::complex<float>>::value ||
2146
                  std::is_same<T, std::complex<double>>::value)
2147
    {
2148
        using BaseT = typename T::value_type;
2149
        tNoDataValue =
8✔
2150
            std::complex<BaseT>(std::numeric_limits<BaseT>::quiet_NaN(),
2151
                                std::numeric_limits<BaseT>::quiet_NaN());
2152
    }
2153
    else if (!bHasNoData || !GDALIsValueInRange<T>(args.dfNoDataValue))
128✔
2154
        tNoDataValue = 0;
127✔
2155
    else
2156
        tNoDataValue = static_cast<T>(args.dfNoDataValue);
1✔
2157

2158
    size_t nMaxNumPx = 0;
136✔
2159
    T *paVals = nullptr;
136✔
2160
    int *panSums = nullptr;
136✔
2161

2162
    const int nChunkRightXOff = nChunkXOff + nChunkXSize;
136✔
2163
    const int nChunkBottomYOff = nChunkYOff + nChunkYSize;
136✔
2164
    std::vector<int> anVals(256, 0);
272✔
2165

2166
    /* ==================================================================== */
2167
    /*      Loop over destination scanlines.                                */
2168
    /* ==================================================================== */
2169
    for (int iDstLine = nDstYOff; iDstLine < nDstYOff2; ++iDstLine)
7,531✔
2170
    {
2171
        double dfSrcYOff = dfSrcYDelta + iDstLine * dfYRatioDstToSrc;
7,395✔
2172
        int nSrcYOff = static_cast<int>(dfSrcYOff + 1e-8);
7,395✔
2173
#ifdef only_pixels_with_more_than_10_pct_participation
2174
        // When oversampling, don't take into account pixels that have a tiny
2175
        // participation in the resulting pixel
2176
        if (dfYRatioDstToSrc > 1 && dfSrcYOff - nSrcYOff > 0.9 &&
2177
            nSrcYOff < nChunkBottomYOff)
2178
            nSrcYOff++;
2179
#endif
2180
        if (nSrcYOff < nChunkYOff)
7,395✔
2181
            nSrcYOff = nChunkYOff;
×
2182

2183
        double dfSrcYOff2 = dfSrcYDelta + (iDstLine + 1) * dfYRatioDstToSrc;
7,395✔
2184
        int nSrcYOff2 = static_cast<int>(ceil(dfSrcYOff2 - 1e-8));
7,395✔
2185
#ifdef only_pixels_with_more_than_10_pct_participation
2186
        // When oversampling, don't take into account pixels that have a tiny
2187
        // participation in the resulting pixel
2188
        if (dfYRatioDstToSrc > 1 && nSrcYOff2 - dfSrcYOff2 > 0.9 &&
2189
            nSrcYOff2 > nChunkYOff)
2190
            nSrcYOff2--;
2191
#endif
2192
        if (nSrcYOff2 == nSrcYOff)
7,395✔
2193
            ++nSrcYOff2;
×
2194
        if (nSrcYOff2 > nChunkBottomYOff)
7,395✔
2195
            nSrcYOff2 = nChunkBottomYOff;
×
2196

2197
        const T *const paSrcScanline =
7,395✔
2198
            pChunk +
149✔
2199
            (static_cast<GPtrDiff_t>(nSrcYOff - nChunkYOff) * nChunkXSize);
7,395✔
2200
        const GByte *pabySrcScanlineNodataMask = nullptr;
7,395✔
2201
        if (pabyChunkNodataMask != nullptr)
7,395✔
2202
            pabySrcScanlineNodataMask =
1,810✔
2203
                pabyChunkNodataMask +
2204
                static_cast<GPtrDiff_t>(nSrcYOff - nChunkYOff) * nChunkXSize;
1,810✔
2205

2206
        T *const paDstScanline = pDstBuffer + (iDstLine - nDstYOff) * nDstXSize;
7,395✔
2207
        /* --------------------------------------------------------------------
2208
         */
2209
        /*      Loop over destination pixels */
2210
        /* --------------------------------------------------------------------
2211
         */
2212
        for (int iDstPixel = nDstXOff; iDstPixel < nDstXOff2; ++iDstPixel)
4,259,580✔
2213
        {
2214
            double dfSrcXOff = dfSrcXDelta + iDstPixel * dfXRatioDstToSrc;
4,252,187✔
2215
            // Apply some epsilon to avoid numerical precision issues
2216
            int nSrcXOff = static_cast<int>(dfSrcXOff + 1e-8);
4,252,187✔
2217
#ifdef only_pixels_with_more_than_10_pct_participation
2218
            // When oversampling, don't take into account pixels that have a
2219
            // tiny participation in the resulting pixel
2220
            if (dfXRatioDstToSrc > 1 && dfSrcXOff - nSrcXOff > 0.9 &&
2221
                nSrcXOff < nChunkRightXOff)
2222
                nSrcXOff++;
2223
#endif
2224
            if (nSrcXOff < nChunkXOff)
4,252,187✔
2225
                nSrcXOff = nChunkXOff;
×
2226

2227
            double dfSrcXOff2 =
4,252,187✔
2228
                dfSrcXDelta + (iDstPixel + 1) * dfXRatioDstToSrc;
4,252,187✔
2229
            int nSrcXOff2 = static_cast<int>(ceil(dfSrcXOff2 - 1e-8));
4,252,187✔
2230
#ifdef only_pixels_with_more_than_10_pct_participation
2231
            // When oversampling, don't take into account pixels that have a
2232
            // tiny participation in the resulting pixel
2233
            if (dfXRatioDstToSrc > 1 && nSrcXOff2 - dfSrcXOff2 > 0.9 &&
2234
                nSrcXOff2 > nChunkXOff)
2235
                nSrcXOff2--;
2236
#endif
2237
            if (nSrcXOff2 == nSrcXOff)
4,252,187✔
2238
                nSrcXOff2++;
×
2239
            if (nSrcXOff2 > nChunkRightXOff)
4,252,187✔
2240
                nSrcXOff2 = nChunkRightXOff;
×
2241

2242
            bool bRegularProcessing = false;
4,252,187✔
2243
            if constexpr (!std::is_same<T, GByte>::value)
2244
                bRegularProcessing = true;
827✔
2245
            else if (poColorTable && poColorTable->GetColorEntryCount() > 256)
4,251,360✔
2246
                bRegularProcessing = true;
×
2247

2248
            if (bRegularProcessing)
4,252,187✔
2249
            {
2250
                // Not sure how much sense it makes to run a majority
2251
                // filter on floating point data, but here it is for the sake
2252
                // of compatibility. It won't look right on RGB images by the
2253
                // nature of the filter.
2254

2255
                if (nSrcYOff2 - nSrcYOff <= 0 || nSrcXOff2 - nSrcXOff <= 0 ||
827✔
2256
                    nSrcYOff2 - nSrcYOff > INT_MAX / (nSrcXOff2 - nSrcXOff) ||
2,481✔
2257
                    static_cast<size_t>(nSrcYOff2 - nSrcYOff) *
827✔
2258
                            static_cast<size_t>(nSrcXOff2 - nSrcXOff) >
827✔
2259
                        std::numeric_limits<size_t>::max() / sizeof(float))
827✔
2260
                {
2261
                    CPLError(CE_Failure, CPLE_NotSupported,
×
2262
                             "Too big downsampling factor");
2263
                    CPLFree(paVals);
×
2264
                    CPLFree(panSums);
×
2265
                    return CE_Failure;
×
2266
                }
2267
                const size_t nNumPx =
827✔
2268
                    static_cast<size_t>(nSrcYOff2 - nSrcYOff) *
827✔
2269
                    static_cast<size_t>(nSrcXOff2 - nSrcXOff);
827✔
2270
                size_t iMaxInd = 0;
827✔
2271
                size_t iMaxVal = 0;
827✔
2272
                bool biMaxValdValid = false;
827✔
2273

2274
                if (paVals == nullptr || nNumPx > nMaxNumPx)
827✔
2275
                {
2276
                    T *paValsNew = static_cast<T *>(
2277
                        VSI_REALLOC_VERBOSE(paVals, nNumPx * sizeof(T)));
71✔
2278
                    int *panSumsNew = static_cast<int *>(
2279
                        VSI_REALLOC_VERBOSE(panSums, nNumPx * sizeof(int)));
71✔
2280
                    if (paValsNew != nullptr)
71✔
2281
                        paVals = paValsNew;
71✔
2282
                    if (panSumsNew != nullptr)
71✔
2283
                        panSums = panSumsNew;
71✔
2284
                    if (paValsNew == nullptr || panSumsNew == nullptr)
71✔
2285
                    {
2286
                        CPLFree(paVals);
×
2287
                        CPLFree(panSums);
×
2288
                        return CE_Failure;
×
2289
                    }
2290
                    nMaxNumPx = nNumPx;
71✔
2291
                }
2292

2293
                for (int iY = nSrcYOff; iY < nSrcYOff2; ++iY)
2,585✔
2294
                {
2295
                    const GPtrDiff_t iTotYOff =
1,758✔
2296
                        static_cast<GPtrDiff_t>(iY - nSrcYOff) * nChunkXSize -
1,758✔
2297
                        nChunkXOff;
1,758✔
2298
                    for (int iX = nSrcXOff; iX < nSrcXOff2; ++iX)
5,690✔
2299
                    {
2300
                        if (pabySrcScanlineNodataMask == nullptr ||
3,932✔
2301
                            pabySrcScanlineNodataMask[iX + iTotYOff])
16✔
2302
                        {
2303
                            const T val = paSrcScanline[iX + iTotYOff];
3,917✔
2304
                            size_t i = 0;  // Used after for.
3,917✔
2305

2306
                            // Check array for existing entry.
2307
                            for (; i < iMaxInd; ++i)
14,387✔
2308
                                if (IsSame(paVals[i], val) &&
17,626✔
2309
                                    ++panSums[i] > panSums[iMaxVal])
6,910✔
2310
                                {
2311
                                    iMaxVal = i;
246✔
2312
                                    biMaxValdValid = true;
246✔
2313
                                    break;
246✔
2314
                                }
2315

2316
                            // Add to arr if entry not already there.
2317
                            if (i == iMaxInd)
3,917✔
2318
                            {
2319
                                paVals[iMaxInd] = val;
3,671✔
2320
                                panSums[iMaxInd] = 1;
3,671✔
2321

2322
                                if (!biMaxValdValid)
3,671✔
2323
                                {
2324
                                    iMaxVal = iMaxInd;
824✔
2325
                                    biMaxValdValid = true;
824✔
2326
                                }
2327

2328
                                ++iMaxInd;
3,671✔
2329
                            }
2330
                        }
2331
                    }
2332
                }
2333

2334
                if (!biMaxValdValid)
827✔
2335
                    paDstScanline[iDstPixel - nDstXOff] = tNoDataValue;
3✔
2336
                else
2337
                    paDstScanline[iDstPixel - nDstXOff] = paVals[iMaxVal];
824✔
2338
            }
2339
            else if constexpr (std::is_same<T, GByte>::value)
2340
            // ( eSrcDataType == GDT_Byte && nEntryCount < 256 )
2341
            {
2342
                // So we go here for a paletted or non-paletted byte band.
2343
                // The input values are then between 0 and 255.
2344
                int nMaxVal = 0;
4,251,360✔
2345
                int iMaxInd = -1;
4,251,360✔
2346

2347
                // The cost of this zeroing might be high. Perhaps we should
2348
                // just use the above generic case, and go to this one if the
2349
                // number of source pixels is large enough
2350
                std::fill(anVals.begin(), anVals.end(), 0);
4,251,360✔
2351

2352
                for (int iY = nSrcYOff; iY < nSrcYOff2; ++iY)
12,777,700✔
2353
                {
2354
                    const GPtrDiff_t iTotYOff =
8,526,370✔
2355
                        static_cast<GPtrDiff_t>(iY - nSrcYOff) * nChunkXSize -
8,526,370✔
2356
                        nChunkXOff;
8,526,370✔
2357
                    for (int iX = nSrcXOff; iX < nSrcXOff2; ++iX)
25,649,400✔
2358
                    {
2359
                        const T val = paSrcScanline[iX + iTotYOff];
17,123,000✔
2360
                        if (!bHasNoData || val != tNoDataValue)
17,123,000✔
2361
                        {
2362
                            int nVal = static_cast<int>(val);
17,123,000✔
2363
                            if (++anVals[nVal] > nMaxVal)
17,123,000✔
2364
                            {
2365
                                // Sum the density.
2366
                                // Is it the most common value so far?
2367
                                iMaxInd = nVal;
17,006,300✔
2368
                                nMaxVal = anVals[nVal];
17,006,300✔
2369
                            }
2370
                        }
2371
                    }
2372
                }
2373

2374
                if (iMaxInd == -1)
4,251,360✔
2375
                    paDstScanline[iDstPixel - nDstXOff] = tNoDataValue;
×
2376
                else
2377
                    paDstScanline[iDstPixel - nDstXOff] =
4,251,360✔
2378
                        static_cast<T>(iMaxInd);
2379
            }
2380
        }
2381
    }
2382

2383
    CPLFree(paVals);
136✔
2384
    CPLFree(panSums);
136✔
2385

2386
    return CE_None;
136✔
2387
}
2388

2389
/** View the untyped chunk and destination buffers as arrays of T and run the
 *  typed mode resampler on them.
 */
template <class T>
static inline CPLErr
GDALResampleChunk_ModeAs(const GDALOverviewResampleArgs &args,
                         const void *pChunk, void *pDstBuffer)
{
    return GDALResampleChunk_ModeT(args, static_cast<const T *>(pChunk),
                                   static_cast<T *>(pDstBuffer));
}

/** Mode resampling entry point: allocates the destination buffer and
 *  dispatches on the working data type.
 *
 * @param args                 resampling parameters.
 * @param pChunk               source samples, of type args.eWrkDataType.
 * @param[out] ppDstBuffer     receives the buffer allocated here (nullptr if
 *                             the allocation failed).
 * @param[out] peDstBufferDataType receives args.eWrkDataType once the
 *                             allocation has succeeded.
 * @return the result of the typed resampler, or CE_Failure on allocation
 *         failure or unhandled data type.
 */
static CPLErr GDALResampleChunk_Mode(const GDALOverviewResampleArgs &args,
                                     const void *pChunk, void **ppDstBuffer,
                                     GDALDataType *peDstBufferDataType)
{
    const GDALDataType eWrkDT = args.eWrkDataType;

    void *const pDstBuffer = VSI_MALLOC3_VERBOSE(
        args.nDstXOff2 - args.nDstXOff, args.nDstYOff2 - args.nDstYOff,
        GDALGetDataTypeSizeBytes(eWrkDT));
    *ppDstBuffer = pDstBuffer;
    if (pDstBuffer == nullptr)
        return CE_Failure;

    CPLAssert(args.eSrcDataType == eWrkDT);

    *peDstBufferDataType = eWrkDT;

    // Mode resampling does no arithmetic on the samples, so integer types
    // are grouped by sample size and processed as the unsigned type of that
    // size. Byte has its own fast path, and floating point / complex floating
    // point types need their own comparison, hence their dedicated cases.
    switch (eWrkDT)
    {
        case GDT_Byte:
            return GDALResampleChunk_ModeAs<GByte>(args, pChunk, pDstBuffer);

        case GDT_Int8:
            return GDALResampleChunk_ModeAs<int8_t>(args, pChunk, pDstBuffer);

        case GDT_Int16:
        case GDT_UInt16:
            CPLAssert(GDALGetDataTypeSizeBytes(eWrkDT) == 2);
            return GDALResampleChunk_ModeAs<uint16_t>(args, pChunk,
                                                      pDstBuffer);

        case GDT_CInt16:
        case GDT_Int32:
        case GDT_UInt32:
            CPLAssert(GDALGetDataTypeSizeBytes(eWrkDT) == 4);
            return GDALResampleChunk_ModeAs<uint32_t>(args, pChunk,
                                                      pDstBuffer);

        case GDT_Float32:
            CPLAssert(GDALGetDataTypeSizeBytes(eWrkDT) == 4);
            return GDALResampleChunk_ModeAs<float>(args, pChunk, pDstBuffer);

        case GDT_CInt32:
        case GDT_Int64:
        case GDT_UInt64:
            CPLAssert(GDALGetDataTypeSizeBytes(eWrkDT) == 8);
            return GDALResampleChunk_ModeAs<uint64_t>(args, pChunk,
                                                      pDstBuffer);

        case GDT_Float64:
            CPLAssert(GDALGetDataTypeSizeBytes(eWrkDT) == 8);
            return GDALResampleChunk_ModeAs<double>(args, pChunk, pDstBuffer);

        case GDT_CFloat32:
            return GDALResampleChunk_ModeAs<std::complex<float>>(args, pChunk,
                                                                 pDstBuffer);

        case GDT_CFloat64:
            return GDALResampleChunk_ModeAs<std::complex<double>>(
                args, pChunk, pDstBuffer);

        case GDT_Unknown:
        case GDT_TypeCount:
            break;
    }

    // Not a valid working data type.
    CPLAssert(false);
    return CE_Failure;
}
2490

2491
/************************************************************************/
2492
/*                  GDALResampleConvolutionHorizontal()                 */
2493
/************************************************************************/
2494

2495
/** Dot product of nSrcPixelCount consecutive samples of pChunk with
 *  padfWeights, computed in double precision.
 *
 * Samples are consumed four at a time into two partial sums (the first two of
 * each group go to one sum, the last two to the other); the remaining samples
 * are added to the first partial sum.
 *
 * @return the sum of the two partial sums.
 */
template <class T>
static inline double
GDALResampleConvolutionHorizontal(const T *pChunk, const double *padfWeights,
                                  int nSrcPixelCount)
{
    double dfSumA = 0.0;
    double dfSumB = 0.0;
    int iTap = 0;  // Shared by both loops.
    // Intel Compiler 2024.0.2.29 (maybe other versions?) crashes on this
    // manually (untypical) unrolled loop in -O2 and -O3:
    // https://github.com/OSGeo/gdal/issues/9508
#if !defined(__INTEL_CLANG_COMPILER)
    while (nSrcPixelCount - iTap >= 4)
    {
        const T *const pSrc = pChunk + iTap;
        const double *const pdfW = padfWeights + iTap;
        dfSumA += pSrc[0] * pdfW[0];
        dfSumA += pSrc[1] * pdfW[1];
        dfSumB += pSrc[2] * pdfW[2];
        dfSumB += pSrc[3] * pdfW[3];
        iTap += 4;
    }
#endif
    while (iTap < nSrcPixelCount)
    {
        dfSumA += pChunk[iTap] * padfWeights[iTap];
        ++iTap;
    }
    return dfSumA + dfSumB;
}
2521

2522
template <class T>
2523
static inline void GDALResampleConvolutionHorizontalWithMask(
48✔
2524
    const T *pChunk, const GByte *pabyMask, const double *padfWeights,
2525
    int nSrcPixelCount, double &dfVal, double &dfWeightSum)
2526
{
2527
    dfVal = 0;
48✔
2528
    dfWeightSum = 0;
48✔
2529
    int i = 0;
48✔
2530
    for (; i + 3 < nSrcPixelCount; i += 4)
48✔
2531
    {
2532
        const double dfWeight0 = padfWeights[i] * pabyMask[i];
×
2533
        const double dfWeight1 = padfWeights[i + 1] * pabyMask[i + 1];
×
2534
        const double dfWeight2 = padfWeights[i + 2] * pabyMask[i + 2];
×
2535
        const double dfWeight3 = padfWeights[i + 3] * pabyMask[i + 3];
×
2536
        dfVal += pChunk[i] * dfWeight0;
×
2537
        dfVal += pChunk[i + 1] * dfWeight1;
×
2538
        dfVal += pChunk[i + 2] * dfWeight2;
×
2539
        dfVal += pChunk[i + 3] * dfWeight3;
×
2540
        dfWeightSum += dfWeight0 + dfWeight1 + dfWeight2 + dfWeight3;
×
2541
    }
2542
    for (; i < nSrcPixelCount; ++i)
178✔
2543
    {
2544
        const double dfWeight = padfWeights[i] * pabyMask[i];
130✔
2545
        dfVal += pChunk[i] * dfWeight;
130✔
2546
        dfWeightSum += dfWeight;
130✔
2547
    }
2548
}
48✔
2549

2550
/** Same weighted sum as the single-row horizontal convolution, applied to
 *  three rows at once with a shared set of weights.
 *
 * Each row uses two partial sums: "Lo" receives the first two samples of each
 * group of four plus the trailing samples, "Hi" receives the last two samples
 * of each group.
 *
 * @param[out] dfRes1 weighted sum for pChunkRow1.
 * @param[out] dfRes2 weighted sum for pChunkRow2.
 * @param[out] dfRes3 weighted sum for pChunkRow3.
 */
template <class T>
static inline void GDALResampleConvolutionHorizontal_3rows(
    const T *pChunkRow1, const T *pChunkRow2, const T *pChunkRow3,
    const double *padfWeights, int nSrcPixelCount, double &dfRes1,
    double &dfRes2, double &dfRes3)
{
    double dfRow1Lo = 0.0;
    double dfRow1Hi = 0.0;
    double dfRow2Lo = 0.0;
    double dfRow2Hi = 0.0;
    double dfRow3Lo = 0.0;
    double dfRow3Hi = 0.0;
    int iTap = 0;  // Shared by both loops.
    while (nSrcPixelCount - iTap >= 4)
    {
        const double dfW0 = padfWeights[iTap];
        const double dfW1 = padfWeights[iTap + 1];
        const double dfW2 = padfWeights[iTap + 2];
        const double dfW3 = padfWeights[iTap + 3];

        dfRow1Lo += pChunkRow1[iTap] * dfW0;
        dfRow1Lo += pChunkRow1[iTap + 1] * dfW1;
        dfRow1Hi += pChunkRow1[iTap + 2] * dfW2;
        dfRow1Hi += pChunkRow1[iTap + 3] * dfW3;

        dfRow2Lo += pChunkRow2[iTap] * dfW0;
        dfRow2Lo += pChunkRow2[iTap + 1] * dfW1;
        dfRow2Hi += pChunkRow2[iTap + 2] * dfW2;
        dfRow2Hi += pChunkRow2[iTap + 3] * dfW3;

        dfRow3Lo += pChunkRow3[iTap] * dfW0;
        dfRow3Lo += pChunkRow3[iTap + 1] * dfW1;
        dfRow3Hi += pChunkRow3[iTap + 2] * dfW2;
        dfRow3Hi += pChunkRow3[iTap + 3] * dfW3;

        iTap += 4;
    }
    while (iTap < nSrcPixelCount)
    {
        const double dfW = padfWeights[iTap];
        dfRow1Lo += pChunkRow1[iTap] * dfW;
        dfRow2Lo += pChunkRow2[iTap] * dfW;
        dfRow3Lo += pChunkRow3[iTap] * dfW;
        ++iTap;
    }
    dfRes1 = dfRow1Lo + dfRow1Hi;
    dfRes2 = dfRow2Lo + dfRow2Hi;
    dfRes3 = dfRow3Lo + dfRow3Hi;
}
1,330,334✔
2588

2589
/** Generic implementation of the three-row horizontal convolution entry
 *  point named "PixelCountLess8": it simply forwards all its arguments,
 *  unchanged, to GDALResampleConvolutionHorizontal_3rows().
 */
template <class T>
static inline void GDALResampleConvolutionHorizontalPixelCountLess8_3rows(
    const T *pChunkRow1, const T *pChunkRow2, const T *pChunkRow3,
    const double *padfWeights, int nSrcPixelCount, double &dfRes1,
    double &dfRes2, double &dfRes3)
{
    // No dedicated code here: the general routine copes with any count.
    GDALResampleConvolutionHorizontal_3rows(
        pChunkRow1, pChunkRow2, pChunkRow3,  // the three source rows
        padfWeights, nSrcPixelCount,         // shared weights
        dfRes1, dfRes2, dfRes3);             // one result per row
}
18,188✔
2599

2600
/** Generic implementation of the three-row horizontal convolution for
 *  exactly four source pixels: forwards to
 *  GDALResampleConvolutionHorizontal_3rows() with a pixel count of 4.
 */
template <class T>
static inline void GDALResampleConvolutionHorizontalPixelCount4_3rows(
    const T *pChunkRow1, const T *pChunkRow2, const T *pChunkRow3,
    const double *padfWeights, double &dfRes1, double &dfRes2, double &dfRes3)
{
    // The pixel count is fixed by this entry point.
    constexpr int nFixedSrcPixelCount = 4;
    GDALResampleConvolutionHorizontal_3rows(
        pChunkRow1, pChunkRow2, pChunkRow3,  // the three source rows
        padfWeights, nFixedSrcPixelCount,    // shared weights
        dfRes1, dfRes2, dfRes3);             // one result per row
}
1,247,346✔
2609

2610
/************************************************************************/
2611
/*                  GDALResampleConvolutionVertical()                   */
2612
/************************************************************************/
2613

2614
/** Weighted sum of nSrcLineCount samples taken every nStride elements from
 *  pChunk (i.e. down one column), computed in double precision.
 *
 * Lines are consumed four at a time into two partial sums (the first two
 * lines of each group go to one sum, the last two to the other); the
 * remaining lines are added to the first partial sum.
 *
 * @return the sum of the two partial sums.
 */
template <class T>
static inline double
GDALResampleConvolutionVertical(const T *pChunk, int nStride,
                                const double *padfWeights, int nSrcLineCount)
{
    double dfSumA = 0.0;
    double dfSumB = 0.0;
    int iLine = 0;       // Index in padfWeights.
    int nSrcOffset = 0;  // Matching element offset in pChunk.
    while (nSrcLineCount - iLine >= 4)
    {
        dfSumA += pChunk[nSrcOffset] * padfWeights[iLine];
        dfSumA += pChunk[nSrcOffset + nStride] * padfWeights[iLine + 1];
        dfSumB += pChunk[nSrcOffset + 2 * nStride] * padfWeights[iLine + 2];
        dfSumB += pChunk[nSrcOffset + 3 * nStride] * padfWeights[iLine + 3];
        iLine += 4;
        nSrcOffset += 4 * nStride;
    }
    while (iLine < nSrcLineCount)
    {
        dfSumA += pChunk[nSrcOffset] * padfWeights[iLine];
        ++iLine;
        nSrcOffset += nStride;
    }
    return dfSumA + dfSumB;
}
2636

2637
/** Vertical weighted sum for two adjacent columns at once: the column that
 *  starts at pChunk[0] and the one that starts at pChunk[1], both sampled
 *  every nStride elements with the same weights.
 *
 * Each column uses two partial sums, fed the same way as in the
 * single-column version (lines 0-1 of each group of four plus the trailing
 * lines in "Lo", lines 2-3 in "Hi").
 *
 * @param[out] dfRes1 weighted sum for the first column.
 * @param[out] dfRes2 weighted sum for the second column.
 */
template <class T>
static inline void GDALResampleConvolutionVertical_2cols(
    const T *pChunk, int nStride, const double *padfWeights, int nSrcLineCount,
    double &dfRes1, double &dfRes2)
{
    double dfCol1Lo = 0.0;
    double dfCol1Hi = 0.0;
    double dfCol2Lo = 0.0;
    double dfCol2Hi = 0.0;
    int iLine = 0;       // Index in padfWeights.
    int nSrcOffset = 0;  // Matching element offset in pChunk.
    while (nSrcLineCount - iLine >= 4)
    {
        const double dfW0 = padfWeights[iLine];
        const double dfW1 = padfWeights[iLine + 1];
        const double dfW2 = padfWeights[iLine + 2];
        const double dfW3 = padfWeights[iLine + 3];

        dfCol1Lo += pChunk[nSrcOffset] * dfW0;
        dfCol2Lo += pChunk[nSrcOffset + 1] * dfW0;
        dfCol1Lo += pChunk[nSrcOffset + nStride] * dfW1;
        dfCol2Lo += pChunk[nSrcOffset + 1 + nStride] * dfW1;
        dfCol1Hi += pChunk[nSrcOffset + 2 * nStride] * dfW2;
        dfCol2Hi += pChunk[nSrcOffset + 1 + 2 * nStride] * dfW2;
        dfCol1Hi += pChunk[nSrcOffset + 3 * nStride] * dfW3;
        dfCol2Hi += pChunk[nSrcOffset + 1 + 3 * nStride] * dfW3;

        iLine += 4;
        nSrcOffset += 4 * nStride;
    }
    while (iLine < nSrcLineCount)
    {
        const double dfW = padfWeights[iLine];
        dfCol1Lo += pChunk[nSrcOffset] * dfW;
        dfCol2Lo += pChunk[nSrcOffset + 1] * dfW;
        ++iLine;
        nSrcOffset += nStride;
    }
    dfRes1 = dfCol1Lo + dfCol1Hi;
    dfRes2 = dfCol2Lo + dfCol2Hi;
}
2,880,000✔
2667

2668
#ifdef USE_SSE2
2669

2670
#ifdef __AVX__
2671
/************************************************************************/
2672
/*             GDALResampleConvolutionVertical_16cols<T>                */
2673
/************************************************************************/
2674

2675
template <class T>
2676
static inline void
2677
GDALResampleConvolutionVertical_16cols(const T *pChunk, int nStride,
2678
                                       const double *padfWeights,
2679
                                       int nSrcLineCount, float *afDest)
2680
{
2681
    int i = 0;
2682
    int j = 0;
2683
    XMMReg4Double v_acc0 = XMMReg4Double::Zero();
2684
    XMMReg4Double v_acc1 = XMMReg4Double::Zero();
2685
    XMMReg4Double v_acc2 = XMMReg4Double::Zero();
2686
    XMMReg4Double v_acc3 = XMMReg4Double::Zero();
2687
    for (; i + 3 < nSrcLineCount; i += 4, j += 4 * nStride)
2688
    {
2689
        XMMReg4Double w0 =
2690
            XMMReg4Double::Load1ValHighAndLow(padfWeights + i + 0);
2691
        XMMReg4Double w1 =
2692
            XMMReg4Double::Load1ValHighAndLow(padfWeights + i + 1);
2693
        XMMReg4Double w2 =
2694
            XMMReg4Double::Load1ValHighAndLow(padfWeights + i + 2);
2695
        XMMReg4Double w3 =
2696
            XMMReg4Double::Load1ValHighAndLow(padfWeights + i + 3);
2697
        v_acc0 += XMMReg4Double::Load4Val(pChunk + j + 0 + 0 * nStride) * w0;
2698
        v_acc1 += XMMReg4Double::Load4Val(pChunk + j + 4 + 0 * nStride) * w0;
2699
        v_acc2 += XMMReg4Double::Load4Val(pChunk + j + 8 + 0 * nStride) * w0;
2700
        v_acc3 += XMMReg4Double::Load4Val(pChunk + j + 12 + 0 * nStride) * w0;
2701
        v_acc0 += XMMReg4Double::Load4Val(pChunk + j + 0 + 1 * nStride) * w1;
2702
        v_acc1 += XMMReg4Double::Load4Val(pChunk + j + 4 + 1 * nStride) * w1;
2703
        v_acc2 += XMMReg4Double::Load4Val(pChunk + j + 8 + 1 * nStride) * w1;
2704
        v_acc3 += XMMReg4Double::Load4Val(pChunk + j + 12 + 1 * nStride) * w1;
2705
        v_acc0 += XMMReg4Double::Load4Val(pChunk + j + 0 + 2 * nStride) * w2;
2706
        v_acc1 += XMMReg4Double::Load4Val(pChunk + j + 4 + 2 * nStride) * w2;
2707
        v_acc2 += XMMReg4Double::Load4Val(pChunk + j + 8 + 2 * nStride) * w2;
2708
        v_acc3 += XMMReg4Double::Load4Val(pChunk + j + 12 + 2 * nStride) * w2;
2709
        v_acc0 += XMMReg4Double::Load4Val(pChunk + j + 0 + 3 * nStride) * w3;
2710
        v_acc1 += XMMReg4Double::Load4Val(pChunk + j + 4 + 3 * nStride) * w3;
2711
        v_acc2 += XMMReg4Double::Load4Val(pChunk + j + 8 + 3 * nStride) * w3;
2712
        v_acc3 += XMMReg4Double::Load4Val(pChunk + j + 12 + 3 * nStride) * w3;
2713
    }
2714
    for (; i < nSrcLineCount; ++i, j += nStride)
2715
    {
2716
        XMMReg4Double w = XMMReg4Double::Load1ValHighAndLow(padfWeights + i);
2717
        v_acc0 += XMMReg4Double::Load4Val(pChunk + j + 0) * w;
2718
        v_acc1 += XMMReg4Double::Load4Val(pChunk + j + 4) * w;
2719
        v_acc2 += XMMReg4Double::Load4Val(pChunk + j + 8) * w;
2720
        v_acc3 += XMMReg4Double::Load4Val(pChunk + j + 12) * w;
2721
    }
2722
    v_acc0.Store4Val(afDest);
2723
    v_acc1.Store4Val(afDest + 4);
2724
    v_acc2.Store4Val(afDest + 8);
2725
    v_acc3.Store4Val(afDest + 12);
2726
}
2727

2728
template <class T>
2729
static inline void GDALResampleConvolutionVertical_16cols(const T *, int,
2730
                                                          const double *, int,
2731
                                                          double *)
2732
{
2733
    // Cannot be reached
2734
    CPLAssert(false);
2735
}
2736

2737
#else
2738

2739
/************************************************************************/
2740
/*              GDALResampleConvolutionVertical_8cols<T>                */
2741
/************************************************************************/
2742

2743
template <class T>
2744
static inline void
2745
GDALResampleConvolutionVertical_8cols(const T *pChunk, int nStride,
18,641,800✔
2746
                                      const double *padfWeights,
2747
                                      int nSrcLineCount, float *afDest)
2748
{
2749
    int i = 0;
18,641,800✔
2750
    int j = 0;
18,641,800✔
2751
    XMMReg4Double v_acc0 = XMMReg4Double::Zero();
18,641,800✔
2752
    XMMReg4Double v_acc1 = XMMReg4Double::Zero();
18,601,100✔
2753
    for (; i + 3 < nSrcLineCount; i += 4, j += 4 * nStride)
33,848,500✔
2754
    {
2755
        XMMReg4Double w0 =
15,229,700✔
2756
            XMMReg4Double::Load1ValHighAndLow(padfWeights + i + 0);
15,229,700✔
2757
        XMMReg4Double w1 =
15,198,800✔
2758
            XMMReg4Double::Load1ValHighAndLow(padfWeights + i + 1);
15,198,800✔
2759
        XMMReg4Double w2 =
15,223,800✔
2760
            XMMReg4Double::Load1ValHighAndLow(padfWeights + i + 2);
15,223,800✔
2761
        XMMReg4Double w3 =
15,221,900✔
2762
            XMMReg4Double::Load1ValHighAndLow(padfWeights + i + 3);
15,221,900✔
2763
        v_acc0 += XMMReg4Double::Load4Val(pChunk + j + 0 + 0 * nStride) * w0;
15,229,100✔
2764
        v_acc1 += XMMReg4Double::Load4Val(pChunk + j + 4 + 0 * nStride) * w0;
15,180,800✔
2765
        v_acc0 += XMMReg4Double::Load4Val(pChunk + j + 0 + 1 * nStride) * w1;
15,199,000✔
2766
        v_acc1 += XMMReg4Double::Load4Val(pChunk + j + 4 + 1 * nStride) * w1;
15,164,800✔
2767
        v_acc0 += XMMReg4Double::Load4Val(pChunk + j + 0 + 2 * nStride) * w2;
15,193,100✔
2768
        v_acc1 += XMMReg4Double::Load4Val(pChunk + j + 4 + 2 * nStride) * w2;
15,199,400✔
2769
        v_acc0 += XMMReg4Double::Load4Val(pChunk + j + 0 + 3 * nStride) * w3;
15,152,800✔
2770
        v_acc1 += XMMReg4Double::Load4Val(pChunk + j + 4 + 3 * nStride) * w3;
15,196,100✔
2771
    }
2772
    for (; i < nSrcLineCount; ++i, j += nStride)
30,026,000✔
2773
    {
2774
        XMMReg4Double w = XMMReg4Double::Load1ValHighAndLow(padfWeights + i);
11,407,100✔
2775
        v_acc0 += XMMReg4Double::Load4Val(pChunk + j + 0) * w;
11,407,100✔
2776
        v_acc1 += XMMReg4Double::Load4Val(pChunk + j + 4) * w;
11,407,100✔
2777
    }
2778
    v_acc0.Store4Val(afDest);
18,618,900✔
2779
    v_acc1.Store4Val(afDest + 4);
18,614,300✔
2780
}
18,648,600✔
2781

2782
template <class T>
2783
static inline void GDALResampleConvolutionVertical_8cols(const T *, int,
2784
                                                         const double *, int,
2785
                                                         double *)
2786
{
2787
    // Cannot be reached
2788
    CPLAssert(false);
2789
}
2790

2791
#endif  // __AVX__
2792

2793
/************************************************************************/
2794
/*              GDALResampleConvolutionHorizontalSSE2<T>                */
2795
/************************************************************************/
2796

2797
template <class T>
2798
static inline double GDALResampleConvolutionHorizontalSSE2(
2,738,443✔
2799
    const T *pChunk, const double *padfWeightsAligned, int nSrcPixelCount)
2800
{
2801
    XMMReg4Double v_acc1 = XMMReg4Double::Zero();
2,738,443✔
2802
    XMMReg4Double v_acc2 = XMMReg4Double::Zero();
2,738,148✔
2803
    int i = 0;  // Used after for.
2,738,215✔
2804
    for (; i + 7 < nSrcPixelCount; i += 8)
2,814,109✔
2805
    {
2806
        // Retrieve the pixel & accumulate
2807
        const XMMReg4Double v_pixels1 = XMMReg4Double::Load4Val(pChunk + i);
75,831✔
2808
        const XMMReg4Double v_pixels2 = XMMReg4Double::Load4Val(pChunk + i + 4);
75,831✔
2809
        const XMMReg4Double v_weight1 =
75,831✔
2810
            XMMReg4Double::Load4ValAligned(padfWeightsAligned + i);
75,831✔
2811
        const XMMReg4Double v_weight2 =
75,831✔
2812
            XMMReg4Double::Load4ValAligned(padfWeightsAligned + i + 4);
75,831✔
2813

2814
        v_acc1 += v_pixels1 * v_weight1;
75,831✔
2815
        v_acc2 += v_pixels2 * v_weight2;
75,831✔
2816
    }
2817

2818
    v_acc1 += v_acc2;
2,738,277✔
2819

2820
    double dfVal = v_acc1.GetHorizSum();
2,738,188✔
2821
    for (; i < nSrcPixelCount; ++i)
9,509,780✔
2822
    {
2823
        dfVal += pChunk[i] * padfWeightsAligned[i];
6,771,660✔
2824
    }
2825
    return dfVal;
2,738,123✔
2826
}
2827

2828
/************************************************************************/
2829
/*              GDALResampleConvolutionHorizontal<GByte>                */
2830
/************************************************************************/
2831

2832
template <>
2833
inline double GDALResampleConvolutionHorizontal<GByte>(
2,189,670✔
2834
    const GByte *pChunk, const double *padfWeightsAligned, int nSrcPixelCount)
2835
{
2836
    return GDALResampleConvolutionHorizontalSSE2(pChunk, padfWeightsAligned,
2,189,670✔
2837
                                                 nSrcPixelCount);
2,189,660✔
2838
}
2839

2840
template <>
2841
inline double GDALResampleConvolutionHorizontal<GUInt16>(
548,677✔
2842
    const GUInt16 *pChunk, const double *padfWeightsAligned, int nSrcPixelCount)
2843
{
2844
    return GDALResampleConvolutionHorizontalSSE2(pChunk, padfWeightsAligned,
548,677✔
2845
                                                 nSrcPixelCount);
548,814✔
2846
}
2847

2848
/************************************************************************/
2849
/*              GDALResampleConvolutionHorizontalWithMaskSSE2<T>        */
2850
/************************************************************************/
2851

2852
template <class T>
2853
static inline void GDALResampleConvolutionHorizontalWithMaskSSE2(
5,736,213✔
2854
    const T *pChunk, const GByte *pabyMask, const double *padfWeightsAligned,
2855
    int nSrcPixelCount, double &dfVal, double &dfWeightSum)
2856
{
2857
    int i = 0;  // Used after for.
5,736,213✔
2858
    XMMReg4Double v_acc = XMMReg4Double::Zero();
5,736,213✔
2859
    XMMReg4Double v_acc_weight = XMMReg4Double::Zero();
5,736,213✔
2860
    for (; i + 3 < nSrcPixelCount; i += 4)
16,247,021✔
2861
    {
2862
        const XMMReg4Double v_pixels = XMMReg4Double::Load4Val(pChunk + i);
10,510,858✔
2863
        const XMMReg4Double v_mask = XMMReg4Double::Load4Val(pabyMask + i);
10,510,858✔
2864
        XMMReg4Double v_weight =
10,510,858✔
2865
            XMMReg4Double::Load4ValAligned(padfWeightsAligned + i);
10,510,858✔
2866
        v_weight *= v_mask;
10,510,858✔
2867
        v_acc += v_pixels * v_weight;
10,510,858✔
2868
        v_acc_weight += v_weight;
10,510,858✔
2869
    }
2870

2871
    dfVal = v_acc.GetHorizSum();
5,736,213✔
2872
    dfWeightSum = v_acc_weight.GetHorizSum();
5,736,213✔
2873
    for (; i < nSrcPixelCount; ++i)
5,927,983✔
2874
    {
2875
        const double dfWeight = padfWeightsAligned[i] * pabyMask[i];
191,772✔
2876
        dfVal += pChunk[i] * dfWeight;
191,772✔
2877
        dfWeightSum += dfWeight;
191,772✔
2878
    }
2879
}
5,736,213✔
2880

2881
/************************************************************************/
2882
/*              GDALResampleConvolutionHorizontalWithMask<GByte>        */
2883
/************************************************************************/
2884

2885
template <>
2886
inline void GDALResampleConvolutionHorizontalWithMask<GByte>(
5,736,150✔
2887
    const GByte *pChunk, const GByte *pabyMask,
2888
    const double *padfWeightsAligned, int nSrcPixelCount, double &dfVal,
2889
    double &dfWeightSum)
2890
{
2891
    GDALResampleConvolutionHorizontalWithMaskSSE2(
5,736,150✔
2892
        pChunk, pabyMask, padfWeightsAligned, nSrcPixelCount, dfVal,
2893
        dfWeightSum);
2894
}
5,736,150✔
2895

2896
template <>
2897
inline void GDALResampleConvolutionHorizontalWithMask<GUInt16>(
63✔
2898
    const GUInt16 *pChunk, const GByte *pabyMask,
2899
    const double *padfWeightsAligned, int nSrcPixelCount, double &dfVal,
2900
    double &dfWeightSum)
2901
{
2902
    GDALResampleConvolutionHorizontalWithMaskSSE2(
63✔
2903
        pChunk, pabyMask, padfWeightsAligned, nSrcPixelCount, dfVal,
2904
        dfWeightSum);
2905
}
63✔
2906

2907
/************************************************************************/
2908
/*              GDALResampleConvolutionHorizontal_3rows_SSE2<T>         */
2909
/************************************************************************/
2910

2911
template <class T>
2912
static inline void GDALResampleConvolutionHorizontal_3rows_SSE2(
10,023,930✔
2913
    const T *pChunkRow1, const T *pChunkRow2, const T *pChunkRow3,
2914
    const double *padfWeightsAligned, int nSrcPixelCount, double &dfRes1,
2915
    double &dfRes2, double &dfRes3)
2916
{
2917
    XMMReg4Double v_acc1 = XMMReg4Double::Zero(),
10,023,930✔
2918
                  v_acc2 = XMMReg4Double::Zero(),
10,023,930✔
2919
                  v_acc3 = XMMReg4Double::Zero();
10,023,930✔
2920
    int i = 0;
10,023,930✔
2921
    for (; i + 7 < nSrcPixelCount; i += 8)
19,990,566✔
2922
    {
2923
        // Retrieve the pixel & accumulate.
2924
        XMMReg4Double v_pixels1 = XMMReg4Double::Load4Val(pChunkRow1 + i);
9,966,606✔
2925
        XMMReg4Double v_pixels2 = XMMReg4Double::Load4Val(pChunkRow1 + i + 4);
9,966,606✔
2926
        const XMMReg4Double v_weight1 =
9,966,606✔
2927
            XMMReg4Double::Load4ValAligned(padfWeightsAligned + i);
9,966,606✔
2928
        const XMMReg4Double v_weight2 =
9,966,606✔
2929
            XMMReg4Double::Load4ValAligned(padfWeightsAligned + i + 4);
9,966,606✔
2930

2931
        v_acc1 += v_pixels1 * v_weight1;
9,966,606✔
2932
        v_acc1 += v_pixels2 * v_weight2;
9,966,606✔
2933

2934
        v_pixels1 = XMMReg4Double::Load4Val(pChunkRow2 + i);
9,966,606✔
2935
        v_pixels2 = XMMReg4Double::Load4Val(pChunkRow2 + i + 4);
9,966,606✔
2936
        v_acc2 += v_pixels1 * v_weight1;
9,966,606✔
2937
        v_acc2 += v_pixels2 * v_weight2;
9,966,606✔
2938

2939
        v_pixels1 = XMMReg4Double::Load4Val(pChunkRow3 + i);
9,966,606✔
2940
        v_pixels2 = XMMReg4Double::Load4Val(pChunkRow3 + i + 4);
9,966,606✔
2941
        v_acc3 += v_pixels1 * v_weight1;
9,966,606✔
2942
        v_acc3 += v_pixels2 * v_weight2;
9,966,606✔
2943
    }
2944

2945
    dfRes1 = v_acc1.GetHorizSum();
10,023,930✔
2946
    dfRes2 = v_acc2.GetHorizSum();
10,023,930✔
2947
    dfRes3 = v_acc3.GetHorizSum();
10,023,930✔
2948
    for (; i < nSrcPixelCount; ++i)
21,488,226✔
2949
    {
2950
        dfRes1 += pChunkRow1[i] * padfWeightsAligned[i];
11,464,296✔
2951
        dfRes2 += pChunkRow2[i] * padfWeightsAligned[i];
11,464,296✔
2952
        dfRes3 += pChunkRow3[i] * padfWeightsAligned[i];
11,464,296✔
2953
    }
2954
}
10,023,930✔
2955

2956
/************************************************************************/
2957
/*              GDALResampleConvolutionHorizontal_3rows<GByte>          */
2958
/************************************************************************/
2959

2960
template <>
2961
inline void GDALResampleConvolutionHorizontal_3rows<GByte>(
10,023,900✔
2962
    const GByte *pChunkRow1, const GByte *pChunkRow2, const GByte *pChunkRow3,
2963
    const double *padfWeightsAligned, int nSrcPixelCount, double &dfRes1,
2964
    double &dfRes2, double &dfRes3)
2965
{
2966
    GDALResampleConvolutionHorizontal_3rows_SSE2(
10,023,900✔
2967
        pChunkRow1, pChunkRow2, pChunkRow3, padfWeightsAligned, nSrcPixelCount,
2968
        dfRes1, dfRes2, dfRes3);
2969
}
10,023,900✔
2970

2971
template <>
2972
inline void GDALResampleConvolutionHorizontal_3rows<GUInt16>(
30✔
2973
    const GUInt16 *pChunkRow1, const GUInt16 *pChunkRow2,
2974
    const GUInt16 *pChunkRow3, const double *padfWeightsAligned,
2975
    int nSrcPixelCount, double &dfRes1, double &dfRes2, double &dfRes3)
2976
{
2977
    GDALResampleConvolutionHorizontal_3rows_SSE2(
30✔
2978
        pChunkRow1, pChunkRow2, pChunkRow3, padfWeightsAligned, nSrcPixelCount,
2979
        dfRes1, dfRes2, dfRes3);
2980
}
30✔
2981

2982
/************************************************************************/
2983
/*     GDALResampleConvolutionHorizontalPixelCountLess8_3rows_SSE2<T>   */
2984
/************************************************************************/
2985

2986
template <class T>
2987
static inline void GDALResampleConvolutionHorizontalPixelCountLess8_3rows_SSE2(
2,173,455✔
2988
    const T *pChunkRow1, const T *pChunkRow2, const T *pChunkRow3,
2989
    const double *padfWeightsAligned, int nSrcPixelCount, double &dfRes1,
2990
    double &dfRes2, double &dfRes3)
2991
{
2992
    XMMReg4Double v_acc1 = XMMReg4Double::Zero();
2,173,455✔
2993
    XMMReg4Double v_acc2 = XMMReg4Double::Zero();
2,173,395✔
2994
    XMMReg4Double v_acc3 = XMMReg4Double::Zero();
2,173,421✔
2995
    int i = 0;  // Use after for.
2,173,402✔
2996
    for (; i + 3 < nSrcPixelCount; i += 4)
2,176,686✔
2997
    {
2998
        // Retrieve the pixel & accumulate.
2999
        const XMMReg4Double v_pixels1 = XMMReg4Double::Load4Val(pChunkRow1 + i);
3,284✔
3000
        const XMMReg4Double v_pixels2 = XMMReg4Double::Load4Val(pChunkRow2 + i);
3,284✔
3001
        const XMMReg4Double v_pixels3 = XMMReg4Double::Load4Val(pChunkRow3 + i);
3,284✔
3002
        const XMMReg4Double v_weight =
3,284✔
3003
            XMMReg4Double::Load4ValAligned(padfWeightsAligned + i);
3,284✔
3004

3005
        v_acc1 += v_pixels1 * v_weight;
3,284✔
3006
        v_acc2 += v_pixels2 * v_weight;
3,284✔
3007
        v_acc3 += v_pixels3 * v_weight;
3,284✔
3008
    }
3009

3010
    dfRes1 = v_acc1.GetHorizSum();
2,173,396✔
3011
    dfRes2 = v_acc2.GetHorizSum();
2,173,406✔
3012
    dfRes3 = v_acc3.GetHorizSum();
2,173,400✔
3013

3014
    for (; i < nSrcPixelCount; ++i)
6,495,769✔
3015
    {
3016
        dfRes1 += pChunkRow1[i] * padfWeightsAligned[i];
4,322,363✔
3017
        dfRes2 += pChunkRow2[i] * padfWeightsAligned[i];
4,322,363✔
3018
        dfRes3 += pChunkRow3[i] * padfWeightsAligned[i];
4,322,363✔
3019
    }
3020
}
2,173,406✔
3021

3022
/************************************************************************/
3023
/*     GDALResampleConvolutionHorizontalPixelCountLess8_3rows<GByte>    */
3024
/************************************************************************/
3025

3026
template <>
3027
inline void GDALResampleConvolutionHorizontalPixelCountLess8_3rows<GByte>(
2,106,400✔
3028
    const GByte *pChunkRow1, const GByte *pChunkRow2, const GByte *pChunkRow3,
3029
    const double *padfWeightsAligned, int nSrcPixelCount, double &dfRes1,
3030
    double &dfRes2, double &dfRes3)
3031
{
3032
    GDALResampleConvolutionHorizontalPixelCountLess8_3rows_SSE2(
2,106,400✔
3033
        pChunkRow1, pChunkRow2, pChunkRow3, padfWeightsAligned, nSrcPixelCount,
3034
        dfRes1, dfRes2, dfRes3);
3035
}
2,106,400✔
3036

3037
template <>
3038
inline void GDALResampleConvolutionHorizontalPixelCountLess8_3rows<GUInt16>(
66,979✔
3039
    const GUInt16 *pChunkRow1, const GUInt16 *pChunkRow2,
3040
    const GUInt16 *pChunkRow3, const double *padfWeightsAligned,
3041
    int nSrcPixelCount, double &dfRes1, double &dfRes2, double &dfRes3)
3042
{
3043
    GDALResampleConvolutionHorizontalPixelCountLess8_3rows_SSE2(
66,979✔
3044
        pChunkRow1, pChunkRow2, pChunkRow3, padfWeightsAligned, nSrcPixelCount,
3045
        dfRes1, dfRes2, dfRes3);
3046
}
67,087✔
3047

3048
/************************************************************************/
3049
/*     GDALResampleConvolutionHorizontalPixelCount4_3rows_SSE2<T>       */
3050
/************************************************************************/
3051

3052
template <class T>
3053
static inline void GDALResampleConvolutionHorizontalPixelCount4_3rows_SSE2(
12,265,740✔
3054
    const T *pChunkRow1, const T *pChunkRow2, const T *pChunkRow3,
3055
    const double *padfWeightsAligned, double &dfRes1, double &dfRes2,
3056
    double &dfRes3)
3057
{
3058
    const XMMReg4Double v_weight =
12,265,740✔
3059
        XMMReg4Double::Load4ValAligned(padfWeightsAligned);
3060

3061
    // Retrieve the pixel & accumulate.
3062
    const XMMReg4Double v_pixels1 = XMMReg4Double::Load4Val(pChunkRow1);
12,262,260✔
3063
    const XMMReg4Double v_pixels2 = XMMReg4Double::Load4Val(pChunkRow2);
12,283,120✔
3064
    const XMMReg4Double v_pixels3 = XMMReg4Double::Load4Val(pChunkRow3);
12,269,170✔
3065

3066
    XMMReg4Double v_acc1 = v_pixels1 * v_weight;
12,290,330✔
3067
    XMMReg4Double v_acc2 = v_pixels2 * v_weight;
12,281,870✔
3068
    XMMReg4Double v_acc3 = v_pixels3 * v_weight;
12,273,880✔
3069

3070
    dfRes1 = v_acc1.GetHorizSum();
12,284,180✔
3071
    dfRes2 = v_acc2.GetHorizSum();
12,280,310✔
3072
    dfRes3 = v_acc3.GetHorizSum();
12,288,110✔
3073
}
12,290,540✔
3074

3075
/************************************************************************/
3076
/*       GDALResampleConvolutionHorizontalPixelCount4_3rows<GByte>      */
3077
/************************************************************************/
3078

3079
template <>
3080
inline void GDALResampleConvolutionHorizontalPixelCount4_3rows<GByte>(
6,636,420✔
3081
    const GByte *pChunkRow1, const GByte *pChunkRow2, const GByte *pChunkRow3,
3082
    const double *padfWeightsAligned, double &dfRes1, double &dfRes2,
3083
    double &dfRes3)
3084
{
3085
    GDALResampleConvolutionHorizontalPixelCount4_3rows_SSE2(
6,636,420✔
3086
        pChunkRow1, pChunkRow2, pChunkRow3, padfWeightsAligned, dfRes1, dfRes2,
3087
        dfRes3);
3088
}
6,635,590✔
3089

3090
template <>
3091
inline void GDALResampleConvolutionHorizontalPixelCount4_3rows<GUInt16>(
5,655,340✔
3092
    const GUInt16 *pChunkRow1, const GUInt16 *pChunkRow2,
3093
    const GUInt16 *pChunkRow3, const double *padfWeightsAligned, double &dfRes1,
3094
    double &dfRes2, double &dfRes3)
3095
{
3096
    GDALResampleConvolutionHorizontalPixelCount4_3rows_SSE2(
5,655,340✔
3097
        pChunkRow1, pChunkRow2, pChunkRow3, padfWeightsAligned, dfRes1, dfRes2,
3098
        dfRes3);
3099
}
5,638,110✔
3100

3101
#endif  // USE_SSE2
3102

3103
/************************************************************************/
3104
/*                    GDALResampleChunk_Convolution()                   */
3105
/************************************************************************/
3106

3107
template <class T, class Twork, GDALDataType eWrkDataType>
3108
static CPLErr GDALResampleChunk_ConvolutionT(
3,673✔
3109
    const GDALOverviewResampleArgs &args, const T *pChunk, void *pDstBuffer,
3110
    FilterFuncType pfnFilterFunc, FilterFunc4ValuesType pfnFilterFunc4Values,
3111
    int nKernelRadius, bool bKernelWithNegativeWeights, float fMaxVal)
3112

3113
{
3114
    const double dfXRatioDstToSrc = args.dfXRatioDstToSrc;
3,673✔
3115
    const double dfYRatioDstToSrc = args.dfYRatioDstToSrc;
3,673✔
3116
    const double dfSrcXDelta = args.dfSrcXDelta;
3,673✔
3117
    const double dfSrcYDelta = args.dfSrcYDelta;
3,673✔
3118
    constexpr int nBands = 1;
3,673✔
3119
    const GByte *pabyChunkNodataMask = args.pabyChunkNodataMask;
3,673✔
3120
    const int nChunkXOff = args.nChunkXOff;
3,673✔
3121
    const int nChunkXSize = args.nChunkXSize;
3,673✔
3122
    const int nChunkYOff = args.nChunkYOff;
3,673✔
3123
    const int nChunkYSize = args.nChunkYSize;
3,673✔
3124
    const int nDstXOff = args.nDstXOff;
3,673✔
3125
    const int nDstXOff2 = args.nDstXOff2;
3,673✔
3126
    const int nDstYOff = args.nDstYOff;
3,673✔
3127
    const int nDstYOff2 = args.nDstYOff2;
3,673✔
3128
    const bool bHasNoData = args.bHasNoData;
3,673✔
3129
    double dfNoDataValue = args.dfNoDataValue;
3,673✔
3130

3131
    if (!bHasNoData)
3,673✔
3132
        dfNoDataValue = 0.0;
3,623✔
3133
    const auto dstDataType = args.eOvrDataType;
3,673✔
3134
    const int nDstDataTypeSize = GDALGetDataTypeSizeBytes(dstDataType);
3,673✔
3135
    const double dfReplacementVal =
3,672✔
3136
        bHasNoData ? GDALGetNoDataReplacementValue(dstDataType, dfNoDataValue)
46✔
3137
                   : dfNoDataValue;
3138
    // cppcheck-suppress unreadVariable
3139
    const int isIntegerDT = GDALDataTypeIsInteger(dstDataType);
3,672✔
3140
    const auto nNodataValueInt64 = static_cast<GInt64>(dfNoDataValue);
3,668✔
3141
    constexpr int nWrkDataTypeSize = static_cast<int>(sizeof(Twork));
3,668✔
3142

3143
    // TODO: we should have some generic function to do this.
3144
    Twork fDstMin = -std::numeric_limits<Twork>::max();
3,668✔
3145
    Twork fDstMax = std::numeric_limits<Twork>::max();
3,668✔
3146
    if (dstDataType == GDT_Byte)
3,668✔
3147
    {
3148
        fDstMin = std::numeric_limits<GByte>::min();
2,947✔
3149
        fDstMax = std::numeric_limits<GByte>::max();
2,946✔
3150
    }
3151
    else if (dstDataType == GDT_Int8)
722✔
3152
    {
3153
        fDstMin = std::numeric_limits<GInt8>::min();
1✔
3154
        fDstMax = std::numeric_limits<GInt8>::max();
1✔
3155
    }
3156
    else if (dstDataType == GDT_UInt16)
721✔
3157
    {
3158
        fDstMin = std::numeric_limits<GUInt16>::min();
395✔
3159
        fDstMax = std::numeric_limits<GUInt16>::max();
390✔
3160
    }
3161
    else if (dstDataType == GDT_Int16)
329✔
3162
    {
3163
        fDstMin = std::numeric_limits<GInt16>::min();
279✔
3164
        fDstMax = std::numeric_limits<GInt16>::max();
279✔
3165
    }
3166
    else if (dstDataType == GDT_UInt32)
50✔
3167
    {
3168
        fDstMin = static_cast<Twork>(std::numeric_limits<GUInt32>::min());
1✔
3169
        fDstMax = static_cast<Twork>(std::numeric_limits<GUInt32>::max());
1✔
3170
    }
3171
    else if (dstDataType == GDT_Int32)
49✔
3172
    {
3173
        // cppcheck-suppress unreadVariable
3174
        fDstMin = static_cast<Twork>(std::numeric_limits<GInt32>::min());
2✔
3175
        // cppcheck-suppress unreadVariable
3176
        fDstMax = static_cast<Twork>(std::numeric_limits<GInt32>::max());
2✔
3177
    }
3178
    else if (dstDataType == GDT_UInt64)
47✔
3179
    {
3180
        // cppcheck-suppress unreadVariable
3181
        fDstMin = static_cast<Twork>(std::numeric_limits<uint64_t>::min());
1✔
3182
        // cppcheck-suppress unreadVariable
3183
        fDstMax = static_cast<Twork>(std::numeric_limits<uint64_t>::max());
1✔
3184
    }
3185
    else if (dstDataType == GDT_Int64)
46✔
3186
    {
3187
        // cppcheck-suppress unreadVariable
3188
        fDstMin = static_cast<Twork>(std::numeric_limits<int64_t>::min());
1✔
3189
        // cppcheck-suppress unreadVariable
3190
        fDstMax = static_cast<Twork>(std::numeric_limits<int64_t>::max());
1✔
3191
    }
3192

3193
    auto replaceValIfNodata = [bHasNoData, isIntegerDT, fDstMin, fDstMax,
27,546,401✔
3194
                               nNodataValueInt64, dfNoDataValue,
3195
                               dfReplacementVal](Twork fVal)
3196
    {
3197
        if (!bHasNoData)
14,634,600✔
3198
            return fVal;
11,407,800✔
3199

3200
        // Clamp value before comparing to nodata: this is only needed for
3201
        // kernels with negative weights (Lanczos)
3202
        Twork fClamped = fVal;
3,226,780✔
3203
        if (fClamped < fDstMin)
3,226,780✔
3204
            fClamped = fDstMin;
12,874✔
3205
        else if (fClamped > fDstMax)
3,213,910✔
3206
            fClamped = fDstMax;
12,852✔
3207
        if (isIntegerDT)
3,226,780✔
3208
        {
3209
            if (nNodataValueInt64 == static_cast<GInt64>(std::round(fClamped)))
3,226,370✔
3210
            {
3211
                // Do not use the nodata value
3212
                return static_cast<Twork>(dfReplacementVal);
13,869✔
3213
            }
3214
        }
3215
        else if (dfNoDataValue == fClamped)
417✔
3216
        {
3217
            // Do not use the nodata value
3218
            return static_cast<Twork>(dfReplacementVal);
1✔
3219
        }
3220
        return fClamped;
3,212,920✔
3221
    };
3222

3223
    /* -------------------------------------------------------------------- */
3224
    /*      Allocate work buffers.                                          */
3225
    /* -------------------------------------------------------------------- */
3226
    const int nDstXSize = nDstXOff2 - nDstXOff;
3,669✔
3227
    Twork *pafWrkScanline = nullptr;
3,669✔
3228
    if (dstDataType != eWrkDataType)
3,669✔
3229
    {
3230
        pafWrkScanline =
3231
            static_cast<Twork *>(VSI_MALLOC2_VERBOSE(nDstXSize, sizeof(Twork)));
3,624✔
3232
        if (pafWrkScanline == nullptr)
3,623✔
3233
            return CE_Failure;
×
3234
    }
3235

3236
    const double dfXScale = 1.0 / dfXRatioDstToSrc;
3,668✔
3237
    const double dfXScaleWeight = (dfXScale >= 1.0) ? 1.0 : dfXScale;
3,668✔
3238
    const double dfXScaledRadius = nKernelRadius / dfXScaleWeight;
3,668✔
3239
    const double dfYScale = 1.0 / dfYRatioDstToSrc;
3,668✔
3240
    const double dfYScaleWeight = (dfYScale >= 1.0) ? 1.0 : dfYScale;
3,668✔
3241
    const double dfYScaledRadius = nKernelRadius / dfYScaleWeight;
3,668✔
3242

3243
    // Temporary array to store result of horizontal filter.
3244
    double *padfHorizontalFiltered = static_cast<double *>(
3245
        VSI_MALLOC3_VERBOSE(nChunkYSize, nDstXSize, sizeof(double) * nBands));
3,668✔
3246

3247
    // To store convolution coefficients.
3248
    double *padfWeights = static_cast<double *>(VSI_MALLOC_ALIGNED_AUTO_VERBOSE(
3,671✔
3249
        static_cast<int>(2 + 2 * std::max(dfXScaledRadius, dfYScaledRadius) +
3250
                         0.5) *
3251
        sizeof(double)));
3252

3253
    GByte *pabyChunkNodataMaskHorizontalFiltered = nullptr;
3,669✔
3254
    if (pabyChunkNodataMask)
3,669✔
3255
        pabyChunkNodataMaskHorizontalFiltered =
3256
            static_cast<GByte *>(VSI_MALLOC2_VERBOSE(nChunkYSize, nDstXSize));
377✔
3257
    if (padfHorizontalFiltered == nullptr || padfWeights == nullptr ||
3,669✔
3258
        (pabyChunkNodataMask != nullptr &&
377✔
3259
         pabyChunkNodataMaskHorizontalFiltered == nullptr))
3260
    {
3261
        VSIFree(pafWrkScanline);
1✔
3262
        VSIFree(padfHorizontalFiltered);
×
3263
        VSIFreeAligned(padfWeights);
×
3264
        VSIFree(pabyChunkNodataMaskHorizontalFiltered);
×
3265
        return CE_Failure;
×
3266
    }
3267

3268
    /* ==================================================================== */
3269
    /*      First pass: horizontal filter                                   */
3270
    /* ==================================================================== */
3271
    const int nChunkRightXOff = nChunkXOff + nChunkXSize;
3,668✔
3272
#ifdef USE_SSE2
3273
    bool bSrcPixelCountLess8 = dfXScaledRadius < 4;
3,668✔
3274
#endif
3275
    for (int iDstPixel = nDstXOff; iDstPixel < nDstXOff2; ++iDstPixel)
2,723,773✔
3276
    {
3277
        const double dfSrcPixel =
2,720,107✔
3278
            (iDstPixel + 0.5) * dfXRatioDstToSrc + dfSrcXDelta;
2,720,107✔
3279
        int nSrcPixelStart =
2,720,107✔
3280
            static_cast<int>(floor(dfSrcPixel - dfXScaledRadius + 0.5));
2,720,107✔
3281
        if (nSrcPixelStart < nChunkXOff)
2,720,107✔
3282
            nSrcPixelStart = nChunkXOff;
55,116✔
3283
        int nSrcPixelStop =
2,720,107✔
3284
            static_cast<int>(dfSrcPixel + dfXScaledRadius + 0.5);
2,720,107✔
3285
        if (nSrcPixelStop > nChunkRightXOff)
2,720,107✔
3286
            nSrcPixelStop = nChunkRightXOff;
55,129✔
3287
#if 0
3288
        if( nSrcPixelStart < nChunkXOff && nChunkXOff > 0 )
3289
        {
3290
            printf( "truncated iDstPixel = %d\n", iDstPixel );/*ok*/
3291
        }
3292
        if( nSrcPixelStop > nChunkRightXOff && nChunkRightXOff < nSrcWidth )
3293
        {
3294
            printf( "truncated iDstPixel = %d\n", iDstPixel );/*ok*/
3295
        }
3296
#endif
3297
        const int nSrcPixelCount = nSrcPixelStop - nSrcPixelStart;
2,720,107✔
3298
        double dfWeightSum = 0.0;
2,720,107✔
3299

3300
        // Compute convolution coefficients.
3301
        int nSrcPixel = nSrcPixelStart;
2,720,107✔
3302
        double dfX = dfXScaleWeight * (nSrcPixel - dfSrcPixel + 0.5);
2,720,107✔
3303
        for (; nSrcPixel + 3 < nSrcPixelStop; nSrcPixel += 4)
3,564,609✔
3304
        {
3305
            padfWeights[nSrcPixel - nSrcPixelStart] = dfX;
844,457✔
3306
            dfX += dfXScaleWeight;
844,457✔
3307
            padfWeights[nSrcPixel + 1 - nSrcPixelStart] = dfX;
844,457✔
3308
            dfX += dfXScaleWeight;
844,457✔
3309
            padfWeights[nSrcPixel + 2 - nSrcPixelStart] = dfX;
844,457✔
3310
            dfX += dfXScaleWeight;
844,457✔
3311
            padfWeights[nSrcPixel + 3 - nSrcPixelStart] = dfX;
844,457✔
3312
            dfX += dfXScaleWeight;
844,457✔
3313
            dfWeightSum +=
844,505✔
3314
                pfnFilterFunc4Values(padfWeights + nSrcPixel - nSrcPixelStart);
844,457✔
3315
        }
3316
        for (; nSrcPixel < nSrcPixelStop; ++nSrcPixel, dfX += dfXScaleWeight)
6,700,544✔
3317
        {
3318
            const double dfWeight = pfnFilterFunc(dfX);
3,981,421✔
3319
            padfWeights[nSrcPixel - nSrcPixelStart] = dfWeight;
3,980,397✔
3320
            dfWeightSum += dfWeight;
3,980,397✔
3321
        }
3322

3323
        const int nHeight = nChunkYSize * nBands;
2,719,133✔
3324
        if (pabyChunkNodataMask == nullptr)
2,719,133✔
3325
        {
3326
            if (dfWeightSum != 0)
2,648,896✔
3327
            {
3328
                const double dfInvWeightSum = 1.0 / dfWeightSum;
2,648,896✔
3329
                for (int i = 0; i < nSrcPixelCount; ++i)
9,457,143✔
3330
                    padfWeights[i] *= dfInvWeightSum;
6,808,245✔
3331
            }
3332
            int iSrcLineOff = 0;
2,648,896✔
3333
#ifdef USE_SSE2
3334
            if (nSrcPixelCount == 4)
2,648,896✔
3335
            {
3336
                for (; iSrcLineOff + 2 < nHeight; iSrcLineOff += 3)
14,056,036✔
3337
                {
3338
                    const GPtrDiff_t j =
13,532,946✔
3339
                        static_cast<GPtrDiff_t>(iSrcLineOff) * nChunkXSize +
13,532,946✔
3340
                        (nSrcPixelStart - nChunkXOff);
13,532,946✔
3341
                    double dfVal1 = 0.0;
13,532,946✔
3342
                    double dfVal2 = 0.0;
13,532,946✔
3343
                    double dfVal3 = 0.0;
13,532,946✔
3344
                    GDALResampleConvolutionHorizontalPixelCount4_3rows(
13,532,946✔
3345
                        pChunk + j, pChunk + j + nChunkXSize,
13,532,946✔
3346
                        pChunk + j + 2 * nChunkXSize, padfWeights, dfVal1,
13,532,946✔
3347
                        dfVal2, dfVal3);
3348
                    padfHorizontalFiltered[static_cast<size_t>(iSrcLineOff) *
13,520,346✔
3349
                                               nDstXSize +
13,520,346✔
3350
                                           iDstPixel - nDstXOff] = dfVal1;
13,520,346✔
3351
                    padfHorizontalFiltered[(static_cast<size_t>(iSrcLineOff) +
13,520,346✔
3352
                                            1) *
13,520,346✔
3353
                                               nDstXSize +
13,520,346✔
3354
                                           iDstPixel - nDstXOff] = dfVal2;
13,520,346✔
3355
                    padfHorizontalFiltered[(static_cast<size_t>(iSrcLineOff) +
13,520,346✔
3356
                                            2) *
13,520,346✔
3357
                                               nDstXSize +
13,520,346✔
3358
                                           iDstPixel - nDstXOff] = dfVal3;
13,520,346✔
3359
                }
3360
            }
3361
            else if (bSrcPixelCountLess8)
2,113,201✔
3362
            {
3363
                for (; iSrcLineOff + 2 < nHeight; iSrcLineOff += 3)
4,226,414✔
3364
                {
3365
                    const GPtrDiff_t j =
2,191,507✔
3366
                        static_cast<GPtrDiff_t>(iSrcLineOff) * nChunkXSize +
2,191,507✔
3367
                        (nSrcPixelStart - nChunkXOff);
2,191,507✔
3368
                    double dfVal1 = 0.0;
2,191,507✔
3369
                    double dfVal2 = 0.0;
2,191,507✔
3370
                    double dfVal3 = 0.0;
2,191,507✔
3371
                    GDALResampleConvolutionHorizontalPixelCountLess8_3rows(
2,191,507✔
3372
                        pChunk + j, pChunk + j + nChunkXSize,
2,191,507✔
3373
                        pChunk + j + 2 * nChunkXSize, padfWeights,
2,191,507✔
3374
                        nSrcPixelCount, dfVal1, dfVal2, dfVal3);
3375
                    padfHorizontalFiltered[static_cast<size_t>(iSrcLineOff) *
2,191,672✔
3376
                                               nDstXSize +
2,191,672✔
3377
                                           iDstPixel - nDstXOff] = dfVal1;
2,191,672✔
3378
                    padfHorizontalFiltered[(static_cast<size_t>(iSrcLineOff) +
2,191,672✔
3379
                                            1) *
2,191,672✔
3380
                                               nDstXSize +
2,191,672✔
3381
                                           iDstPixel - nDstXOff] = dfVal2;
2,191,672✔
3382
                    padfHorizontalFiltered[(static_cast<size_t>(iSrcLineOff) +
2,191,672✔
3383
                                            2) *
2,191,672✔
3384
                                               nDstXSize +
2,191,672✔
3385
                                           iDstPixel - nDstXOff] = dfVal3;
2,191,672✔
3386
                }
3387
            }
3388
            else
3389
#endif
3390
            {
3391
                for (; iSrcLineOff + 2 < nHeight; iSrcLineOff += 3)
10,167,229✔
3392
                {
3393
                    const GPtrDiff_t j =
10,088,730✔
3394
                        static_cast<GPtrDiff_t>(iSrcLineOff) * nChunkXSize +
10,088,730✔
3395
                        (nSrcPixelStart - nChunkXOff);
10,088,730✔
3396
                    double dfVal1 = 0.0;
10,088,730✔
3397
                    double dfVal2 = 0.0;
10,088,730✔
3398
                    double dfVal3 = 0.0;
10,088,730✔
3399
                    GDALResampleConvolutionHorizontal_3rows(
10,088,730✔
3400
                        pChunk + j, pChunk + j + nChunkXSize,
10,088,730✔
3401
                        pChunk + j + 2 * nChunkXSize, padfWeights,
10,088,730✔
3402
                        nSrcPixelCount, dfVal1, dfVal2, dfVal3);
3403
                    padfHorizontalFiltered[static_cast<size_t>(iSrcLineOff) *
10,088,730✔
3404
                                               nDstXSize +
10,088,730✔
3405
                                           iDstPixel - nDstXOff] = dfVal1;
10,088,730✔
3406
                    padfHorizontalFiltered[(static_cast<size_t>(iSrcLineOff) +
10,088,730✔
3407
                                            1) *
10,088,730✔
3408
                                               nDstXSize +
10,088,730✔
3409
                                           iDstPixel - nDstXOff] = dfVal2;
10,088,730✔
3410
                    padfHorizontalFiltered[(static_cast<size_t>(iSrcLineOff) +
10,088,730✔
3411
                                            2) *
10,088,730✔
3412
                                               nDstXSize +
10,088,730✔
3413
                                           iDstPixel - nDstXOff] = dfVal3;
10,088,730✔
3414
                }
3415
            }
3416
            for (; iSrcLineOff < nHeight; ++iSrcLineOff)
5,419,588✔
3417
            {
3418
                const GPtrDiff_t j =
2,783,117✔
3419
                    static_cast<GPtrDiff_t>(iSrcLineOff) * nChunkXSize +
2,783,117✔
3420
                    (nSrcPixelStart - nChunkXOff);
2,783,117✔
3421
                const double dfVal = GDALResampleConvolutionHorizontal(
5,521,612✔
3422
                    pChunk + j, padfWeights, nSrcPixelCount);
2,783,117✔
3423
                padfHorizontalFiltered[static_cast<size_t>(iSrcLineOff) *
2,783,128✔
3424
                                           nDstXSize +
2,783,128✔
3425
                                       iDstPixel - nDstXOff] = dfVal;
2,783,128✔
3426
            }
3427
        }
3428
        else
3429
        {
3430
            for (int iSrcLineOff = 0; iSrcLineOff < nHeight; ++iSrcLineOff)
18,289,719✔
3431
            {
3432
                const GPtrDiff_t j =
18,206,118✔
3433
                    static_cast<GPtrDiff_t>(iSrcLineOff) * nChunkXSize +
18,206,118✔
3434
                    (nSrcPixelStart - nChunkXOff);
18,206,118✔
3435

3436
                if (bKernelWithNegativeWeights)
18,206,118✔
3437
                {
3438
                    int nConsecutiveValid = 0;
17,725,512✔
3439
                    int nMaxConsecutiveValid = 0;
17,725,512✔
3440
                    for (int k = 0; k < nSrcPixelCount; k++)
164,371,458✔
3441
                    {
3442
                        if (pabyChunkNodataMask[j + k])
146,646,146✔
3443
                            nConsecutiveValid++;
40,208,853✔
3444
                        else if (nConsecutiveValid)
106,436,793✔
3445
                        {
3446
                            nMaxConsecutiveValid = std::max(
96,592✔
3447
                                nMaxConsecutiveValid, nConsecutiveValid);
96,592✔
3448
                            nConsecutiveValid = 0;
96,592✔
3449
                        }
3450
                    }
3451
                    nMaxConsecutiveValid =
17,725,512✔
3452
                        std::max(nMaxConsecutiveValid, nConsecutiveValid);
17,725,512✔
3453
                    if (nMaxConsecutiveValid < nSrcPixelCount / 2)
17,725,512✔
3454
                    {
3455
                        const size_t nTempOffset =
12,469,807✔
3456
                            static_cast<size_t>(iSrcLineOff) * nDstXSize +
12,469,807✔
3457
                            iDstPixel - nDstXOff;
12,469,807✔
3458
                        padfHorizontalFiltered[nTempOffset] = 0.0;
12,469,807✔
3459
                        pabyChunkNodataMaskHorizontalFiltered[nTempOffset] = 0;
12,469,807✔
3460
                        continue;
12,469,807✔
3461
                    }
3462
                }
3463

3464
                double dfVal = 0.0;
5,736,261✔
3465
                GDALResampleConvolutionHorizontalWithMask(
5,736,261✔
3466
                    pChunk + j, pabyChunkNodataMask + j, padfWeights,
5,736,261✔
3467
                    nSrcPixelCount, dfVal, dfWeightSum);
3468
                const size_t nTempOffset =
5,749,665✔
3469
                    static_cast<size_t>(iSrcLineOff) * nDstXSize + iDstPixel -
5,749,665✔
3470
                    nDstXOff;
5,749,665✔
3471
                if (dfWeightSum > 0.0)
5,749,665✔
3472
                {
3473
                    padfHorizontalFiltered[nTempOffset] = dfVal / dfWeightSum;
5,691,828✔
3474
                    pabyChunkNodataMaskHorizontalFiltered[nTempOffset] = 1;
5,691,828✔
3475
                }
3476
                else
3477
                {
3478
                    padfHorizontalFiltered[nTempOffset] = 0.0;
57,834✔
3479
                    pabyChunkNodataMaskHorizontalFiltered[nTempOffset] = 0;
57,834✔
3480
                }
3481
            }
3482
        }
3483
    }
3484

3485
    /* ==================================================================== */
3486
    /*      Second pass: vertical filter                                    */
3487
    /* ==================================================================== */
3488
    const int nChunkBottomYOff = nChunkYOff + nChunkYSize;
3,673✔
3489

3490
    for (int iDstLine = nDstYOff; iDstLine < nDstYOff2; ++iDstLine)
197,002✔
3491
    {
3492
        Twork *const pafDstScanline =
193,329✔
3493
            pafWrkScanline ? pafWrkScanline
193,329✔
3494
                           : static_cast<Twork *>(pDstBuffer) +
8,421✔
3495
                                 (iDstLine - nDstYOff) * nDstXSize;
8,421✔
3496

3497
        const double dfSrcLine =
193,329✔
3498
            (iDstLine + 0.5) * dfYRatioDstToSrc + dfSrcYDelta;
193,329✔
3499
        int nSrcLineStart =
193,329✔
3500
            static_cast<int>(floor(dfSrcLine - dfYScaledRadius + 0.5));
193,329✔
3501
        int nSrcLineStop = static_cast<int>(dfSrcLine + dfYScaledRadius + 0.5);
193,329✔
3502
        if (nSrcLineStart < nChunkYOff)
193,329✔
3503
            nSrcLineStart = nChunkYOff;
2,285✔
3504
        if (nSrcLineStop > nChunkBottomYOff)
193,329✔
3505
            nSrcLineStop = nChunkBottomYOff;
2,321✔
3506
#if 0
3507
        if( nSrcLineStart < nChunkYOff &&
3508
            nChunkYOff > 0 )
3509
        {
3510
            printf( "truncated iDstLine = %d\n", iDstLine );/*ok*/
3511
        }
3512
        if( nSrcLineStop > nChunkBottomYOff && nChunkBottomYOff < nSrcHeight )
3513
        {
3514
            printf( "truncated iDstLine = %d\n", iDstLine );/*ok*/
3515
        }
3516
#endif
3517
        const int nSrcLineCount = nSrcLineStop - nSrcLineStart;
193,329✔
3518
        double dfWeightSum = 0.0;
193,329✔
3519

3520
        // Compute convolution coefficients.
3521
        int nSrcLine = nSrcLineStart;  // Used after for.
193,329✔
3522
        double dfY = dfYScaleWeight * (nSrcLine - dfSrcLine + 0.5);
193,329✔
3523
        for (; nSrcLine + 3 < nSrcLineStop;
428,774✔
3524
             nSrcLine += 4, dfY += 4 * dfYScaleWeight)
235,445✔
3525
        {
3526
            padfWeights[nSrcLine - nSrcLineStart] = dfY;
235,450✔
3527
            padfWeights[nSrcLine + 1 - nSrcLineStart] = dfY + dfYScaleWeight;
235,450✔
3528
            padfWeights[nSrcLine + 2 - nSrcLineStart] =
235,450✔
3529
                dfY + 2 * dfYScaleWeight;
235,450✔
3530
            padfWeights[nSrcLine + 3 - nSrcLineStart] =
235,450✔
3531
                dfY + 3 * dfYScaleWeight;
235,450✔
3532
            dfWeightSum +=
235,445✔
3533
                pfnFilterFunc4Values(padfWeights + nSrcLine - nSrcLineStart);
235,450✔
3534
        }
3535
        for (; nSrcLine < nSrcLineStop; ++nSrcLine, dfY += dfYScaleWeight)
226,297✔
3536
        {
3537
            const double dfWeight = pfnFilterFunc(dfY);
32,976✔
3538
            padfWeights[nSrcLine - nSrcLineStart] = dfWeight;
32,973✔
3539
            dfWeightSum += dfWeight;
32,973✔
3540
        }
3541

3542
        if (pabyChunkNodataMask == nullptr)
193,321✔
3543
        {
3544
            if (dfWeightSum != 0)
159,809✔
3545
            {
3546
                const double dfInvWeightSum = 1.0 / dfWeightSum;
159,810✔
3547
                for (int i = 0; i < nSrcLineCount; ++i)
900,114✔
3548
                    padfWeights[i] *= dfInvWeightSum;
740,304✔
3549
            }
3550
        }
3551

3552
        if (pabyChunkNodataMask == nullptr)
193,321✔
3553
        {
3554
            int iFilteredPixelOff = 0;  // Used after for.
159,810✔
3555
            // j used after for.
3556
            size_t j =
159,810✔
3557
                (nSrcLineStart - nChunkYOff) * static_cast<size_t>(nDstXSize);
159,810✔
3558
#ifdef USE_SSE2
3559
            if constexpr (eWrkDataType == GDT_Float32)
3560
            {
3561
#ifdef __AVX__
3562
                for (; iFilteredPixelOff + 15 < nDstXSize;
3563
                     iFilteredPixelOff += 16, j += 16)
3564
                {
3565
                    GDALResampleConvolutionVertical_16cols(
3566
                        padfHorizontalFiltered + j, nDstXSize, padfWeights,
3567
                        nSrcLineCount, pafDstScanline + iFilteredPixelOff);
3568
                    if (bHasNoData)
3569
                    {
3570
                        for (int k = 0; k < 16; k++)
3571
                        {
3572
                            pafDstScanline[iFilteredPixelOff + k] =
3573
                                replaceValIfNodata(
3574
                                    pafDstScanline[iFilteredPixelOff + k]);
3575
                        }
3576
                    }
3577
                }
3578
#else
3579
                for (; iFilteredPixelOff + 7 < nDstXSize;
18,750,866✔
3580
                     iFilteredPixelOff += 8, j += 8)
3581
                {
3582
                    GDALResampleConvolutionVertical_8cols(
18,648,050✔
3583
                        padfHorizontalFiltered + j, nDstXSize, padfWeights,
18,648,050✔
3584
                        nSrcLineCount, pafDstScanline + iFilteredPixelOff);
18,648,050✔
3585
                    if (bHasNoData)
18,598,290✔
3586
                    {
3587
                        for (int k = 0; k < 8; k++)
17,820✔
3588
                        {
3589
                            pafDstScanline[iFilteredPixelOff + k] =
15,840✔
3590
                                replaceValIfNodata(
15,840✔
3591
                                    pafDstScanline[iFilteredPixelOff + k]);
15,840✔
3592
                        }
3593
                    }
3594
                }
3595
#endif
3596

3597
                for (; iFilteredPixelOff < nDstXSize; iFilteredPixelOff++, j++)
566,211✔
3598
                {
3599
                    const Twork fVal =
463,464✔
3600
                        static_cast<Twork>(GDALResampleConvolutionVertical(
463,390✔
3601
                            padfHorizontalFiltered + j, nDstXSize, padfWeights,
463,390✔
3602
                            nSrcLineCount));
3603
                    pafDstScanline[iFilteredPixelOff] =
463,404✔
3604
                        replaceValIfNodata(fVal);
463,464✔
3605
                }
3606
            }
3607
            else
3608
#endif
3609
            {
3610
                for (; iFilteredPixelOff + 1 < nDstXSize;
2,887,210✔
3611
                     iFilteredPixelOff += 2, j += 2)
3612
                {
3613
                    double dfVal1 = 0.0;
2,880,000✔
3614
                    double dfVal2 = 0.0;
2,880,000✔
3615
                    GDALResampleConvolutionVertical_2cols(
2,880,000✔
3616
                        padfHorizontalFiltered + j, nDstXSize, padfWeights,
2,880,000✔
3617
                        nSrcLineCount, dfVal1, dfVal2);
3618
                    pafDstScanline[iFilteredPixelOff] =
5,760,010✔
3619
                        replaceValIfNodata(static_cast<Twork>(dfVal1));
2,880,000✔
3620
                    pafDstScanline[iFilteredPixelOff + 1] =
2,880,000✔
3621
                        replaceValIfNodata(static_cast<Twork>(dfVal2));
2,880,000✔
3622
                }
3623
                if (iFilteredPixelOff < nDstXSize)
7,206✔
3624
                {
3625
                    const double dfVal = GDALResampleConvolutionVertical(
2✔
3626
                        padfHorizontalFiltered + j, nDstXSize, padfWeights,
2✔
3627
                        nSrcLineCount);
3628
                    pafDstScanline[iFilteredPixelOff] =
2✔
3629
                        replaceValIfNodata(static_cast<Twork>(dfVal));
2✔
3630
                }
3631
            }
3632
        }
3633
        else
3634
        {
3635
            for (int iFilteredPixelOff = 0; iFilteredPixelOff < nDstXSize;
17,284,640✔
3636
                 ++iFilteredPixelOff)
3637
            {
3638
                double dfVal = 0.0;
17,251,205✔
3639
                dfWeightSum = 0.0;
17,251,205✔
3640
                size_t j = (nSrcLineStart - nChunkYOff) *
17,251,205✔
3641
                               static_cast<size_t>(nDstXSize) +
17,251,205✔
3642
                           iFilteredPixelOff;
17,251,205✔
3643
                if (bKernelWithNegativeWeights)
17,251,205✔
3644
                {
3645
                    int nConsecutiveValid = 0;
17,026,301✔
3646
                    int nMaxConsecutiveValid = 0;
17,026,301✔
3647
                    for (int i = 0; i < nSrcLineCount; ++i, j += nDstXSize)
121,244,321✔
3648
                    {
3649
                        const double dfWeight =
104,218,020✔
3650
                            padfWeights[i] *
104,218,020✔
3651
                            pabyChunkNodataMaskHorizontalFiltered[j];
3652
                        if (pabyChunkNodataMaskHorizontalFiltered[j])
104,218,020✔
3653
                        {
3654
                            nConsecutiveValid++;
41,787,737✔
3655
                        }
3656
                        else if (nConsecutiveValid)
62,429,783✔
3657
                        {
3658
                            nMaxConsecutiveValid = std::max(
199,248✔
3659
                                nMaxConsecutiveValid, nConsecutiveValid);
199,248✔
3660
                            nConsecutiveValid = 0;
199,248✔
3661
                        }
3662
                        dfVal += padfHorizontalFiltered[j] * dfWeight;
104,218,020✔
3663
                        dfWeightSum += dfWeight;
104,218,020✔
3664
                    }
3665
                    nMaxConsecutiveValid =
17,026,301✔
3666
                        std::max(nMaxConsecutiveValid, nConsecutiveValid);
17,026,301✔
3667
                    if (nMaxConsecutiveValid < nSrcLineCount / 2)
17,026,301✔
3668
                    {
3669
                        pafDstScanline[iFilteredPixelOff] =
8,839,831✔
3670
                            static_cast<Twork>(dfNoDataValue);
8,839,739✔
3671
                        continue;
8,839,831✔
3672
                    }
3673
                }
3674
                else
3675
                {
3676
                    for (int i = 0; i < nSrcLineCount; ++i, j += nDstXSize)
1,130,262✔
3677
                    {
3678
                        const double dfWeight =
905,432✔
3679
                            padfWeights[i] *
905,432✔
3680
                            pabyChunkNodataMaskHorizontalFiltered[j];
3681
                        dfVal += padfHorizontalFiltered[j] * dfWeight;
905,432✔
3682
                        dfWeightSum += dfWeight;
905,432✔
3683
                    }
3684
                }
3685
                if (dfWeightSum > 0.0)
8,411,324✔
3686
                {
3687
                    pafDstScanline[iFilteredPixelOff] = replaceValIfNodata(
8,395,283✔
3688
                        static_cast<Twork>(dfVal / dfWeightSum));
8,395,271✔
3689
                }
3690
                else
3691
                {
3692
                    pafDstScanline[iFilteredPixelOff] =
16,039✔
3693
                        static_cast<Twork>(dfNoDataValue);
16,015✔
3694
                }
3695
            }
3696
        }
3697

3698
        if (fMaxVal != 0.0f)
143,538✔
3699
        {
3700
            for (int i = 0; i < nDstXSize; ++i)
192,324✔
3701
            {
3702
                if (pafDstScanline[i] > fMaxVal)
192,088✔
3703
                    pafDstScanline[i] = fMaxVal;
96,022✔
3704
            }
3705
        }
3706

3707
        if (pafWrkScanline)
143,538✔
3708
        {
3709
            GDALCopyWords64(pafWrkScanline, eWrkDataType, nWrkDataTypeSize,
184,912✔
3710
                            static_cast<GByte *>(pDstBuffer) +
3711
                                static_cast<size_t>(iDstLine - nDstYOff) *
184,912✔
3712
                                    nDstXSize * nDstDataTypeSize,
184,912✔
3713
                            dstDataType, nDstDataTypeSize, nDstXSize);
3714
        }
3715
    }
3716

3717
    VSIFree(pafWrkScanline);
3,673✔
3718
    VSIFreeAligned(padfWeights);
3,673✔
3719
    VSIFree(padfHorizontalFiltered);
3,673✔
3720
    VSIFree(pabyChunkNodataMaskHorizontalFiltered);
3,673✔
3721

3722
    return CE_None;
3,673✔
3723
}
3724

3725
/** Convolution-based (BILINEAR, CUBIC, CUBICSPLINE, LANCZOS) chunk resampler.
 *
 * Selects the filter matching args.pszResampling, allocates the destination
 * buffer for the window [nDstXOff, nDstXOff2) x [nDstYOff, nDstYOff2) in
 * args.eOvrDataType, and forwards to the GDALResampleChunk_ConvolutionT
 * instantiation matching args.eWrkDataType.
 *
 * @param args                 Resampling parameters.
 * @param pChunk               Source samples, interpreted according to
 *                             args.eWrkDataType.
 * @param ppDstBuffer          Receives the newly allocated output buffer. It
 *                             is not released by this function, including on
 *                             failure paths taken after the allocation.
 * @param peDstBufferDataType  Receives args.eOvrDataType once the buffer has
 *                             been allocated.
 * @return the result of the typed worker, or CE_Failure if the resampling
 *         name or working data type is not handled, or if allocation fails.
 */
static CPLErr
GDALResampleChunk_Convolution(const GDALOverviewResampleArgs &args,
                              const void *pChunk, void **ppDstBuffer,
                              GDALDataType *peDstBufferDataType)
{
    // Translate the resampling name into the corresponding warper algorithm.
    GDALResampleAlg eResample;
    if (EQUAL(args.pszResampling, "BILINEAR"))
    {
        eResample = GRA_Bilinear;
    }
    else if (EQUAL(args.pszResampling, "CUBIC"))
    {
        eResample = GRA_Cubic;
    }
    else if (EQUAL(args.pszResampling, "CUBICSPLINE"))
    {
        eResample = GRA_CubicSpline;
    }
    else if (EQUAL(args.pszResampling, "LANCZOS"))
    {
        eResample = GRA_Lanczos;
    }
    else
    {
        // Callers are expected to route only the four names above here.
        CPLAssert(false);
        return CE_Failure;
    }

    // Only CUBIC and LANCZOS are flagged as kernels with negative weights.
    const bool bKernelWithNegativeWeights =
        eResample == GRA_Cubic || eResample == GRA_Lanczos;

    const int nKernelRadius = GWKGetFilterRadius(eResample);
    FilterFuncType pfnFilterFunc = GWKGetFilterFunc(eResample);
    const FilterFunc4ValuesType pfnFilterFunc4Values =
        GWKGetFilterFunc4Values(eResample);

    // Non-bilinear kernels may overshoot: when NBITS is narrower than the
    // unsigned integer output type, pass the largest representable value so
    // the worker can clamp to it. 0 means "no clamping".
    float fMaxVal = 0.f;
    const bool bUnsignedIntegerOutput = args.eOvrDataType == GDT_Byte ||
                                        args.eOvrDataType == GDT_UInt16 ||
                                        args.eOvrDataType == GDT_UInt32;
    if (eResample != GRA_Bilinear && args.nOvrNBITS > 0 &&
        bUnsignedIntegerOutput)
    {
        const int nBits = args.nOvrNBITS;
        // NBITS equal to the full type width needs no clamping.
        if (nBits != GDALGetDataTypeSize(args.eOvrDataType) && nBits < 32)
        {
            fMaxVal = static_cast<float>((1U << nBits) - 1);
        }
    }

    // Output buffer covering the requested destination window.
    void *const pDstBuffer = VSI_MALLOC3_VERBOSE(
        args.nDstXOff2 - args.nDstXOff, args.nDstYOff2 - args.nDstYOff,
        GDALGetDataTypeSizeBytes(args.eOvrDataType));
    *ppDstBuffer = pDstBuffer;
    if (pDstBuffer == nullptr)
    {
        return CE_Failure;
    }
    *peDstBufferDataType = args.eOvrDataType;

    // Dispatch on the working data type. Byte, UInt16 and Float32 inputs are
    // filtered with a float work type, Float64 input with a double one.
    const GDALDataType eWrkDT = args.eWrkDataType;
    if (eWrkDT == GDT_Byte)
    {
        return GDALResampleChunk_ConvolutionT<GByte, float, GDT_Float32>(
            args, static_cast<const GByte *>(pChunk), pDstBuffer,
            pfnFilterFunc, pfnFilterFunc4Values, nKernelRadius,
            bKernelWithNegativeWeights, fMaxVal);
    }
    if (eWrkDT == GDT_UInt16)
    {
        return GDALResampleChunk_ConvolutionT<GUInt16, float, GDT_Float32>(
            args, static_cast<const GUInt16 *>(pChunk), pDstBuffer,
            pfnFilterFunc, pfnFilterFunc4Values, nKernelRadius,
            bKernelWithNegativeWeights, fMaxVal);
    }
    if (eWrkDT == GDT_Float32)
    {
        return GDALResampleChunk_ConvolutionT<float, float, GDT_Float32>(
            args, static_cast<const float *>(pChunk), pDstBuffer,
            pfnFilterFunc, pfnFilterFunc4Values, nKernelRadius,
            bKernelWithNegativeWeights, fMaxVal);
    }
    if (eWrkDT == GDT_Float64)
    {
        return GDALResampleChunk_ConvolutionT<double, double, GDT_Float64>(
            args, static_cast<const double *>(pChunk), pDstBuffer,
            pfnFilterFunc, pfnFilterFunc4Values, nKernelRadius,
            bKernelWithNegativeWeights, fMaxVal);
    }

    // Any other working data type is not handled by this function.
    CPLAssert(false);
    return CE_Failure;
}
3820

3821
/************************************************************************/
3822
/*                       GDALResampleChunkC32R()                        */
3823
/************************************************************************/
3824

3825
static CPLErr GDALResampleChunkC32R(const int nSrcWidth, const int nSrcHeight,
2✔
3826
                                    const float *pafChunk, const int nChunkYOff,
3827
                                    const int nChunkYSize, const int nDstYOff,
3828
                                    const int nDstYOff2, const int nOvrXSize,
3829
                                    const int nOvrYSize, void **ppDstBuffer,
3830
                                    GDALDataType *peDstBufferDataType,
3831
                                    const char *pszResampling)
3832

3833
{
3834
    enum Method
3835
    {
3836
        NEAR,
3837
        AVERAGE,
3838
        AVERAGE_MAGPHASE,
3839
        RMS,
3840
    };
3841

3842
    Method eMethod = NEAR;
2✔
3843
    if (STARTS_WITH_CI(pszResampling, "NEAR"))
2✔
3844
    {
3845
        eMethod = NEAR;
×
3846
    }
3847
    else if (EQUAL(pszResampling, "AVERAGE_MAGPHASE"))
2✔
3848
    {
3849
        eMethod = AVERAGE_MAGPHASE;
×
3850
    }
3851
    else if (EQUAL(pszResampling, "RMS"))
2✔
3852
    {
3853
        eMethod = RMS;
2✔
3854
    }
3855
    else if (STARTS_WITH_CI(pszResampling, "AVER"))
×
3856
    {
3857
        eMethod = AVERAGE;
×
3858
    }
3859
    else
3860
    {
3861
        CPLError(
×
3862
            CE_Failure, CPLE_NotSupported,
3863
            "Resampling method %s is not supported for complex data types. "
3864
            "Only NEAREST, AVERAGE, AVERAGE_MAGPHASE and RMS are supported",
3865
            pszResampling);
3866
        return CE_Failure;
×
3867
    }
3868

3869
    const int nOXSize = nOvrXSize;
2✔
3870
    *ppDstBuffer = VSI_MALLOC3_VERBOSE(nOXSize, nDstYOff2 - nDstYOff,
2✔
3871
                                       GDALGetDataTypeSizeBytes(GDT_CFloat32));
3872
    if (*ppDstBuffer == nullptr)
2✔
3873
    {
3874
        return CE_Failure;
×
3875
    }
3876
    float *const pafDstBuffer = static_cast<float *>(*ppDstBuffer);
2✔
3877
    *peDstBufferDataType = GDT_CFloat32;
2✔
3878

3879
    const int nOYSize = nOvrYSize;
2✔
3880
    const double dfXRatioDstToSrc = static_cast<double>(nSrcWidth) / nOXSize;
2✔
3881
    const double dfYRatioDstToSrc = static_cast<double>(nSrcHeight) / nOYSize;
2✔
3882

3883
    /* ==================================================================== */
3884
    /*      Loop over destination scanlines.                                */
3885
    /* ==================================================================== */
3886
    for (int iDstLine = nDstYOff; iDstLine < nDstYOff2; ++iDstLine)
8✔
3887
    {
3888
        int nSrcYOff = static_cast<int>(0.5 + iDstLine * dfYRatioDstToSrc);
6✔
3889
        if (nSrcYOff < nChunkYOff)
6✔
3890
            nSrcYOff = nChunkYOff;
×
3891

3892
        int nSrcYOff2 =
6✔
3893
            static_cast<int>(0.5 + (iDstLine + 1) * dfYRatioDstToSrc);
6✔
3894
        if (nSrcYOff2 == nSrcYOff)
6✔
3895
            nSrcYOff2++;
×
3896

3897
        if (nSrcYOff2 > nSrcHeight || iDstLine == nOYSize - 1)
6✔
3898
        {
3899
            if (nSrcYOff == nSrcHeight && nSrcHeight - 1 >= nChunkYOff)
2✔
3900
                nSrcYOff = nSrcHeight - 1;
×
3901
            nSrcYOff2 = nSrcHeight;
2✔
3902
        }
3903
        if (nSrcYOff2 > nChunkYOff + nChunkYSize)
6✔
3904
            nSrcYOff2 = nChunkYOff + nChunkYSize;
×
3905

3906
        const float *const pafSrcScanline =
6✔
3907
            pafChunk + ((nSrcYOff - nChunkYOff) * nSrcWidth) * 2;
6✔
3908
        float *const pafDstScanline =
6✔
3909
            pafDstBuffer + (iDstLine - nDstYOff) * 2 * nOXSize;
6✔
3910

3911
        /* --------------------------------------------------------------------
3912
         */
3913
        /*      Loop over destination pixels */
3914
        /* --------------------------------------------------------------------
3915
         */
3916
        for (int iDstPixel = 0; iDstPixel < nOXSize; ++iDstPixel)
18✔
3917
        {
3918
            int nSrcXOff = static_cast<int>(0.5 + iDstPixel * dfXRatioDstToSrc);
12✔
3919
            int nSrcXOff2 =
12✔
3920
                static_cast<int>(0.5 + (iDstPixel + 1) * dfXRatioDstToSrc);
12✔
3921
            if (nSrcXOff2 == nSrcXOff)
12✔
3922
                nSrcXOff2++;
×
3923
            if (nSrcXOff2 > nSrcWidth || iDstPixel == nOXSize - 1)
12✔
3924
            {
3925
                if (nSrcXOff == nSrcWidth && nSrcWidth - 1 >= 0)
6✔
3926
                    nSrcXOff = nSrcWidth - 1;
×
3927
                nSrcXOff2 = nSrcWidth;
6✔
3928
            }
3929

3930
            if (eMethod == NEAR)
12✔
3931
            {
3932
                pafDstScanline[iDstPixel * 2] = pafSrcScanline[nSrcXOff * 2];
×
3933
                pafDstScanline[iDstPixel * 2 + 1] =
×
3934
                    pafSrcScanline[nSrcXOff * 2 + 1];
×
3935
            }
3936
            else if (eMethod == AVERAGE_MAGPHASE)
12✔
3937
            {
3938
                double dfTotalR = 0.0;
×
3939
                double dfTotalI = 0.0;
×
3940
                double dfTotalM = 0.0;
×
3941
                int nCount = 0;
×
3942

3943
                for (int iY = nSrcYOff; iY < nSrcYOff2; ++iY)
×
3944
                {
3945
                    for (int iX = nSrcXOff; iX < nSrcXOff2; ++iX)
×
3946
                    {
3947
                        const double dfR =
×
3948
                            pafSrcScanline[iX * 2 + static_cast<GPtrDiff_t>(
×
3949
                                                        iY - nSrcYOff) *
×
3950
                                                        nSrcWidth * 2];
×
3951
                        const double dfI =
×
3952
                            pafSrcScanline[iX * 2 +
×
3953
                                           static_cast<GPtrDiff_t>(iY -
×
3954
                                                                   nSrcYOff) *
×
3955
                                               nSrcWidth * 2 +
×
3956
                                           1];
×
3957
                        dfTotalR += dfR;
×
3958
                        dfTotalI += dfI;
×
3959
                        dfTotalM += std::hypot(dfR, dfI);
×
3960
                        ++nCount;
×
3961
                    }
3962
                }
3963

3964
                CPLAssert(nCount > 0);
×
3965
                if (nCount == 0)
×
3966
                {
3967
                    pafDstScanline[iDstPixel * 2] = 0.0;
×
3968
                    pafDstScanline[iDstPixel * 2 + 1] = 0.0;
×
3969
                }
3970
                else
3971
                {
3972
                    pafDstScanline[iDstPixel * 2] =
×
3973
                        static_cast<float>(dfTotalR / nCount);
×
3974
                    pafDstScanline[iDstPixel * 2 + 1] =
×
3975
                        static_cast<float>(dfTotalI / nCount);
×
3976
                    const double dfM =
3977
                        std::hypot(pafDstScanline[iDstPixel * 2],
×
3978
                                   pafDstScanline[iDstPixel * 2 + 1]);
×
3979
                    const double dfDesiredM = dfTotalM / nCount;
×
3980
                    double dfRatio = 1.0;
×
3981
                    if (dfM != 0.0)
×
3982
                        dfRatio = dfDesiredM / dfM;
×
3983

3984
                    pafDstScanline[iDstPixel * 2] *=
×
3985
                        static_cast<float>(dfRatio);
×
3986
                    pafDstScanline[iDstPixel * 2 + 1] *=
×
3987
                        static_cast<float>(dfRatio);
×
3988
                }
3989
            }
3990
            else if (eMethod == RMS)
12✔
3991
            {
3992
                double dfTotalR = 0.0;
12✔
3993
                double dfTotalI = 0.0;
12✔
3994
                int nCount = 0;
12✔
3995

3996
                for (int iY = nSrcYOff; iY < nSrcYOff2; ++iY)
36✔
3997
                {
3998
                    for (int iX = nSrcXOff; iX < nSrcXOff2; ++iX)
72✔
3999
                    {
4000
                        const double dfR =
48✔
4001
                            pafSrcScanline[iX * 2 + static_cast<GPtrDiff_t>(
48✔
4002
                                                        iY - nSrcYOff) *
48✔
4003
                                                        nSrcWidth * 2];
48✔
4004
                        const double dfI =
48✔
4005
                            pafSrcScanline[iX * 2 +
48✔
4006
                                           static_cast<GPtrDiff_t>(iY -
48✔
4007
                                                                   nSrcYOff) *
48✔
4008
                                               nSrcWidth * 2 +
48✔
4009
                                           1];
48✔
4010

4011
                        dfTotalR += SQUARE(dfR);
48✔
4012
                        dfTotalI += SQUARE(dfI);
48✔
4013

4014
                        ++nCount;
48✔
4015
                    }
4016
                }
4017

4018
                CPLAssert(nCount > 0);
12✔
4019
                if (nCount == 0)
12✔
4020
                {
4021
                    pafDstScanline[iDstPixel * 2] = 0.0;
×
4022
                    pafDstScanline[iDstPixel * 2 + 1] = 0.0;
×
4023
                }
4024
                else
4025
                {
4026
                    /* compute RMS */
4027
                    pafDstScanline[iDstPixel * 2] =
12✔
4028
                        static_cast<float>(sqrt(dfTotalR / nCount));
12✔
4029
                    pafDstScanline[iDstPixel * 2 + 1] =
12✔
4030
                        static_cast<float>(sqrt(dfTotalI / nCount));
12✔
4031
                }
4032
            }
4033
            else if (eMethod == AVERAGE)
×
4034
            {
4035
                double dfTotalR = 0.0;
×
4036
                double dfTotalI = 0.0;
×
4037
                int nCount = 0;
×
4038

4039
                for (int iY = nSrcYOff; iY < nSrcYOff2; ++iY)
×
4040
                {
4041
                    for (int iX = nSrcXOff; iX < nSrcXOff2; ++iX)
×
4042
                    {
4043
                        // TODO(schwehr): Maybe use std::complex?
4044
                        dfTotalR +=
×
4045
                            pafSrcScanline[iX * 2 + static_cast<GPtrDiff_t>(
×
4046
                                                        iY - nSrcYOff) *
×
4047
                                                        nSrcWidth * 2];
×
4048
                        dfTotalI += pafSrcScanline[iX * 2 +
×
4049
                                                   static_cast<GPtrDiff_t>(
×
4050
                                                       iY - nSrcYOff) *
×
4051
                                                       nSrcWidth * 2 +
×
4052
                                                   1];
×
4053
                        ++nCount;
×
4054
                    }
4055
                }
4056

4057
                CPLAssert(nCount > 0);
×
4058
                if (nCount == 0)
×
4059
                {
4060
                    pafDstScanline[iDstPixel * 2] = 0.0;
×
4061
                    pafDstScanline[iDstPixel * 2 + 1] = 0.0;
×
4062
                }
4063
                else
4064
                {
4065
                    pafDstScanline[iDstPixel * 2] =
×
4066
                        static_cast<float>(dfTotalR / nCount);
×
4067
                    pafDstScanline[iDstPixel * 2 + 1] =
×
4068
                        static_cast<float>(dfTotalI / nCount);
×
4069
                }
4070
            }
4071
        }
4072
    }
4073

4074
    return CE_None;
2✔
4075
}
4076

4077
/************************************************************************/
4078
/*                  GDALRegenerateCascadingOverviews()                  */
4079
/*                                                                      */
4080
/*      Generate a list of overviews in order from largest to           */
4081
/*      smallest, computing each from the next larger.                  */
4082
/************************************************************************/
4083

4084
// Regenerate nOverviews overview bands in cascade.
//
// papoOvrBands is reordered in place from the largest to the smallest band
// (by pixel count). The largest overview is then computed from poSrcBand and
// every following one from the overview just before it in that order, each
// through a single-band call to GDALRegenerateOverviewsEx().
//
// Progress is reported through pfnProgress/pProgressData, each overview
// getting a share of the [0,1] range proportional to its pixel count.
//
// Returns CE_None on success, or the first error returned by
// GDALRegenerateOverviewsEx(), in which case remaining overviews are not
// processed.
static CPLErr GDALRegenerateCascadingOverviews(
    GDALRasterBand *poSrcBand, int nOverviews, GDALRasterBand **papoOvrBands,
    const char *pszResampling, GDALProgressFunc pfnProgress,
    void *pProgressData, CSLConstList papszOptions)

{
    // Pixel count of a band. Computed in double precision everywhere (sort
    // key and progress weights) so that both uses agree and large rasters
    // are not collapsed by the 24-bit mantissa of a float.
    const auto GetPixelCount = [](GDALRasterBand *poBand)
    { return poBand->GetXSize() * static_cast<double>(poBand->GetYSize()); };

    /* -------------------------------------------------------------------- */
    /*      First, we must put the overviews in order from largest to       */
    /*      smallest.                                                       */
    /* -------------------------------------------------------------------- */
    // Stable sort: bands of identical pixel count keep their relative order.
    if (nOverviews > 1)
    {
        std::stable_sort(papoOvrBands, papoOvrBands + nOverviews,
                         [&GetPixelCount](GDALRasterBand *poFirst,
                                          GDALRasterBand *poSecond) {
                             return GetPixelCount(poFirst) >
                                    GetPixelCount(poSecond);
                         });
    }

    /* -------------------------------------------------------------------- */
    /*      Count total pixels so we can prepare appropriate scaled         */
    /*      progress functions.                                             */
    /* -------------------------------------------------------------------- */
    double dfTotalPixels = 0.0;

    for (int i = 0; i < nOverviews; ++i)
    {
        dfTotalPixels += GetPixelCount(papoOvrBands[i]);
    }

    /* -------------------------------------------------------------------- */
    /*      Generate all the bands.                                         */
    /* -------------------------------------------------------------------- */
    double dfPixelsProcessed = 0.0;

    for (int i = 0; i < nOverviews; ++i)
    {
        // The first (largest) overview derives from the source band, the
        // others from the previously generated, next larger, overview.
        GDALRasterBand *poBaseBand = poSrcBand;
        if (i != 0)
            poBaseBand = papoOvrBands[i - 1];

        const double dfPixels = GetPixelCount(papoOvrBands[i]);

        void *pScaledProgressData = GDALCreateScaledProgress(
            dfPixelsProcessed / dfTotalPixels,
            (dfPixelsProcessed + dfPixels) / dfTotalPixels, pfnProgress,
            pProgressData);

        const CPLErr eErr = GDALRegenerateOverviewsEx(
            poBaseBand, 1,
            reinterpret_cast<GDALRasterBandH *>(papoOvrBands) + i,
            pszResampling, GDALScaledProgress, pScaledProgressData,
            papszOptions);
        GDALDestroyScaledProgress(pScaledProgressData);

        if (eErr != CE_None)
            return eErr;

        dfPixelsProcessed += dfPixels;

        // Only do the bit2grayscale promotion on the base band.
        if (STARTS_WITH_CI(pszResampling,
                           "AVERAGE_BIT2G" /* AVERAGE_BIT2GRAYSCALE */))
            pszResampling = "AVERAGE";
    }

    return CE_None;
}
4161

4162
/************************************************************************/
4163
/*                    GDALGetResampleFunction()                         */
4164
/************************************************************************/
4165

4166
// Map a resampling method name to the chunk resampling function that
// implements it.
//
// Matching is case insensitive. "NEAR" and "AVER" are matched as prefixes,
// all other names must match exactly.
//
// If pnRadius is not null, it receives the kernel radius associated with the
// method: 0 by default, 1 for GAUSS, and the value of GWKGetFilterRadius()
// for the convolution based methods.
//
// Returns nullptr, after emitting a CE_Failure error, when the method name is
// not recognized.
GDALResampleFunction GDALGetResampleFunction(const char *pszResampling,
                                             int *pnRadius)
{
    const bool bWantRadius = pnRadius != nullptr;
    if (bWantRadius)
        *pnRadius = 0;

    if (STARTS_WITH_CI(pszResampling, "NEAR"))
        return GDALResampleChunk_Near;

    if (STARTS_WITH_CI(pszResampling, "AVER") || EQUAL(pszResampling, "RMS"))
        return GDALResampleChunk_AverageOrRMS;

    if (EQUAL(pszResampling, "GAUSS"))
    {
        if (bWantRadius)
            *pnRadius = 1;
        return GDALResampleChunk_Gauss;
    }

    if (EQUAL(pszResampling, "MODE"))
        return GDALResampleChunk_Mode;

    // Methods sharing the generic convolution implementation; they only
    // differ by the warper kernel used to derive the radius.
    struct ConvolutionKernel
    {
        const char *pszName;
        GDALResampleAlg eAlg;
    };

    static constexpr ConvolutionKernel asConvolutionKernels[] = {
        {"CUBIC", GRA_Cubic},
        {"CUBICSPLINE", GRA_CubicSpline},
        {"LANCZOS", GRA_Lanczos},
        {"BILINEAR", GRA_Bilinear},
    };

    for (const auto &sKernel : asConvolutionKernels)
    {
        if (!EQUAL(pszResampling, sKernel.pszName))
            continue;

        if (bWantRadius)
            *pnRadius = GWKGetFilterRadius(sKernel.eAlg);
        return GDALResampleChunk_Convolution;
    }

    CPLError(CE_Failure, CPLE_AppDefined,
             "GDALGetResampleFunction: Unsupported resampling method \"%s\".",
             pszResampling);
    return nullptr;
}
4217

4218
/************************************************************************/
4219
/*                      GDALGetOvrWorkDataType()                        */
4220
/************************************************************************/
4221

4222
// Return the data type of the working buffer to use when resampling a band
// of type eSrcDataType with the method pszResampling (case insensitive).
//
//  - NEAR* and MODE: the source data type.
//  - AVER*, RMS, CUBIC, CUBICSPLINE, LANCZOS and BILINEAR on a GDT_Byte or
//    GDT_UInt16 source: the source data type.
//  - GAUSS: GDT_Float64.
//  - Any other combination: GDT_Float32 when the source is GDT_Byte,
//    GDT_Int8, GDT_UInt16, GDT_Int16 or GDT_Float32, GDT_Float64 otherwise.
GDALDataType GDALGetOvrWorkDataType(const char *pszResampling,
                                    GDALDataType eSrcDataType)
{
    if (STARTS_WITH_CI(pszResampling, "NEAR") || EQUAL(pszResampling, "MODE"))
    {
        return eSrcDataType;
    }

    // MODE has already been handled above, so it does not need to be tested
    // again here. Byte and UInt16 accept the same set of methods.
    if ((eSrcDataType == GDT_Byte || eSrcDataType == GDT_UInt16) &&
        (STARTS_WITH_CI(pszResampling, "AVER") ||
         EQUAL(pszResampling, "RMS") || EQUAL(pszResampling, "CUBIC") ||
         EQUAL(pszResampling, "CUBICSPLINE") ||
         EQUAL(pszResampling, "LANCZOS") || EQUAL(pszResampling, "BILINEAR")))
    {
        return eSrcDataType;
    }

    if (EQUAL(pszResampling, "GAUSS"))
        return GDT_Float64;

    if (eSrcDataType == GDT_Byte || eSrcDataType == GDT_Int8 ||
        eSrcDataType == GDT_UInt16 || eSrcDataType == GDT_Int16 ||
        eSrcDataType == GDT_Float32)
    {
        return GDT_Float32;
    }
    return GDT_Float64;
}
4258

4259
namespace
4260
{
4261
// Structure to hold a pointer to free with CPLFree()
4262
struct PointerHolder
4263
{
4264
    void *ptr = nullptr;
4265

4266
    explicit PointerHolder(void *ptrIn) : ptr(ptrIn)
34,667✔
4267
    {
4268
    }
34,667✔
4269

4270
    ~PointerHolder()
34,669✔
4271
    {
34,669✔
4272
        CPLFree(ptr);
34,669✔
4273
    }
34,669✔
4274

4275
    PointerHolder(const PointerHolder &) = delete;
4276
    PointerHolder &operator=(const PointerHolder &) = delete;
4277
};
4278
}  // namespace
4279

4280
/************************************************************************/
4281
/*                      GDALRegenerateOverviews()                       */
4282
/************************************************************************/
4283

4284
/**
4285
 * \brief Generate downsampled overviews.
4286
 *
4287
 * This function will generate one or more overview images from a base image
4288
 * using the requested downsampling algorithm.  Its primary use is for
4289
 * generating overviews via GDALDataset::BuildOverviews(), but it can also be
4290
 * used to generate downsampled images in one file from another outside the
4291
 * overview architecture.
4292
 *
4293
 * The output bands need to exist in advance.
4294
 *
4295
 * The full set of resampling algorithms is documented in
4296
 * GDALDataset::BuildOverviews().
4297
 *
4298
 * This function will honour properly NODATA_VALUES tuples (special dataset
4299
 * metadata) so that only a given RGB triplet (in case of a RGB image) will be
4300
 * considered as the nodata value and not each value of the triplet
4301
 * independently per band.
4302
 *
4303
 * Starting with GDAL 3.2, the GDAL_NUM_THREADS configuration option can be set
4304
 * to "ALL_CPUS" or an integer value to specify the number of threads to use for
4305
 * overview computation.
4306
 *
4307
 * @param hSrcBand the source (base level) band.
4308
 * @param nOverviewCount the number of downsampled bands being generated.
4309
 * @param pahOvrBands the list of downsampled bands to be generated.
4310
 * @param pszResampling Resampling algorithm (e.g. "AVERAGE").
4311
 * @param pfnProgress progress report function.
4312
 * @param pProgressData progress function callback data.
4313
 * @return CE_None on success or CE_Failure on failure.
4314
 */
4315
CPLErr GDALRegenerateOverviews(GDALRasterBandH hSrcBand, int nOverviewCount,
                               GDALRasterBandH *pahOvrBands,
                               const char *pszResampling,
                               GDALProgressFunc pfnProgress,
                               void *pProgressData)

{
    // Plain forwarding to the extended entry point, with no option list.
    CSLConstList papszNoOptions = nullptr;
    return GDALRegenerateOverviewsEx(hSrcBand, nOverviewCount, pahOvrBands,
                                     pszResampling, pfnProgress, pProgressData,
                                     papszNoOptions);
}
4326

4327
/************************************************************************/
4328
/*                     GDALRegenerateOverviewsEx()                      */
4329
/************************************************************************/
4330

4331
/**
4332
 * \brief Generate downsampled overviews.
4333
 *
4334
 * This function will generate one or more overview images from a base image
4335
 * using the requested downsampling algorithm.  Its primary use is for
4336
 * generating overviews via GDALDataset::BuildOverviews(), but it can also be
4337
 * used to generate downsampled images in one file from another outside the
4338
 * overview architecture.
4339
 *
4340
 * The output bands need to exist in advance.
4341
 *
4342
 * The full set of resampling algorithms is documented in
4343
 * GDALDataset::BuildOverviews().
4344
 *
4345
 * This function will honour properly NODATA_VALUES tuples (special dataset
4346
 * metadata) so that only a given RGB triplet (in case of a RGB image) will be
4347
 * considered as the nodata value and not each value of the triplet
4348
 * independently per band.
4349
 *
4350
 * Starting with GDAL 3.2, the GDAL_NUM_THREADS configuration option can be set
4351
 * to "ALL_CPUS" or an integer value to specify the number of threads to use for
4352
 * overview computation.
4353
 *
4354
 * @param hSrcBand the source (base level) band.
4355
 * @param nOverviewCount the number of downsampled bands being generated.
4356
 * @param pahOvrBands the list of downsampled bands to be generated.
4357
 * @param pszResampling Resampling algorithm (e.g. "AVERAGE").
4358
 * @param pfnProgress progress report function.
4359
 * @param pProgressData progress function callback data.
4360
 * @param papszOptions NULL terminated list of options as key=value pairs, or
4361
 * NULL
4362
 * @return CE_None on success or CE_Failure on failure.
4363
 * @since GDAL 3.6
4364
 */
4365
CPLErr GDALRegenerateOverviewsEx(GDALRasterBandH hSrcBand, int nOverviewCount,
808✔
4366
                                 GDALRasterBandH *pahOvrBands,
4367
                                 const char *pszResampling,
4368
                                 GDALProgressFunc pfnProgress,
4369
                                 void *pProgressData, CSLConstList papszOptions)
4370

4371
{
4372
    GDALRasterBand *poSrcBand = GDALRasterBand::FromHandle(hSrcBand);
808✔
4373
    GDALRasterBand **papoOvrBands =
808✔
4374
        reinterpret_cast<GDALRasterBand **>(pahOvrBands);
4375

4376
    if (pfnProgress == nullptr)
808✔
4377
        pfnProgress = GDALDummyProgress;
252✔
4378

4379
    if (EQUAL(pszResampling, "NONE"))
808✔
4380
        return CE_None;
61✔
4381

4382
    int nKernelRadius = 0;
747✔
4383
    GDALResampleFunction pfnResampleFn =
4384
        GDALGetResampleFunction(pszResampling, &nKernelRadius);
747✔
4385

4386
    if (pfnResampleFn == nullptr)
747✔
4387
        return CE_Failure;
×
4388

4389
    /* -------------------------------------------------------------------- */
4390
    /*      Check color tables...                                           */
4391
    /* -------------------------------------------------------------------- */
4392
    GDALColorTable *poColorTable = nullptr;
747✔
4393

4394
    if ((STARTS_WITH_CI(pszResampling, "AVER") || EQUAL(pszResampling, "RMS") ||
386✔
4395
         EQUAL(pszResampling, "MODE") || EQUAL(pszResampling, "GAUSS")) &&
1,568✔
4396
        poSrcBand->GetColorInterpretation() == GCI_PaletteIndex)
446✔
4397
    {
4398
        poColorTable = poSrcBand->GetColorTable();
9✔
4399
        if (poColorTable != nullptr)
9✔
4400
        {
4401
            if (poColorTable->GetPaletteInterpretation() != GPI_RGB)
9✔
4402
            {
4403
                CPLError(CE_Warning, CPLE_AppDefined,
×
4404
                         "Computing overviews on palette index raster bands "
4405
                         "with a palette whose color interpretation is not RGB "
4406
                         "will probably lead to unexpected results.");
4407
                poColorTable = nullptr;
×
4408
            }
4409
            else if (poColorTable->IsIdentity())
9✔
4410
            {
4411
                poColorTable = nullptr;
×
4412
            }
4413
        }
4414
        else
4415
        {
4416
            CPLError(CE_Warning, CPLE_AppDefined,
×
4417
                     "Computing overviews on palette index raster bands "
4418
                     "without a palette will probably lead to unexpected "
4419
                     "results.");
4420
        }
4421
    }
4422
    // Not ready yet
4423
    else if ((EQUAL(pszResampling, "CUBIC") ||
2,160✔
4424
              EQUAL(pszResampling, "CUBICSPLINE") ||
684✔
4425
              EQUAL(pszResampling, "LANCZOS") ||
684✔
4426
              EQUAL(pszResampling, "BILINEAR")) &&
1,479✔
4427
             poSrcBand->GetColorInterpretation() == GCI_PaletteIndex)
57✔
4428
    {
4429
        CPLError(CE_Warning, CPLE_AppDefined,
×
4430
                 "Computing %s overviews on palette index raster bands "
4431
                 "will probably lead to unexpected results.",
4432
                 pszResampling);
4433
    }
4434

4435
    // If we have a nodata mask and we are doing something more complicated
4436
    // than nearest neighbouring, we have to fetch the nodata mask.
4437

4438
    GDALRasterBand *poMaskBand = nullptr;
747✔
4439
    bool bUseNoDataMask = false;
747✔
4440
    bool bCanUseCascaded = true;
747✔
4441

4442
    if (!STARTS_WITH_CI(pszResampling, "NEAR"))
747✔
4443
    {
4444
        // Special case if we are an alpha/mask band. We want it to be
4445
        // considered as the mask band to avoid alpha=0 to be taken into account
4446
        // in average computation.
4447
        if (poSrcBand->IsMaskBand())
503✔
4448
        {
4449
            poMaskBand = poSrcBand;
90✔
4450
            bUseNoDataMask = true;
90✔
4451
        }
4452
        else
4453
        {
4454
            poMaskBand = poSrcBand->GetMaskBand();
413✔
4455
            const int nMaskFlags = poSrcBand->GetMaskFlags();
413✔
4456
            bCanUseCascaded =
413✔
4457
                (nMaskFlags == GMF_NODATA || nMaskFlags == GMF_ALL_VALID);
413✔
4458
            bUseNoDataMask = (nMaskFlags & GMF_ALL_VALID) == 0;
413✔
4459
        }
4460
    }
4461

4462
    /* -------------------------------------------------------------------- */
4463
    /*      If we are operating on multiple overviews, and using            */
4464
    /*      averaging, lets do them in cascading order to reduce the        */
4465
    /*      amount of computation.                                          */
4466
    /* -------------------------------------------------------------------- */
4467

4468
    // In case the mask may be computed from another band of the dataset,
4469
    // we can't use cascaded generation, as the computation of the overviews
4470
    // of the band used for the mask band may not have yet occurred (#3033).
4471
    if ((STARTS_WITH_CI(pszResampling, "AVER") ||
747✔
4472
         EQUAL(pszResampling, "GAUSS") || EQUAL(pszResampling, "RMS") ||
386✔
4473
         EQUAL(pszResampling, "CUBIC") || EQUAL(pszResampling, "CUBICSPLINE") ||
355✔
4474
         EQUAL(pszResampling, "LANCZOS") || EQUAL(pszResampling, "BILINEAR") ||
301✔
4475
         EQUAL(pszResampling, "MODE")) &&
747✔
4476
        nOverviewCount > 1 && bCanUseCascaded)
42✔
4477
        return GDALRegenerateCascadingOverviews(
42✔
4478
            poSrcBand, nOverviewCount, papoOvrBands, pszResampling, pfnProgress,
4479
            pProgressData, papszOptions);
42✔
4480

4481
    /* -------------------------------------------------------------------- */
4482
    /*      Setup one horizontal swath to read from the raw buffer.         */
4483
    /* -------------------------------------------------------------------- */
4484
    int nFRXBlockSize = 0;
705✔
4485
    int nFRYBlockSize = 0;
705✔
4486
    poSrcBand->GetBlockSize(&nFRXBlockSize, &nFRYBlockSize);
705✔
4487

4488
    const GDALDataType eSrcDataType = poSrcBand->GetRasterDataType();
705✔
4489
    const bool bUseGenericResampleFn = STARTS_WITH_CI(pszResampling, "NEAR") ||
1,166✔
4490
                                       EQUAL(pszResampling, "MODE") ||
1,120✔
4491
                                       !GDALDataTypeIsComplex(eSrcDataType);
415✔
4492
    const GDALDataType eWrkDataType =
4493
        bUseGenericResampleFn
4494
            ? GDALGetOvrWorkDataType(pszResampling, eSrcDataType)
705✔
4495
            : GDT_CFloat32;
705✔
4496

4497
    const int nWidth = poSrcBand->GetXSize();
705✔
4498
    const int nHeight = poSrcBand->GetYSize();
705✔
4499

4500
    int nMaxOvrFactor = 1;
705✔
4501
    for (int iOverview = 0; iOverview < nOverviewCount; ++iOverview)
1,491✔
4502
    {
4503
        const int nDstWidth = papoOvrBands[iOverview]->GetXSize();
786✔
4504
        const int nDstHeight = papoOvrBands[iOverview]->GetYSize();
786✔
4505
        nMaxOvrFactor = std::max(
786✔
4506
            nMaxOvrFactor,
4507
            static_cast<int>(static_cast<double>(nWidth) / nDstWidth + 0.5));
786✔
4508
        nMaxOvrFactor = std::max(
786✔
4509
            nMaxOvrFactor,
4510
            static_cast<int>(static_cast<double>(nHeight) / nDstHeight + 0.5));
786✔
4511
    }
4512

4513
    int nFullResYChunk = nFRYBlockSize;
705✔
4514
    int nMaxChunkYSizeQueried = 0;
705✔
4515

4516
    const auto UpdateChunkHeightAndGetChunkSize =
4517
        [&nFullResYChunk, &nMaxChunkYSizeQueried, nKernelRadius, nMaxOvrFactor,
9,165✔
4518
         eWrkDataType, nWidth]()
27,495✔
4519
    {
4520
        // Make sure that round(nChunkYOff / nMaxOvrFactor) < round((nChunkYOff
4521
        // + nFullResYChunk) / nMaxOvrFactor)
4522
        nFullResYChunk = std::max(nFullResYChunk, 2 * nMaxOvrFactor);
9,165✔
4523
        nMaxChunkYSizeQueried =
9,165✔
4524
            nFullResYChunk + 2 * nKernelRadius * nMaxOvrFactor;
9,165✔
4525
        return static_cast<GIntBig>(GDALGetDataTypeSizeBytes(eWrkDataType)) *
9,165✔
4526
               nMaxChunkYSizeQueried * nWidth;
9,165✔
4527
    };
705✔
4528

4529
    // Only configurable for debug / testing
4530
    const char *pszChunkYSize =
4531
        CPLGetConfigOption("GDAL_OVR_CHUNKYSIZE", nullptr);
705✔
4532
    if (pszChunkYSize)
705✔
4533
    {
4534
        // coverity[tainted_data]
4535
        nFullResYChunk = atoi(pszChunkYSize);
×
4536
    }
4537

4538
    // Only configurable for debug / testing
4539
    const int nChunkMaxSize =
4540
        atoi(CPLGetConfigOption("GDAL_OVR_CHUNK_MAX_SIZE", "10485760"));
705✔
4541

4542
    auto nChunkSize = UpdateChunkHeightAndGetChunkSize();
705✔
4543
    if (nChunkSize > nChunkMaxSize)
705✔
4544
    {
4545
        if (poColorTable == nullptr && nFRXBlockSize < nWidth &&
3✔
4546
            !GDALDataTypeIsComplex(eSrcDataType) &&
9✔
4547
            (!STARTS_WITH_CI(pszResampling, "AVER") ||
3✔
4548
             EQUAL(pszResampling, "AVERAGE")))
×
4549
        {
4550
            // If this is tiled, then use GDALRegenerateOverviewsMultiBand()
4551
            // which use a block based strategy, which is much less memory
4552
            // hungry.
4553
            return GDALRegenerateOverviewsMultiBand(
3✔
4554
                1, &poSrcBand, nOverviewCount, &papoOvrBands, pszResampling,
4555
                pfnProgress, pProgressData, papszOptions);
3✔
4556
        }
4557
        else if (nOverviewCount > 1 && STARTS_WITH_CI(pszResampling, "NEAR"))
×
4558
        {
4559
            return GDALRegenerateCascadingOverviews(
×
4560
                poSrcBand, nOverviewCount, papoOvrBands, pszResampling,
4561
                pfnProgress, pProgressData, papszOptions);
×
4562
        }
4563
    }
4564
    else if (pszChunkYSize == nullptr)
702✔
4565
    {
4566
        // Try to get as close as possible to nChunkMaxSize
4567
        while (nChunkSize * 2 < nChunkMaxSize)
9,162✔
4568
        {
4569
            nFullResYChunk *= 2;
8,460✔
4570
            nChunkSize = UpdateChunkHeightAndGetChunkSize();
8,460✔
4571
        }
4572
    }
4573

4574
    int nHasNoData = 0;
702✔
4575
    const double dfNoDataValue = poSrcBand->GetNoDataValue(&nHasNoData);
702✔
4576
    const bool bHasNoData = CPL_TO_BOOL(nHasNoData);
702✔
4577
    const bool bPropagateNoData =
4578
        CPLTestBool(CPLGetConfigOption("GDAL_OVR_PROPAGATE_NODATA", "NO"));
702✔
4579

4580
    // Structure describing a resampling job
4581
    struct OvrJob
4582
    {
4583
        // Buffers to free when job is finished
4584
        std::shared_ptr<PointerHolder> oSrcMaskBufferHolder{};
4585
        std::shared_ptr<PointerHolder> oSrcBufferHolder{};
4586
        std::unique_ptr<PointerHolder> oDstBufferHolder{};
4587

4588
        GDALRasterBand *poDstBand = nullptr;
4589

4590
        // Input parameters of pfnResampleFn
4591
        GDALResampleFunction pfnResampleFn = nullptr;
4592
        int nSrcWidth = 0;
4593
        int nSrcHeight = 0;
4594
        int nDstWidth = 0;
4595
        GDALOverviewResampleArgs args{};
4596
        const void *pChunk = nullptr;
4597
        bool bUseGenericResampleFn = false;
4598

4599
        // Output values of resampling function
4600
        CPLErr eErr = CE_Failure;
4601
        void *pDstBuffer = nullptr;
4602
        GDALDataType eDstBufferDataType = GDT_Unknown;
4603

4604
        // Synchronization
4605
        bool bFinished = false;
4606
        std::mutex mutex{};
4607
        std::condition_variable cv{};
4608

4609
        void SetSrcMaskBufferHolder(
×
4610
            const std::shared_ptr<PointerHolder> &oSrcMaskBufferHolderIn)
4611
        {
4612
            oSrcMaskBufferHolder = oSrcMaskBufferHolderIn;
×
4613
        }
×
4614

4615
        void SetSrcBufferHolder(
×
4616
            const std::shared_ptr<PointerHolder> &oSrcBufferHolderIn)
4617
        {
4618
            oSrcBufferHolder = oSrcBufferHolderIn;
×
4619
        }
×
4620
    };
4621

4622
    // Thread function to resample
4623
    const auto JobResampleFunc = [](void *pData)
784✔
4624
    {
4625
        OvrJob *poJob = static_cast<OvrJob *>(pData);
784✔
4626

4627
        if (poJob->bUseGenericResampleFn)
784✔
4628
        {
4629
            poJob->eErr = poJob->pfnResampleFn(poJob->args, poJob->pChunk,
782✔
4630
                                               &(poJob->pDstBuffer),
4631
                                               &(poJob->eDstBufferDataType));
4632
        }
4633
        else
4634
        {
4635
            poJob->eErr = GDALResampleChunkC32R(
2✔
4636
                poJob->nSrcWidth, poJob->nSrcHeight,
4637
                static_cast<const float *>(poJob->pChunk),
2✔
4638
                poJob->args.nChunkYOff, poJob->args.nChunkYSize,
4639
                poJob->args.nDstYOff, poJob->args.nDstYOff2,
4640
                poJob->args.nOvrXSize, poJob->args.nOvrYSize,
4641
                &(poJob->pDstBuffer), &(poJob->eDstBufferDataType),
4642
                poJob->args.pszResampling);
4643
        }
4644

4645
        poJob->oDstBufferHolder =
4646
            std::make_unique<PointerHolder>(poJob->pDstBuffer);
784✔
4647

4648
        {
4649
            std::lock_guard<std::mutex> guard(poJob->mutex);
1,568✔
4650
            poJob->bFinished = true;
784✔
4651
            poJob->cv.notify_one();
784✔
4652
        }
4653
    };
784✔
4654

4655
    // Function to write resample data to target band
4656
    const auto WriteJobData = [](const OvrJob *poJob)
784✔
4657
    {
4658
        return poJob->poDstBand->RasterIO(
1,568✔
4659
            GF_Write, 0, poJob->args.nDstYOff, poJob->nDstWidth,
784✔
4660
            poJob->args.nDstYOff2 - poJob->args.nDstYOff, poJob->pDstBuffer,
784✔
4661
            poJob->nDstWidth, poJob->args.nDstYOff2 - poJob->args.nDstYOff,
784✔
4662
            poJob->eDstBufferDataType, 0, 0, nullptr);
784✔
4663
    };
4664

4665
    // Wait for completion of oldest job and serialize it
4666
    const auto WaitAndFinalizeOldestJob =
4667
        [WriteJobData](std::list<std::unique_ptr<OvrJob>> &jobList)
×
4668
    {
4669
        auto poOldestJob = jobList.front().get();
×
4670
        {
4671
            std::unique_lock<std::mutex> oGuard(poOldestJob->mutex);
×
4672
            // coverity[missing_lock:FALSE]
4673
            while (!poOldestJob->bFinished)
×
4674
            {
4675
                poOldestJob->cv.wait(oGuard);
×
4676
            }
4677
        }
4678
        CPLErr l_eErr = poOldestJob->eErr;
×
4679
        if (l_eErr == CE_None)
×
4680
        {
4681
            l_eErr = WriteJobData(poOldestJob);
×
4682
        }
4683

4684
        jobList.pop_front();
×
4685
        return l_eErr;
×
4686
    };
4687

4688
    // Queue of jobs
4689
    std::list<std::unique_ptr<OvrJob>> jobList;
1,404✔
4690

4691
    GByte *pabyChunkNodataMask = nullptr;
702✔
4692
    void *pChunk = nullptr;
702✔
4693

4694
    const char *pszThreads = CPLGetConfigOption("GDAL_NUM_THREADS", "1");
702✔
4695
    const int nThreads = std::max(1, std::min(128, EQUAL(pszThreads, "ALL_CPUS")
2,808✔
4696
                                                       ? CPLGetNumCPUs()
702✔
4697
                                                       : atoi(pszThreads)));
702✔
4698
    auto poThreadPool =
4699
        nThreads > 1 ? GDALGetGlobalThreadPool(nThreads) : nullptr;
702✔
4700
    auto poJobQueue = poThreadPool ? poThreadPool->CreateJobQueue()
4701
                                   : std::unique_ptr<CPLJobQueue>(nullptr);
1,404✔
4702

4703
    /* -------------------------------------------------------------------- */
4704
    /*      Loop over image operating on chunks.                            */
4705
    /* -------------------------------------------------------------------- */
4706
    int nChunkYOff = 0;
702✔
4707
    CPLErr eErr = CE_None;
702✔
4708

4709
    for (nChunkYOff = 0; nChunkYOff < nHeight && eErr == CE_None;
1,409✔
4710
         nChunkYOff += nFullResYChunk)
707✔
4711
    {
4712
        if (!pfnProgress(nChunkYOff / static_cast<double>(nHeight), nullptr,
707✔
4713
                         pProgressData))
4714
        {
4715
            CPLError(CE_Failure, CPLE_UserInterrupt, "User terminated");
×
4716
            eErr = CE_Failure;
×
4717
        }
4718

4719
        if (nFullResYChunk + nChunkYOff > nHeight)
707✔
4720
            nFullResYChunk = nHeight - nChunkYOff;
700✔
4721

4722
        int nChunkYOffQueried = nChunkYOff - nKernelRadius * nMaxOvrFactor;
707✔
4723
        int nChunkYSizeQueried =
707✔
4724
            nFullResYChunk + 2 * nKernelRadius * nMaxOvrFactor;
707✔
4725
        if (nChunkYOffQueried < 0)
707✔
4726
        {
4727
            nChunkYSizeQueried += nChunkYOffQueried;
62✔
4728
            nChunkYOffQueried = 0;
62✔
4729
        }
4730
        if (nChunkYOffQueried + nChunkYSizeQueried > nHeight)
707✔
4731
            nChunkYSizeQueried = nHeight - nChunkYOffQueried;
62✔
4732

4733
        // Avoid accumulating too many tasks and exhaust RAM
4734
        // Try to complete already finished jobs
4735
        while (eErr == CE_None && !jobList.empty())
707✔
4736
        {
4737
            auto poOldestJob = jobList.front().get();
×
4738
            {
4739
                std::lock_guard<std::mutex> oGuard(poOldestJob->mutex);
×
4740
                if (!poOldestJob->bFinished)
×
4741
                {
4742
                    break;
×
4743
                }
4744
            }
4745
            eErr = poOldestJob->eErr;
×
4746
            if (eErr == CE_None)
×
4747
            {
4748
                eErr = WriteJobData(poOldestJob);
×
4749
            }
4750

4751
            jobList.pop_front();
×
4752
        }
4753

4754
        // And in case we have saturated the number of threads,
4755
        // wait for completion of tasks to go below the threshold.
4756
        while (eErr == CE_None &&
1,414✔
4757
               jobList.size() >= static_cast<size_t>(nThreads))
707✔
4758
        {
4759
            eErr = WaitAndFinalizeOldestJob(jobList);
×
4760
        }
4761

4762
        // (Re)allocate buffers if needed
4763
        if (pChunk == nullptr)
707✔
4764
        {
4765
            pChunk = VSI_MALLOC3_VERBOSE(GDALGetDataTypeSizeBytes(eWrkDataType),
702✔
4766
                                         nMaxChunkYSizeQueried, nWidth);
4767
        }
4768
        if (bUseNoDataMask && pabyChunkNodataMask == nullptr)
707✔
4769
        {
4770
            pabyChunkNodataMask = static_cast<GByte *>(
4771
                VSI_MALLOC2_VERBOSE(nMaxChunkYSizeQueried, nWidth));
274✔
4772
        }
4773

4774
        if (pChunk == nullptr ||
707✔
4775
            (bUseNoDataMask && pabyChunkNodataMask == nullptr))
274✔
4776
        {
4777
            CPLFree(pChunk);
×
4778
            CPLFree(pabyChunkNodataMask);
×
4779
            return CE_Failure;
×
4780
        }
4781

4782
        // Read chunk.
4783
        if (eErr == CE_None)
707✔
4784
            eErr = poSrcBand->RasterIO(GF_Read, 0, nChunkYOffQueried, nWidth,
707✔
4785
                                       nChunkYSizeQueried, pChunk, nWidth,
4786
                                       nChunkYSizeQueried, eWrkDataType, 0, 0,
4787
                                       nullptr);
4788
        if (eErr == CE_None && bUseNoDataMask)
707✔
4789
            eErr = poMaskBand->RasterIO(GF_Read, 0, nChunkYOffQueried, nWidth,
274✔
4790
                                        nChunkYSizeQueried, pabyChunkNodataMask,
4791
                                        nWidth, nChunkYSizeQueried, GDT_Byte, 0,
4792
                                        0, nullptr);
4793

4794
        // Special case to promote 1bit data to 8bit 0/255 values.
4795
        if (EQUAL(pszResampling, "AVERAGE_BIT2GRAYSCALE"))
707✔
4796
        {
4797
            if (eWrkDataType == GDT_Float32)
9✔
4798
            {
4799
                float *pafChunk = static_cast<float *>(pChunk);
×
4800
                for (GPtrDiff_t i = 0;
×
4801
                     i < static_cast<GPtrDiff_t>(nChunkYSizeQueried) * nWidth;
×
4802
                     i++)
4803
                {
4804
                    if (pafChunk[i] == 1.0)
×
4805
                        pafChunk[i] = 255.0;
×
4806
                }
4807
            }
4808
            else if (eWrkDataType == GDT_Byte)
9✔
4809
            {
4810
                GByte *pabyChunk = static_cast<GByte *>(pChunk);
9✔
4811
                for (GPtrDiff_t i = 0;
168,417✔
4812
                     i < static_cast<GPtrDiff_t>(nChunkYSizeQueried) * nWidth;
168,417✔
4813
                     i++)
4814
                {
4815
                    if (pabyChunk[i] == 1)
168,408✔
4816
                        pabyChunk[i] = 255;
127,437✔
4817
                }
4818
            }
4819
            else if (eWrkDataType == GDT_UInt16)
×
4820
            {
4821
                GUInt16 *pasChunk = static_cast<GUInt16 *>(pChunk);
×
4822
                for (GPtrDiff_t i = 0;
×
4823
                     i < static_cast<GPtrDiff_t>(nChunkYSizeQueried) * nWidth;
×
4824
                     i++)
4825
                {
4826
                    if (pasChunk[i] == 1)
×
4827
                        pasChunk[i] = 255;
×
4828
                }
4829
            }
4830
            else if (eWrkDataType == GDT_Float64)
×
4831
            {
4832
                double *padfChunk = static_cast<double *>(pChunk);
×
4833
                for (GPtrDiff_t i = 0;
×
4834
                     i < static_cast<GPtrDiff_t>(nChunkYSizeQueried) * nWidth;
×
4835
                     i++)
4836
                {
4837
                    if (padfChunk[i] == 1.0)
×
4838
                        padfChunk[i] = 255.0;
×
4839
                }
4840
            }
4841
            else
4842
            {
4843
                CPLAssert(false);
×
4844
            }
4845
        }
4846
        else if (EQUAL(pszResampling, "AVERAGE_BIT2GRAYSCALE_MINISWHITE"))
698✔
4847
        {
4848
            if (eWrkDataType == GDT_Float32)
×
4849
            {
4850
                float *pafChunk = static_cast<float *>(pChunk);
×
4851
                for (GPtrDiff_t i = 0;
×
4852
                     i < static_cast<GPtrDiff_t>(nChunkYSizeQueried) * nWidth;
×
4853
                     i++)
4854
                {
4855
                    if (pafChunk[i] == 1.0)
×
4856
                        pafChunk[i] = 0.0;
×
4857
                    else if (pafChunk[i] == 0.0)
×
4858
                        pafChunk[i] = 255.0;
×
4859
                }
4860
            }
4861
            else if (eWrkDataType == GDT_Byte)
×
4862
            {
4863
                GByte *pabyChunk = static_cast<GByte *>(pChunk);
×
4864
                for (GPtrDiff_t i = 0;
×
4865
                     i < static_cast<GPtrDiff_t>(nChunkYSizeQueried) * nWidth;
×
4866
                     i++)
4867
                {
4868
                    if (pabyChunk[i] == 1)
×
4869
                        pabyChunk[i] = 0;
×
4870
                    else if (pabyChunk[i] == 0)
×
4871
                        pabyChunk[i] = 255;
×
4872
                }
4873
            }
4874
            else if (eWrkDataType == GDT_UInt16)
×
4875
            {
4876
                GUInt16 *pasChunk = static_cast<GUInt16 *>(pChunk);
×
4877
                for (GPtrDiff_t i = 0;
×
4878
                     i < static_cast<GPtrDiff_t>(nChunkYSizeQueried) * nWidth;
×
4879
                     i++)
4880
                {
4881
                    if (pasChunk[i] == 1)
×
4882
                        pasChunk[i] = 0;
×
4883
                    else if (pasChunk[i] == 0)
×
4884
                        pasChunk[i] = 255;
×
4885
                }
4886
            }
4887
            else if (eWrkDataType == GDT_Float64)
×
4888
            {
4889
                double *padfChunk = static_cast<double *>(pChunk);
×
4890
                for (GPtrDiff_t i = 0;
×
4891
                     i < static_cast<GPtrDiff_t>(nChunkYSizeQueried) * nWidth;
×
4892
                     i++)
4893
                {
4894
                    if (padfChunk[i] == 1.0)
×
4895
                        padfChunk[i] = 0.0;
×
4896
                    else if (padfChunk[i] == 0.0)
×
4897
                        padfChunk[i] = 255.0;
×
4898
                }
4899
            }
4900
            else
4901
            {
4902
                CPLAssert(false);
×
4903
            }
4904
        }
4905

4906
        auto oSrcBufferHolder =
4907
            std::make_shared<PointerHolder>(poJobQueue ? pChunk : nullptr);
1,414✔
4908
        auto oSrcMaskBufferHolder = std::make_shared<PointerHolder>(
4909
            poJobQueue ? pabyChunkNodataMask : nullptr);
1,414✔
4910

4911
        for (int iOverview = 0; iOverview < nOverviewCount && eErr == CE_None;
1,491✔
4912
             ++iOverview)
4913
        {
4914
            GDALRasterBand *poDstBand = papoOvrBands[iOverview];
784✔
4915
            const int nDstWidth = poDstBand->GetXSize();
784✔
4916
            const int nDstHeight = poDstBand->GetYSize();
784✔
4917

4918
            const double dfXRatioDstToSrc =
784✔
4919
                static_cast<double>(nWidth) / nDstWidth;
784✔
4920
            const double dfYRatioDstToSrc =
784✔
4921
                static_cast<double>(nHeight) / nDstHeight;
784✔
4922

4923
            /* --------------------------------------------------------------------
4924
             */
4925
            /*      Figure out the line to start writing to, and the first line
4926
             */
4927
            /*      to not write to.  In theory this approach should ensure that
4928
             */
4929
            /*      every output line will be written if all input chunks are */
4930
            /*      processed. */
4931
            /* --------------------------------------------------------------------
4932
             */
4933
            int nDstYOff =
784✔
4934
                static_cast<int>(0.5 + nChunkYOff / dfYRatioDstToSrc);
784✔
4935
            if (nDstYOff == nDstHeight)
784✔
4936
                continue;
×
4937
            int nDstYOff2 = static_cast<int>(
784✔
4938
                0.5 + (nChunkYOff + nFullResYChunk) / dfYRatioDstToSrc);
784✔
4939

4940
            if (nChunkYOff + nFullResYChunk == nHeight)
784✔
4941
                nDstYOff2 = nDstHeight;
777✔
4942
#if DEBUG_VERBOSE
4943
            CPLDebug("GDAL",
4944
                     "Reading (%dx%d -> %dx%d) for output (%dx%d -> %dx%d)", 0,
4945
                     nChunkYOffQueried, nWidth, nChunkYSizeQueried, 0, nDstYOff,
4946
                     nDstWidth, nDstYOff2 - nDstYOff);
4947
#endif
4948

4949
            auto poJob = std::make_unique<OvrJob>();
1,568✔
4950
            poJob->pfnResampleFn = pfnResampleFn;
784✔
4951
            poJob->bUseGenericResampleFn = bUseGenericResampleFn;
784✔
4952
            poJob->args.eOvrDataType = poDstBand->GetRasterDataType();
784✔
4953
            poJob->args.nOvrXSize = poDstBand->GetXSize();
784✔
4954
            poJob->args.nOvrYSize = poDstBand->GetYSize();
784✔
4955
            const char *pszNBITS =
4956
                poDstBand->GetMetadataItem("NBITS", "IMAGE_STRUCTURE");
784✔
4957
            poJob->args.nOvrNBITS = pszNBITS ? atoi(pszNBITS) : 0;
784✔
4958
            poJob->args.dfXRatioDstToSrc = dfXRatioDstToSrc;
784✔
4959
            poJob->args.dfYRatioDstToSrc = dfYRatioDstToSrc;
784✔
4960
            poJob->args.eWrkDataType = eWrkDataType;
784✔
4961
            poJob->pChunk = pChunk;
784✔
4962
            poJob->args.pabyChunkNodataMask = pabyChunkNodataMask;
784✔
4963
            poJob->nSrcWidth = nWidth;
784✔
4964
            poJob->nSrcHeight = nHeight;
784✔
4965
            poJob->args.nChunkXOff = 0;
784✔
4966
            poJob->args.nChunkXSize = nWidth;
784✔
4967
            poJob->args.nChunkYOff = nChunkYOffQueried;
784✔
4968
            poJob->args.nChunkYSize = nChunkYSizeQueried;
784✔
4969
            poJob->nDstWidth = nDstWidth;
784✔
4970
            poJob->args.nDstXOff = 0;
784✔
4971
            poJob->args.nDstXOff2 = nDstWidth;
784✔
4972
            poJob->args.nDstYOff = nDstYOff;
784✔
4973
            poJob->args.nDstYOff2 = nDstYOff2;
784✔
4974
            poJob->poDstBand = poDstBand;
784✔
4975
            poJob->args.pszResampling = pszResampling;
784✔
4976
            poJob->args.bHasNoData = bHasNoData;
784✔
4977
            poJob->args.dfNoDataValue = dfNoDataValue;
784✔
4978
            poJob->args.poColorTable = poColorTable;
784✔
4979
            poJob->args.eSrcDataType = eSrcDataType;
784✔
4980
            poJob->args.bPropagateNoData = bPropagateNoData;
784✔
4981

4982
            if (poJobQueue)
784✔
4983
            {
4984
                poJob->SetSrcMaskBufferHolder(oSrcMaskBufferHolder);
×
4985
                poJob->SetSrcBufferHolder(oSrcBufferHolder);
×
4986
                poJobQueue->SubmitJob(JobResampleFunc, poJob.get());
×
4987
                jobList.emplace_back(std::move(poJob));
×
4988
            }
4989
            else
4990
            {
4991
                JobResampleFunc(poJob.get());
784✔
4992
                eErr = poJob->eErr;
784✔
4993
                if (eErr == CE_None)
784✔
4994
                {
4995
                    eErr = WriteJobData(poJob.get());
784✔
4996
                }
4997
            }
4998
        }
4999

5000
        if (poJobQueue)
707✔
5001
        {
5002
            pChunk = nullptr;
×
5003
            pabyChunkNodataMask = nullptr;
×
5004
        }
5005
    }
5006

5007
    VSIFree(pChunk);
702✔
5008
    VSIFree(pabyChunkNodataMask);
702✔
5009

5010
    // Wait for all pending jobs to complete
5011
    while (!jobList.empty())
702✔
5012
    {
5013
        const auto l_eErr = WaitAndFinalizeOldestJob(jobList);
×
5014
        if (l_eErr != CE_None && eErr == CE_None)
×
5015
            eErr = l_eErr;
×
5016
    }
5017

5018
    /* -------------------------------------------------------------------- */
5019
    /*      Renormalized overview mean / stddev if needed.                  */
5020
    /* -------------------------------------------------------------------- */
5021
    if (eErr == CE_None && EQUAL(pszResampling, "AVERAGE_MP"))
702✔
5022
    {
5023
        GDALOverviewMagnitudeCorrection(
×
5024
            poSrcBand, nOverviewCount,
5025
            reinterpret_cast<GDALRasterBandH *>(papoOvrBands),
5026
            GDALDummyProgress, nullptr);
5027
    }
5028

5029
    /* -------------------------------------------------------------------- */
5030
    /*      It can be important to flush out data to overviews.             */
5031
    /* -------------------------------------------------------------------- */
5032
    for (int iOverview = 0; eErr == CE_None && iOverview < nOverviewCount;
1,479✔
5033
         ++iOverview)
5034
    {
5035
        eErr = papoOvrBands[iOverview]->FlushCache(false);
777✔
5036
    }
5037

5038
    if (eErr == CE_None)
702✔
5039
        pfnProgress(1.0, nullptr, pProgressData);
702✔
5040

5041
    return eErr;
702✔
5042
}
5043

5044
/************************************************************************/
5045
/*            GDALRegenerateOverviewsMultiBand()                        */
5046
/************************************************************************/
5047

5048
/**
5049
 * \brief Variant of GDALRegenerateOverviews, specially dedicated for generating
5050
 * compressed pixel-interleaved overviews (JPEG-IN-TIFF for example)
5051
 *
5052
 * This function will generate one or more overview images from a base
5053
 * image using the requested downsampling algorithm.  Its primary use
5054
 * is for generating overviews via GDALDataset::BuildOverviews(), but it
5055
 * can also be used to generate downsampled images in one file from another
5056
 * outside the overview architecture.
5057
 *
5058
 * The output bands need to exist in advance and share the same characteristics
5059
 * (type, dimensions)
5060
 *
5061
 * The resampling algorithms supported for the moment are "NEAREST", "AVERAGE",
5062
 * "RMS", "GAUSS", "CUBIC", "CUBICSPLINE", "LANCZOS", "BILINEAR" and "MODE"
5063
 *
5064
 * It does not support color tables or complex data types.
5065
 *
5066
 * The pseudo-algorithm used by the function is :
5067
 *    for each overview
5068
 *       iterate on lines of the source by a step of deltay
5069
 *           iterate on columns of the source  by a step of deltax
5070
 *               read the source data of size deltax * deltay for all the bands
5071
 *               generate the corresponding overview block for all the bands
5072
 *
5073
 * This function will properly honour NODATA_VALUES tuples (special dataset
5074
 * metadata) so that only a given RGB triplet (in case of a RGB image) will be
5075
 * considered as the nodata value and not each value of the triplet
5076
 * independently per band.
5077
 *
5078
 * Starting with GDAL 3.2, the GDAL_NUM_THREADS configuration option can be set
5079
 * to "ALL_CPUS" or an integer value to specify the number of threads to use for
5080
 * overview computation.
5081
 *
5082
 * @param nBands the number of bands, size of papoSrcBands and size of
5083
 *               first dimension of papapoOverviewBands
5084
 * @param papoSrcBands the list of source bands to downsample
5085
 * @param nOverviews the number of downsampled overview levels being generated.
5086
 * @param papapoOverviewBands bidimensional array of bands. First dimension is
5087
 *                            indexed by nBands. Second dimension is indexed by
5088
 *                            nOverviews.
5089
 * @param pszResampling Resampling algorithm ("NEAREST", "AVERAGE", "RMS",
5090
 * "GAUSS", "CUBIC", "CUBICSPLINE", "LANCZOS", "BILINEAR" or "MODE").
5091
 * @param pfnProgress progress report function.
5092
 * @param pProgressData progress function callback data.
5093
 * @param papszOptions (GDAL >= 3.6) NULL terminated list of options as
5094
 *                     key=value pairs, or NULL
5095
 *                     Starting with GDAL 3.8, the XOFF, YOFF, XSIZE and YSIZE
5096
 *                     options can be specified to express that overviews should
5097
 *                     be regenerated only in the specified subset of the source
5098
 *                     dataset.
5099
 * @return CE_None on success or CE_Failure on failure.
5100
 */
5101

5102
CPLErr GDALRegenerateOverviewsMultiBand(
362✔
5103
    int nBands, GDALRasterBand *const *papoSrcBands, int nOverviews,
5104
    GDALRasterBand *const *const *papapoOverviewBands,
5105
    const char *pszResampling, GDALProgressFunc pfnProgress,
5106
    void *pProgressData, CSLConstList papszOptions)
5107
{
5108
    CPL_IGNORE_RET_VAL(papszOptions);
362✔
5109

5110
    if (pfnProgress == nullptr)
362✔
5111
        pfnProgress = GDALDummyProgress;
6✔
5112

5113
    if (EQUAL(pszResampling, "NONE"))
362✔
5114
        return CE_None;
2✔
5115

5116
    // Sanity checks.
5117
    if (!STARTS_WITH_CI(pszResampling, "NEAR") &&
360✔
5118
        !EQUAL(pszResampling, "RMS") && !EQUAL(pszResampling, "AVERAGE") &&
169✔
5119
        !EQUAL(pszResampling, "GAUSS") && !EQUAL(pszResampling, "CUBIC") &&
70✔
5120
        !EQUAL(pszResampling, "CUBICSPLINE") &&
18✔
5121
        !EQUAL(pszResampling, "LANCZOS") && !EQUAL(pszResampling, "BILINEAR") &&
17✔
5122
        !EQUAL(pszResampling, "MODE"))
5✔
5123
    {
5124
        CPLError(CE_Failure, CPLE_NotSupported,
×
5125
                 "GDALRegenerateOverviewsMultiBand: pszResampling='%s' "
5126
                 "not supported",
5127
                 pszResampling);
5128
        return CE_Failure;
×
5129
    }
5130

5131
    int nKernelRadius = 0;
360✔
5132
    GDALResampleFunction pfnResampleFn =
5133
        GDALGetResampleFunction(pszResampling, &nKernelRadius);
360✔
5134
    if (pfnResampleFn == nullptr)
360✔
5135
        return CE_Failure;
×
5136

5137
    const int nToplevelSrcWidth = papoSrcBands[0]->GetXSize();
360✔
5138
    const int nToplevelSrcHeight = papoSrcBands[0]->GetYSize();
360✔
5139
    if (nToplevelSrcWidth <= 0 || nToplevelSrcHeight <= 0)
360✔
5140
        return CE_None;
×
5141
    GDALDataType eDataType = papoSrcBands[0]->GetRasterDataType();
360✔
5142
    for (int iBand = 1; iBand < nBands; ++iBand)
662✔
5143
    {
5144
        if (papoSrcBands[iBand]->GetXSize() != nToplevelSrcWidth ||
604✔
5145
            papoSrcBands[iBand]->GetYSize() != nToplevelSrcHeight)
302✔
5146
        {
5147
            CPLError(
×
5148
                CE_Failure, CPLE_NotSupported,
5149
                "GDALRegenerateOverviewsMultiBand: all the source bands must "
5150
                "have the same dimensions");
5151
            return CE_Failure;
×
5152
        }
5153
        if (papoSrcBands[iBand]->GetRasterDataType() != eDataType)
302✔
5154
        {
5155
            CPLError(
×
5156
                CE_Failure, CPLE_NotSupported,
5157
                "GDALRegenerateOverviewsMultiBand: all the source bands must "
5158
                "have the same data type");
5159
            return CE_Failure;
×
5160
        }
5161
    }
5162

5163
    for (int iOverview = 0; iOverview < nOverviews; ++iOverview)
956✔
5164
    {
5165
        const auto poOvrFirstBand = papapoOverviewBands[0][iOverview];
596✔
5166
        const int nDstWidth = poOvrFirstBand->GetXSize();
596✔
5167
        const int nDstHeight = poOvrFirstBand->GetYSize();
596✔
5168
        for (int iBand = 1; iBand < nBands; ++iBand)
1,168✔
5169
        {
5170
            const auto poOvrBand = papapoOverviewBands[iBand][iOverview];
572✔
5171
            if (poOvrBand->GetXSize() != nDstWidth ||
1,144✔
5172
                poOvrBand->GetYSize() != nDstHeight)
572✔
5173
            {
5174
                CPLError(
×
5175
                    CE_Failure, CPLE_NotSupported,
5176
                    "GDALRegenerateOverviewsMultiBand: all the overviews bands "
5177
                    "of the same level must have the same dimensions");
5178
                return CE_Failure;
×
5179
            }
5180
            if (poOvrBand->GetRasterDataType() != eDataType)
572✔
5181
            {
5182
                CPLError(
×
5183
                    CE_Failure, CPLE_NotSupported,
5184
                    "GDALRegenerateOverviewsMultiBand: all the overviews bands "
5185
                    "must have the same data type as the source bands");
5186
                return CE_Failure;
×
5187
            }
5188
        }
5189
    }
5190

5191
    // First pass to compute the total number of pixels to write.
5192
    double dfTotalPixelCount = 0;
360✔
5193
    const int nSrcXOff = atoi(CSLFetchNameValueDef(papszOptions, "XOFF", "0"));
360✔
5194
    const int nSrcYOff = atoi(CSLFetchNameValueDef(papszOptions, "YOFF", "0"));
360✔
5195
    const int nSrcXSize = atoi(CSLFetchNameValueDef(
360✔
5196
        papszOptions, "XSIZE", CPLSPrintf("%d", nToplevelSrcWidth)));
5197
    const int nSrcYSize = atoi(CSLFetchNameValueDef(
360✔
5198
        papszOptions, "YSIZE", CPLSPrintf("%d", nToplevelSrcHeight)));
5199
    for (int iOverview = 0; iOverview < nOverviews; ++iOverview)
956✔
5200
    {
5201
        dfTotalPixelCount +=
596✔
5202
            static_cast<double>(nSrcXSize) / nToplevelSrcWidth *
1,192✔
5203
            papapoOverviewBands[0][iOverview]->GetXSize() *
596✔
5204
            static_cast<double>(nSrcYSize) / nToplevelSrcHeight *
1,192✔
5205
            papapoOverviewBands[0][iOverview]->GetYSize();
596✔
5206
    }
5207

5208
    const GDALDataType eWrkDataType =
5209
        GDALGetOvrWorkDataType(pszResampling, eDataType);
360✔
5210
    const int nWrkDataTypeSize = GDALGetDataTypeSizeBytes(eWrkDataType);
360✔
5211

5212
    const bool bIsMask = papoSrcBands[0]->IsMaskBand();
360✔
5213

5214
    // If we have a nodata mask and we are doing something more complicated
5215
    // than nearest neighbour resampling, we have to fetch the nodata mask.
5216
    const bool bUseNoDataMask =
5217
        !STARTS_WITH_CI(pszResampling, "NEAR") &&
523✔
5218
        (bIsMask || (papoSrcBands[0]->GetMaskFlags() & GMF_ALL_VALID) == 0);
163✔
5219

5220
    bool *const pabHasNoData =
5221
        static_cast<bool *>(VSI_MALLOC_VERBOSE(nBands * sizeof(bool)));
360✔
5222
    double *const padfNoDataValue =
5223
        static_cast<double *>(VSI_MALLOC_VERBOSE(nBands * sizeof(double)));
360✔
5224
    if (pabHasNoData == nullptr || padfNoDataValue == nullptr)
360✔
5225
    {
5226
        CPLFree(pabHasNoData);
×
5227
        CPLFree(padfNoDataValue);
×
5228
        return CE_Failure;
×
5229
    }
5230

5231
    for (int iBand = 0; iBand < nBands; ++iBand)
1,022✔
5232
    {
5233
        int nHasNoData = 0;
662✔
5234
        padfNoDataValue[iBand] =
1,324✔
5235
            papoSrcBands[iBand]->GetNoDataValue(&nHasNoData);
662✔
5236
        pabHasNoData[iBand] = CPL_TO_BOOL(nHasNoData);
662✔
5237
    }
5238
    const bool bPropagateNoData =
5239
        CPLTestBool(CPLGetConfigOption("GDAL_OVR_PROPAGATE_NODATA", "NO"));
360✔
5240

5241
    const char *pszThreads = CPLGetConfigOption("GDAL_NUM_THREADS", "1");
360✔
5242
    const int nThreads = std::max(1, std::min(128, EQUAL(pszThreads, "ALL_CPUS")
1,440✔
5243
                                                       ? CPLGetNumCPUs()
360✔
5244
                                                       : atoi(pszThreads)));
360✔
5245
    auto poThreadPool =
5246
        nThreads > 1 ? GDALGetGlobalThreadPool(nThreads) : nullptr;
360✔
5247
    auto poJobQueue = poThreadPool ? poThreadPool->CreateJobQueue()
5248
                                   : std::unique_ptr<CPLJobQueue>(nullptr);
360✔
5249

5250
    // Only configurable for debug / testing
5251
    const int nChunkMaxSize = std::max(
5252
        100, atoi(CPLGetConfigOption("GDAL_OVR_CHUNK_MAX_SIZE", "10485760")));
360✔
5253

5254
    // Second pass to do the real job.
5255
    double dfCurPixelCount = 0;
360✔
5256
    CPLErr eErr = CE_None;
360✔
5257
    for (int iOverview = 0; iOverview < nOverviews && eErr == CE_None;
955✔
5258
         ++iOverview)
5259
    {
5260
        int iSrcOverview = -1;  // -1 means the source bands.
595✔
5261

5262
        const int nDstTotalWidth =
5263
            papapoOverviewBands[0][iOverview]->GetXSize();
595✔
5264
        const int nDstTotalHeight =
5265
            papapoOverviewBands[0][iOverview]->GetYSize();
595✔
5266

5267
        // Compute the coordinates of the target region to refresh
5268
        constexpr double EPS = 1e-8;
595✔
5269
        const int nDstXOffStart = static_cast<int>(
595✔
5270
            static_cast<double>(nSrcXOff) / nToplevelSrcWidth * nDstTotalWidth +
595✔
5271
            EPS);
5272
        const int nDstXOffEnd =
5273
            std::min(static_cast<int>(
1,190✔
5274
                         std::ceil(static_cast<double>(nSrcXOff + nSrcXSize) /
595✔
5275
                                       nToplevelSrcWidth * nDstTotalWidth -
595✔
5276
                                   EPS)),
5277
                     nDstTotalWidth);
595✔
5278
        const int nDstWidth = nDstXOffEnd - nDstXOffStart;
595✔
5279
        const int nDstYOffStart =
595✔
5280
            static_cast<int>(static_cast<double>(nSrcYOff) /
595✔
5281
                                 nToplevelSrcHeight * nDstTotalHeight +
595✔
5282
                             EPS);
5283
        const int nDstYOffEnd =
5284
            std::min(static_cast<int>(
1,190✔
5285
                         std::ceil(static_cast<double>(nSrcYOff + nSrcYSize) /
595✔
5286
                                       nToplevelSrcHeight * nDstTotalHeight -
595✔
5287
                                   EPS)),
5288
                     nDstTotalHeight);
595✔
5289

5290
        // Try to use previous level of overview as the source to compute
5291
        // the next level.
5292
        int nSrcWidth = nToplevelSrcWidth;
595✔
5293
        int nSrcHeight = nToplevelSrcHeight;
595✔
5294
        if (iOverview > 0 &&
830✔
5295
            papapoOverviewBands[0][iOverview - 1]->GetXSize() > nDstTotalWidth)
235✔
5296
        {
5297
            nSrcWidth = papapoOverviewBands[0][iOverview - 1]->GetXSize();
227✔
5298
            nSrcHeight = papapoOverviewBands[0][iOverview - 1]->GetYSize();
227✔
5299
            iSrcOverview = iOverview - 1;
227✔
5300
        }
5301

5302
        const double dfXRatioDstToSrc =
595✔
5303
            static_cast<double>(nSrcWidth) / nDstTotalWidth;
595✔
5304
        const double dfYRatioDstToSrc =
595✔
5305
            static_cast<double>(nSrcHeight) / nDstTotalHeight;
595✔
5306

5307
        int nOvrFactor = std::max(static_cast<int>(0.5 + dfXRatioDstToSrc),
1,190✔
5308
                                  static_cast<int>(0.5 + dfYRatioDstToSrc));
595✔
5309
        if (nOvrFactor == 0)
595✔
5310
            nOvrFactor = 1;
×
5311

5312
        int nDstChunkXSize = 0;
595✔
5313
        int nDstChunkYSize = 0;
595✔
5314
        papapoOverviewBands[0][iOverview]->GetBlockSize(&nDstChunkXSize,
595✔
5315
                                                        &nDstChunkYSize);
5316

5317
        const char *pszDST_CHUNK_X_SIZE =
5318
            CSLFetchNameValue(papszOptions, "DST_CHUNK_X_SIZE");
595✔
5319
        const char *pszDST_CHUNK_Y_SIZE =
5320
            CSLFetchNameValue(papszOptions, "DST_CHUNK_Y_SIZE");
595✔
5321
        if (pszDST_CHUNK_X_SIZE && pszDST_CHUNK_Y_SIZE)
595✔
5322
        {
5323
            nDstChunkXSize = std::max(1, atoi(pszDST_CHUNK_X_SIZE));
12✔
5324
            nDstChunkYSize = std::max(1, atoi(pszDST_CHUNK_Y_SIZE));
12✔
5325
            CPLDebug("GDAL", "Using dst chunk size %d x %d", nDstChunkXSize,
12✔
5326
                     nDstChunkYSize);
5327
        }
5328

5329
        // Try to extend the chunk size so that the memory needed to acquire
5330
        // source pixels goes up to 10 MB.
5331
        // This can help for drivers that support multi-threaded reading
5332
        const int nFullResYChunk =
595✔
5333
            2 + static_cast<int>(nDstChunkYSize * dfYRatioDstToSrc);
595✔
5334
        const int nFullResYChunkQueried =
595✔
5335
            nFullResYChunk + 2 * nKernelRadius * nOvrFactor;
595✔
5336
        while (nDstChunkXSize < nDstWidth)
831✔
5337
        {
5338
            const int nFullResXChunk =
253✔
5339
                2 + static_cast<int>(2 * nDstChunkXSize * dfXRatioDstToSrc);
253✔
5340

5341
            const int nFullResXChunkQueried =
253✔
5342
                nFullResXChunk + 2 * nKernelRadius * nOvrFactor;
253✔
5343

5344
            if (static_cast<GIntBig>(nFullResXChunkQueried) *
253✔
5345
                    nFullResYChunkQueried * nBands * nWrkDataTypeSize >
253✔
5346
                nChunkMaxSize)
253✔
5347
            {
5348
                break;
17✔
5349
            }
5350

5351
            nDstChunkXSize *= 2;
236✔
5352
        }
5353
        nDstChunkXSize = std::min(nDstChunkXSize, nDstWidth);
595✔
5354

5355
        const int nFullResXChunk =
595✔
5356
            2 + static_cast<int>(nDstChunkXSize * dfXRatioDstToSrc);
595✔
5357
        const int nFullResXChunkQueried =
595✔
5358
            nFullResXChunk + 2 * nKernelRadius * nOvrFactor;
595✔
5359

5360
        // Make sure that the RAM requirements to acquire the source data does
5361
        // not exceed nChunkMaxSize
5362
        // If so, reduce the destination chunk size, generate overviews in a
5363
        // temporary dataset, and copy that temporary dataset over the target
5364
        // overview bands (to avoid issues with lossy compression)
5365
        const auto nMemRequirement =
595✔
5366
            static_cast<GIntBig>(nFullResXChunkQueried) *
595✔
5367
            nFullResYChunkQueried * nBands * nWrkDataTypeSize;
595✔
5368
        if (nMemRequirement > nChunkMaxSize &&
595✔
5369
            !(pszDST_CHUNK_X_SIZE && pszDST_CHUNK_Y_SIZE))
10✔
5370
        {
5371
            // Compute a smaller destination chunk size
5372
            const auto nOverShootFactor = nMemRequirement / nChunkMaxSize;
12✔
5373
            const auto nSqrtOverShootFactor = std::max<GIntBig>(
5374
                4, static_cast<GIntBig>(std::ceil(
24✔
5375
                       std::sqrt(static_cast<double>(nOverShootFactor)))));
12✔
5376
            const int nReducedDstChunkXSize = std::max(
5377
                1, static_cast<int>(nDstChunkXSize / nSqrtOverShootFactor));
12✔
5378
            const int nReducedDstChunkYSize = std::max(
5379
                1, static_cast<int>(nDstChunkYSize / nSqrtOverShootFactor));
12✔
5380
            if (nReducedDstChunkXSize < nDstChunkXSize ||
12✔
5381
                nReducedDstChunkYSize < nDstChunkYSize)
×
5382
            {
5383
                CPLStringList aosOptions(papszOptions);
12✔
5384
                aosOptions.SetNameValue(
5385
                    "DST_CHUNK_X_SIZE",
5386
                    CPLSPrintf("%d", nReducedDstChunkXSize));
12✔
5387
                aosOptions.SetNameValue(
5388
                    "DST_CHUNK_Y_SIZE",
5389
                    CPLSPrintf("%d", nReducedDstChunkYSize));
12✔
5390

5391
                const auto nTmpDSMemRequirement =
5392
                    static_cast<GIntBig>(nDstTotalWidth) * nDstTotalHeight *
12✔
5393
                    nBands * GDALGetDataTypeSizeBytes(eDataType);
12✔
5394
                std::unique_ptr<GDALDataset> poTmpDS;
×
5395
                // Config option mostly/only for autotest purposes
5396
                const char *pszGDAL_OVR_TEMP_DRIVER =
5397
                    CPLGetConfigOption("GDAL_OVR_TEMP_DRIVER", "");
12✔
5398
                if ((nTmpDSMemRequirement <= nChunkMaxSize &&
12✔
5399
                     !EQUAL(pszGDAL_OVR_TEMP_DRIVER, "GTIFF")) ||
2✔
5400
                    EQUAL(pszGDAL_OVR_TEMP_DRIVER, "MEM"))
10✔
5401
                {
5402
                    auto poTmpDrv =
5403
                        GetGDALDriverManager()->GetDriverByName("MEM");
11✔
5404
                    if (!poTmpDrv)
11✔
5405
                    {
5406
                        eErr = CE_Failure;
×
5407
                        break;
×
5408
                    }
5409
                    poTmpDS.reset(poTmpDrv->Create("", nDstTotalWidth,
11✔
5410
                                                   nDstTotalHeight, nBands,
5411
                                                   eDataType, nullptr));
11✔
5412
                }
5413
                else
5414
                {
5415
                    auto poTmpDrv =
5416
                        GetGDALDriverManager()->GetDriverByName("GTiff");
1✔
5417
                    if (!poTmpDrv)
1✔
5418
                    {
5419
                        eErr = CE_Failure;
×
5420
                        break;
×
5421
                    }
5422
                    std::string osTmpFilename;
2✔
5423
                    auto poDstDS = papapoOverviewBands[0][0]->GetDataset();
1✔
5424
                    if (poDstDS)
1✔
5425
                    {
5426
                        osTmpFilename = poDstDS->GetDescription();
1✔
5427
                        VSIStatBufL sStatBuf;
5428
                        if (!osTmpFilename.empty() &&
1✔
5429
                            VSIStatL(osTmpFilename.c_str(), &sStatBuf) == 0)
×
5430
                            osTmpFilename += "_tmp_ovr.tif";
×
5431
                    }
5432
                    if (osTmpFilename.empty())
1✔
5433
                    {
5434
                        osTmpFilename = CPLGenerateTempFilename(nullptr);
1✔
5435
                        osTmpFilename += ".tif";
1✔
5436
                    }
5437
                    CPLDebug("GDAL",
1✔
5438
                             "Creating temporary file %s of %d x %d x %d",
5439
                             osTmpFilename.c_str(), nDstTotalWidth,
5440
                             nDstTotalHeight, nBands);
5441
                    CPLStringList aosCO;
2✔
5442
                    poTmpDS.reset(poTmpDrv->Create(
1✔
5443
                        osTmpFilename.c_str(), nDstTotalWidth, nDstTotalHeight,
5444
                        nBands, eDataType, aosCO.List()));
1✔
5445
                    if (poTmpDS)
1✔
5446
                    {
5447
                        poTmpDS->MarkSuppressOnClose();
1✔
5448
                        VSIUnlink(osTmpFilename.c_str());
1✔
5449
                    }
5450
                }
5451
                if (!poTmpDS)
12✔
5452
                {
5453
                    eErr = CE_Failure;
×
5454
                    break;
×
5455
                }
5456

5457
                std::vector<GDALRasterBand **> apapoOverviewBands(nBands);
12✔
5458
                for (int i = 0; i < nBands; ++i)
27✔
5459
                {
5460
                    apapoOverviewBands[i] = static_cast<GDALRasterBand **>(
30✔
5461
                        CPLMalloc(sizeof(GDALRasterBand *)));
15✔
5462
                    apapoOverviewBands[i][0] = poTmpDS->GetRasterBand(i + 1);
15✔
5463
                }
5464

5465
                const double dfExtraPixels =
5466
                    static_cast<double>(nSrcXSize) / nToplevelSrcWidth *
24✔
5467
                    papapoOverviewBands[0][iOverview]->GetXSize() *
12✔
5468
                    static_cast<double>(nSrcYSize) / nToplevelSrcHeight *
24✔
5469
                    papapoOverviewBands[0][iOverview]->GetYSize();
12✔
5470

5471
                void *pScaledProgressData = GDALCreateScaledProgress(
24✔
5472
                    dfCurPixelCount / dfTotalPixelCount,
5473
                    (dfCurPixelCount + dfExtraPixels) / dfTotalPixelCount,
12✔
5474
                    pfnProgress, pProgressData);
5475

5476
                // Generate overviews in temporary dataset
5477
                eErr = GDALRegenerateOverviewsMultiBand(
12✔
5478
                    nBands, papoSrcBands, 1, apapoOverviewBands.data(),
12✔
5479
                    pszResampling, GDALScaledProgress, pScaledProgressData,
5480
                    aosOptions.List());
12✔
5481

5482
                GDALDestroyScaledProgress(pScaledProgressData);
12✔
5483

5484
                dfCurPixelCount += dfExtraPixels;
12✔
5485

5486
                for (int i = 0; i < nBands; ++i)
27✔
5487
                {
5488
                    CPLFree(apapoOverviewBands[i]);
15✔
5489
                }
5490

5491
                // Copy temporary dataset to destination overview bands
5492

5493
                if (eErr == CE_None)
12✔
5494
                {
5495
                    // Check if all papapoOverviewBands[][iOverview] bands point
5496
                    // to the same dataset. If so, we can use
5497
                    // GDALDatasetCopyWholeRaster()
5498
                    GDALDataset *poDstOvrBandDS =
5499
                        papapoOverviewBands[0][iOverview]->GetDataset();
12✔
5500
                    if (poDstOvrBandDS)
12✔
5501
                    {
5502
                        if (poDstOvrBandDS->GetRasterCount() != nBands ||
15✔
5503
                            poDstOvrBandDS->GetRasterBand(1) !=
3✔
5504
                                papapoOverviewBands[0][iOverview])
3✔
5505
                        {
5506
                            poDstOvrBandDS = nullptr;
9✔
5507
                        }
5508
                        else
5509
                        {
5510
                            for (int i = 1; poDstOvrBandDS && i < nBands; ++i)
6✔
5511
                            {
5512
                                GDALDataset *poThisDstOvrBandDS =
5513
                                    papapoOverviewBands[i][iOverview]
3✔
5514
                                        ->GetDataset();
3✔
5515
                                if (poThisDstOvrBandDS == nullptr ||
3✔
5516
                                    poThisDstOvrBandDS != poDstOvrBandDS ||
6✔
5517
                                    poThisDstOvrBandDS->GetRasterBand(i + 1) !=
3✔
5518
                                        papapoOverviewBands[i][iOverview])
3✔
5519
                                {
5520
                                    poDstOvrBandDS = nullptr;
×
5521
                                }
5522
                            }
5523
                        }
5524
                    }
5525
                    if (poDstOvrBandDS)
12✔
5526
                    {
5527
                        eErr = GDALDatasetCopyWholeRaster(
3✔
5528
                            GDALDataset::ToHandle(poTmpDS.get()),
5529
                            GDALDataset::ToHandle(poDstOvrBandDS), nullptr,
5530
                            nullptr, nullptr);
5531
                    }
5532
                    else
5533
                    {
5534
                        for (int i = 0; eErr == CE_None && i < nBands; ++i)
18✔
5535
                        {
5536
                            eErr = GDALRasterBandCopyWholeRaster(
9✔
5537
                                GDALRasterBand::ToHandle(
5538
                                    poTmpDS->GetRasterBand(i + 1)),
5539
                                GDALRasterBand::ToHandle(
5540
                                    papapoOverviewBands[i][iOverview]),
9✔
5541
                                nullptr, nullptr, nullptr);
5542
                        }
5543
                    }
5544
                }
5545

5546
                if (eErr != CE_None)
12✔
5547
                    break;
×
5548

5549
                continue;
12✔
5550
            }
5551
        }
5552

5553
        // Structure describing a resampling job
5554
        struct OvrJob
5555
        {
5556
            // Buffers to free when job is finished
5557
            std::unique_ptr<PointerHolder> oSrcMaskBufferHolder{};
5558
            std::unique_ptr<PointerHolder> oSrcBufferHolder{};
5559
            std::unique_ptr<PointerHolder> oDstBufferHolder{};
5560

5561
            GDALRasterBand *poDstBand = nullptr;
5562

5563
            // Input parameters of pfnResampleFn
5564
            GDALResampleFunction pfnResampleFn = nullptr;
5565
            GDALOverviewResampleArgs args{};
5566
            const void *pChunk = nullptr;
5567

5568
            // Output values of resampling function
5569
            CPLErr eErr = CE_Failure;
5570
            void *pDstBuffer = nullptr;
5571
            GDALDataType eDstBufferDataType = GDT_Unknown;
5572

5573
            // Synchronization
5574
            bool bFinished = false;
5575
            std::mutex mutex{};
5576
            std::condition_variable cv{};
5577
        };
5578

5579
        // Thread function to resample
5580
        const auto JobResampleFunc = [](void *pData)
16,247✔
5581
        {
5582
            OvrJob *poJob = static_cast<OvrJob *>(pData);
16,247✔
5583

5584
            poJob->eErr = poJob->pfnResampleFn(poJob->args, poJob->pChunk,
16,247✔
5585
                                               &(poJob->pDstBuffer),
5586
                                               &(poJob->eDstBufferDataType));
5587

5588
            poJob->oDstBufferHolder.reset(new PointerHolder(poJob->pDstBuffer));
16,246✔
5589

5590
            {
5591
                std::lock_guard<std::mutex> guard(poJob->mutex);
32,494✔
5592
                poJob->bFinished = true;
16,247✔
5593
                poJob->cv.notify_one();
16,247✔
5594
            }
5595
        };
16,247✔
5596

5597
        // Function to write resample data to target band
5598
        const auto WriteJobData = [](const OvrJob *poJob)
16,247✔
5599
        {
5600
            return poJob->poDstBand->RasterIO(
32,494✔
5601
                GF_Write, poJob->args.nDstXOff, poJob->args.nDstYOff,
16,247✔
5602
                poJob->args.nDstXOff2 - poJob->args.nDstXOff,
16,247✔
5603
                poJob->args.nDstYOff2 - poJob->args.nDstYOff, poJob->pDstBuffer,
16,247✔
5604
                poJob->args.nDstXOff2 - poJob->args.nDstXOff,
16,247✔
5605
                poJob->args.nDstYOff2 - poJob->args.nDstYOff,
16,247✔
5606
                poJob->eDstBufferDataType, 0, 0, nullptr);
16,247✔
5607
        };
5608

5609
        // Wait for completion of oldest job and serialize it
5610
        const auto WaitAndFinalizeOldestJob =
5611
            [WriteJobData](std::list<std::unique_ptr<OvrJob>> &jobList)
14✔
5612
        {
5613
            auto poOldestJob = jobList.front().get();
14✔
5614
            {
5615
                std::unique_lock<std::mutex> oGuard(poOldestJob->mutex);
28✔
5616
                // coverity[missing_lock:FALSE]
5617
                while (!poOldestJob->bFinished)
14✔
5618
                {
5619
                    poOldestJob->cv.wait(oGuard);
×
5620
                }
5621
            }
5622
            CPLErr l_eErr = poOldestJob->eErr;
14✔
5623
            if (l_eErr == CE_None)
14✔
5624
            {
5625
                l_eErr = WriteJobData(poOldestJob);
14✔
5626
            }
5627

5628
            jobList.pop_front();
14✔
5629
            return l_eErr;
14✔
5630
        };
5631

5632
        // Queue of jobs
5633
        std::list<std::unique_ptr<OvrJob>> jobList;
1,166✔
5634

5635
        std::vector<void *> apaChunk(nBands);
1,166✔
5636
        std::vector<GByte *> apabyChunkNoDataMask(nBands);
1,166✔
5637

5638
        // Iterate on destination overview, block by block.
5639
        for (int nDstYOff = nDstYOffStart;
583✔
5640
             nDstYOff < nDstYOffEnd && eErr == CE_None;
2,229✔
5641
             nDstYOff += nDstChunkYSize)
1,646✔
5642
        {
5643
            int nDstYCount;
5644
            if (nDstYOff + nDstChunkYSize <= nDstYOffEnd)
1,646✔
5645
                nDstYCount = nDstChunkYSize;
1,248✔
5646
            else
5647
                nDstYCount = nDstYOffEnd - nDstYOff;
398✔
5648

5649
            int nChunkYOff = static_cast<int>(nDstYOff * dfYRatioDstToSrc);
1,646✔
5650
            int nChunkYOff2 = static_cast<int>(
1,646✔
5651
                ceil((nDstYOff + nDstYCount) * dfYRatioDstToSrc));
1,646✔
5652
            if (nChunkYOff2 > nSrcHeight ||
1,646✔
5653
                nDstYOff + nDstYCount == nDstTotalHeight)
1,646✔
5654
                nChunkYOff2 = nSrcHeight;
580✔
5655
            int nYCount = nChunkYOff2 - nChunkYOff;
1,646✔
5656
            CPLAssert(nYCount <= nFullResYChunk);
1,646✔
5657

5658
            int nChunkYOffQueried = nChunkYOff - nKernelRadius * nOvrFactor;
1,646✔
5659
            int nChunkYSizeQueried = nYCount + 2 * nKernelRadius * nOvrFactor;
1,646✔
5660
            if (nChunkYOffQueried < 0)
1,646✔
5661
            {
5662
                nChunkYSizeQueried += nChunkYOffQueried;
126✔
5663
                nChunkYOffQueried = 0;
126✔
5664
            }
5665
            if (nChunkYSizeQueried + nChunkYOffQueried > nSrcHeight)
1,646✔
5666
                nChunkYSizeQueried = nSrcHeight - nChunkYOffQueried;
125✔
5667
            CPLAssert(nChunkYSizeQueried <= nFullResYChunkQueried);
1,646✔
5668

5669
            if (!pfnProgress(dfCurPixelCount / dfTotalPixelCount, nullptr,
1,646✔
5670
                             pProgressData))
5671
            {
5672
                CPLError(CE_Failure, CPLE_UserInterrupt, "User terminated");
1✔
5673
                eErr = CE_Failure;
1✔
5674
            }
5675

5676
            // Iterate on destination overview, block by block.
5677
            for (int nDstXOff = nDstXOffStart;
1,646✔
5678
                 nDstXOff < nDstXOffEnd && eErr == CE_None;
10,071✔
5679
                 nDstXOff += nDstChunkXSize)
8,425✔
5680
            {
5681
                int nDstXCount = 0;
8,425✔
5682
                if (nDstXOff + nDstChunkXSize <= nDstXOffEnd)
8,425✔
5683
                    nDstXCount = nDstChunkXSize;
8,228✔
5684
                else
5685
                    nDstXCount = nDstXOffEnd - nDstXOff;
197✔
5686

5687
                dfCurPixelCount += static_cast<double>(nDstXCount) * nDstYCount;
8,425✔
5688

5689
                int nChunkXOff = static_cast<int>(nDstXOff * dfXRatioDstToSrc);
8,425✔
5690
                int nChunkXOff2 = static_cast<int>(
8,425✔
5691
                    ceil((nDstXOff + nDstXCount) * dfXRatioDstToSrc));
8,425✔
5692
                if (nChunkXOff2 > nSrcWidth ||
8,425✔
5693
                    nDstXOff + nDstXCount == nDstTotalWidth)
8,425✔
5694
                    nChunkXOff2 = nSrcWidth;
1,644✔
5695
                const int nXCount = nChunkXOff2 - nChunkXOff;
8,425✔
5696
                CPLAssert(nXCount <= nFullResXChunk);
8,425✔
5697

5698
                int nChunkXOffQueried = nChunkXOff - nKernelRadius * nOvrFactor;
8,425✔
5699
                int nChunkXSizeQueried =
8,425✔
5700
                    nXCount + 2 * nKernelRadius * nOvrFactor;
8,425✔
5701
                if (nChunkXOffQueried < 0)
8,425✔
5702
                {
5703
                    nChunkXSizeQueried += nChunkXOffQueried;
186✔
5704
                    nChunkXOffQueried = 0;
186✔
5705
                }
5706
                if (nChunkXSizeQueried + nChunkXOffQueried > nSrcWidth)
8,425✔
5707
                    nChunkXSizeQueried = nSrcWidth - nChunkXOffQueried;
189✔
5708
                CPLAssert(nChunkXSizeQueried <= nFullResXChunkQueried);
8,425✔
5709
#if DEBUG_VERBOSE
5710
                CPLDebug("GDAL",
5711
                         "Reading (%dx%d -> %dx%d) for output (%dx%d -> %dx%d)",
5712
                         nChunkXOffQueried, nChunkYOffQueried,
5713
                         nChunkXSizeQueried, nChunkYSizeQueried, nDstXOff,
5714
                         nDstYOff, nDstXCount, nDstYCount);
5715
#endif
5716

5717
                // Avoid accumulating too many tasks and exhaust RAM
5718

5719
                // Try to complete already finished jobs
5720
                while (eErr == CE_None && !jobList.empty())
16,523✔
5721
                {
5722
                    auto poOldestJob = jobList.front().get();
8,352✔
5723
                    {
5724
                        std::lock_guard<std::mutex> oGuard(poOldestJob->mutex);
8,352✔
5725
                        if (!poOldestJob->bFinished)
8,352✔
5726
                        {
5727
                            break;
254✔
5728
                        }
5729
                    }
5730
                    eErr = poOldestJob->eErr;
8,098✔
5731
                    if (eErr == CE_None)
8,098✔
5732
                    {
5733
                        eErr = WriteJobData(poOldestJob);
8,098✔
5734
                    }
5735

5736
                    jobList.pop_front();
8,098✔
5737
                }
5738

5739
                // And in case we have saturated the number of threads,
5740
                // wait for completion of tasks to go below the threshold.
5741
                while (eErr == CE_None &&
16,850✔
5742
                       jobList.size() >= static_cast<size_t>(nThreads))
8,425✔
5743
                {
5744
                    eErr = WaitAndFinalizeOldestJob(jobList);
×
5745
                }
5746

5747
                // (Re)allocate buffers if needed
5748
                for (int iBand = 0; iBand < nBands; ++iBand)
24,673✔
5749
                {
5750
                    if (apaChunk[iBand] == nullptr)
16,248✔
5751
                    {
5752
                        apaChunk[iBand] = VSI_MALLOC3_VERBOSE(
9,250✔
5753
                            nFullResXChunkQueried, nFullResYChunkQueried,
5754
                            nWrkDataTypeSize);
5755
                        if (apaChunk[iBand] == nullptr)
9,250✔
5756
                        {
5757
                            eErr = CE_Failure;
×
5758
                        }
5759
                    }
5760
                    if (bUseNoDataMask &&
24,661✔
5761
                        apabyChunkNoDataMask[iBand] == nullptr)
8,413✔
5762
                    {
5763
                        apabyChunkNoDataMask[iBand] =
16,708✔
5764
                            static_cast<GByte *>(VSI_MALLOC2_VERBOSE(
8,354✔
5765
                                nFullResXChunkQueried, nFullResYChunkQueried));
5766
                        if (apabyChunkNoDataMask[iBand] == nullptr)
8,354✔
5767
                        {
5768
                            eErr = CE_Failure;
×
5769
                        }
5770
                    }
5771
                }
5772

5773
                // Read the source buffers for all the bands.
5774
                for (int iBand = 0; iBand < nBands && eErr == CE_None; ++iBand)
24,673✔
5775
                {
5776
                    GDALRasterBand *poSrcBand = nullptr;
16,248✔
5777
                    if (iSrcOverview == -1)
16,248✔
5778
                        poSrcBand = papoSrcBands[iBand];
15,352✔
5779
                    else
5780
                        poSrcBand = papapoOverviewBands[iBand][iSrcOverview];
896✔
5781
                    eErr = poSrcBand->RasterIO(
16,248✔
5782
                        GF_Read, nChunkXOffQueried, nChunkYOffQueried,
5783
                        nChunkXSizeQueried, nChunkYSizeQueried, apaChunk[iBand],
16,248✔
5784
                        nChunkXSizeQueried, nChunkYSizeQueried, eWrkDataType, 0,
5785
                        0, nullptr);
5786

5787
                    if (bUseNoDataMask && eErr == CE_None)
16,248✔
5788
                    {
5789
                        auto poMaskBand = poSrcBand->IsMaskBand()
8,413✔
5790
                                              ? poSrcBand
8,413✔
5791
                                              : poSrcBand->GetMaskBand();
6,312✔
5792
                        eErr = poMaskBand->RasterIO(
8,413✔
5793
                            GF_Read, nChunkXOffQueried, nChunkYOffQueried,
5794
                            nChunkXSizeQueried, nChunkYSizeQueried,
5795
                            apabyChunkNoDataMask[iBand], nChunkXSizeQueried,
8,413✔
5796
                            nChunkYSizeQueried, GDT_Byte, 0, 0, nullptr);
5797
                    }
5798
                }
5799

5800
                // Compute the resulting overview block.
5801
                for (int iBand = 0; iBand < nBands && eErr == CE_None; ++iBand)
24,672✔
5802
                {
5803
                    auto poJob = std::make_unique<OvrJob>();
32,494✔
5804
                    poJob->pfnResampleFn = pfnResampleFn;
16,247✔
5805
                    poJob->poDstBand = papapoOverviewBands[iBand][iOverview];
16,247✔
5806
                    poJob->args.eOvrDataType =
32,494✔
5807
                        poJob->poDstBand->GetRasterDataType();
16,247✔
5808
                    poJob->args.nOvrXSize = poJob->poDstBand->GetXSize();
16,247✔
5809
                    poJob->args.nOvrYSize = poJob->poDstBand->GetYSize();
16,247✔
5810
                    const char *pszNBITS = poJob->poDstBand->GetMetadataItem(
16,247✔
5811
                        "NBITS", "IMAGE_STRUCTURE");
16,247✔
5812
                    poJob->args.nOvrNBITS = pszNBITS ? atoi(pszNBITS) : 0;
16,247✔
5813
                    poJob->args.dfXRatioDstToSrc = dfXRatioDstToSrc;
16,247✔
5814
                    poJob->args.dfYRatioDstToSrc = dfYRatioDstToSrc;
16,247✔
5815
                    poJob->args.eWrkDataType = eWrkDataType;
16,247✔
5816
                    poJob->pChunk = apaChunk[iBand];
16,247✔
5817
                    poJob->args.pabyChunkNodataMask =
16,247✔
5818
                        apabyChunkNoDataMask[iBand];
16,247✔
5819
                    poJob->args.nChunkXOff = nChunkXOffQueried;
16,247✔
5820
                    poJob->args.nChunkXSize = nChunkXSizeQueried;
16,247✔
5821
                    poJob->args.nChunkYOff = nChunkYOffQueried;
16,247✔
5822
                    poJob->args.nChunkYSize = nChunkYSizeQueried;
16,247✔
5823
                    poJob->args.nDstXOff = nDstXOff;
16,247✔
5824
                    poJob->args.nDstXOff2 = nDstXOff + nDstXCount;
16,247✔
5825
                    poJob->args.nDstYOff = nDstYOff;
16,247✔
5826
                    poJob->args.nDstYOff2 = nDstYOff + nDstYCount;
16,247✔
5827
                    poJob->args.pszResampling = pszResampling;
16,247✔
5828
                    poJob->args.bHasNoData = pabHasNoData[iBand];
16,247✔
5829
                    poJob->args.dfNoDataValue = padfNoDataValue[iBand];
16,247✔
5830
                    poJob->args.eSrcDataType = eDataType;
16,247✔
5831
                    poJob->args.bPropagateNoData = bPropagateNoData;
16,247✔
5832

5833
                    if (poJobQueue)
16,247✔
5834
                    {
5835
                        poJob->oSrcMaskBufferHolder.reset(
16,224✔
5836
                            new PointerHolder(apabyChunkNoDataMask[iBand]));
8,112✔
5837
                        apabyChunkNoDataMask[iBand] = nullptr;
8,112✔
5838

5839
                        poJob->oSrcBufferHolder.reset(
16,224✔
5840
                            new PointerHolder(apaChunk[iBand]));
8,112✔
5841
                        apaChunk[iBand] = nullptr;
8,112✔
5842

5843
                        poJobQueue->SubmitJob(JobResampleFunc, poJob.get());
8,112✔
5844
                        jobList.emplace_back(std::move(poJob));
8,112✔
5845
                    }
5846
                    else
5847
                    {
5848
                        JobResampleFunc(poJob.get());
8,135✔
5849
                        eErr = poJob->eErr;
8,135✔
5850
                        if (eErr == CE_None)
8,135✔
5851
                        {
5852
                            eErr = WriteJobData(poJob.get());
8,135✔
5853
                        }
5854
                    }
5855
                }
5856
            }
5857
        }
5858

5859
        // Wait for all pending jobs to complete
5860
        while (!jobList.empty())
597✔
5861
        {
5862
            const auto l_eErr = WaitAndFinalizeOldestJob(jobList);
14✔
5863
            if (l_eErr != CE_None && eErr == CE_None)
14✔
5864
                eErr = l_eErr;
×
5865
        }
5866

5867
        // Flush the data to overviews.
5868
        for (int iBand = 0; iBand < nBands; ++iBand)
1,735✔
5869
        {
5870
            CPLFree(apaChunk[iBand]);
1,152✔
5871
            papapoOverviewBands[iBand][iOverview]->FlushCache(false);
1,152✔
5872

5873
            CPLFree(apabyChunkNoDataMask[iBand]);
1,152✔
5874
        }
5875
    }
5876

5877
    CPLFree(pabHasNoData);
360✔
5878
    CPLFree(padfNoDataValue);
360✔
5879

5880
    if (eErr == CE_None)
360✔
5881
        pfnProgress(1.0, nullptr, pProgressData);
358✔
5882

5883
    return eErr;
360✔
5884
}
5885

5886
/************************************************************************/
5887
/*            GDALRegenerateOverviewsMultiBand()                        */
5888
/************************************************************************/
5889

5890
/**
5891
 * \brief Variant of GDALRegenerateOverviews, specially dedicated for generating
5892
 * compressed pixel-interleaved overviews (JPEG-IN-TIFF for example)
5893
 *
5894
 * This function will generate one or more overview images from a base
5895
 * image using the requested downsampling algorithm.  Its primary use
5896
 * is for generating overviews via GDALDataset::BuildOverviews(), but it
5897
 * can also be used to generate downsampled images in one file from another
5898
 * outside the overview architecture.
5899
 *
5900
 * The output bands need to exist in advance and share the same characteristics
5901
 * (type, dimensions)
5902
 *
5903
 * The resampling algorithms supported for the moment are "NEAREST", "AVERAGE",
5904
 * "RMS", "GAUSS", "CUBIC", "CUBICSPLINE", "LANCZOS" and "BILINEAR"
5905
 *
5906
 * It does not support color tables or complex data types.
5907
 *
5908
 * The pseudo-algorithm used by the function is :
5909
 *    for each overview
5910
 *       iterate on lines of the source by a step of deltay
5911
 *           iterate on columns of the source  by a step of deltax
5912
 *               read the source data of size deltax * deltay for all the bands
5913
 *               generate the corresponding overview block for all the bands
5914
 *
5915
 * This function will honour properly NODATA_VALUES tuples (special dataset
5916
 * metadata) so that only a given RGB triplet (in case of a RGB image) will be
5917
 * considered as the nodata value and not each value of the triplet
5918
 * independently per band.
5919
 *
5920
 * The GDAL_NUM_THREADS configuration option can be set
5921
 * to "ALL_CPUS" or an integer value to specify the number of threads to use for
5922
 * overview computation.
5923
 *
5924
 * @param apoSrcBands the list of source bands to downsample
5925
 * @param aapoOverviewBands bidimensional array of bands. First dimension is
5926
 *                          indexed by bands. Second dimension is indexed by
5927
 *                          overview levels. All aapoOverviewBands[i] arrays
5928
 *                          must have the same size (i.e. same number of
5929
 *                          overviews)
5930
 * @param pszResampling Resampling algorithm ("NEAREST", "AVERAGE", "RMS",
5931
 * "GAUSS", "CUBIC", "CUBICSPLINE", "LANCZOS" or "BILINEAR").
5932
 * @param pfnProgress progress report function.
5933
 * @param pProgressData progress function callback data.
5934
 * @param papszOptions NULL terminated list of options as
5935
 *                     key=value pairs, or NULL
5936
 *                     The XOFF, YOFF, XSIZE and YSIZE
5937
 *                     options can be specified to express that overviews should
5938
 *                     be regenerated only in the specified subset of the source
5939
 *                     dataset.
5940
 * @return CE_None on success or CE_Failure on failure.
5941
 * @since 3.10
5942
 */
5943

5944
CPLErr GDALRegenerateOverviewsMultiBand(
    const std::vector<GDALRasterBand *> &apoSrcBands,
    const std::vector<std::vector<GDALRasterBand *>> &aapoOverviewBands,
    const char *pszResampling, GDALProgressFunc pfnProgress,
    void *pProgressData, CSLConstList papszOptions)
{
    // Consistency checks on the inputs: one list of overview bands per source
    // band, and every list holding the same number of overview levels.
    CPLAssert(apoSrcBands.size() == aapoOverviewBands.size());
    for (size_t iBand = 1; iBand < aapoOverviewBands.size(); ++iBand)
    {
        CPLAssert(aapoOverviewBands[iBand].size() ==
                  aapoOverviewBands[0].size());
    }

    // Nothing to regenerate.
    if (aapoOverviewBands.empty())
        return CE_None;

    // The pointer-based overload wants plain arrays of band pointers: build
    // one heap-allocated array per source band, mirroring each inner vector.
    std::vector<GDALRasterBand **> apapoBandLists;
    for (const auto &apoBandOverviews : aapoOverviewBands)
    {
        const size_t nLevels = apoBandOverviews.size();
        GDALRasterBand **papoLevels = static_cast<GDALRasterBand **>(
            CPLMalloc(nLevels * sizeof(GDALRasterBand *)));
        std::copy(apoBandOverviews.begin(), apoBandOverviews.end(),
                  papoLevels);
        apapoBandLists.push_back(papoLevels);
    }

    // Delegate the actual work to the pointer-based overload.
    const int nSrcBandCount = static_cast<int>(apoSrcBands.size());
    const int nLevelCount = static_cast<int>(aapoOverviewBands[0].size());
    const CPLErr eResult = GDALRegenerateOverviewsMultiBand(
        nSrcBandCount, apoSrcBands.data(), nLevelCount,
        apapoBandLists.data(), pszResampling, pfnProgress, pProgressData,
        papszOptions);

    // Release the temporary arrays (the bands themselves are not owned here).
    for (size_t iList = 0; iList < apapoBandLists.size(); ++iList)
    {
        CPLFree(apapoBandLists[iList]);
    }

    return eResult;
}
5979

5980
/************************************************************************/
5981
/*                        GDALComputeBandStats()                        */
5982
/************************************************************************/
5983

5984
/** Undocumented
5985
 * @param hSrcBand undocumented.
5986
 * @param nSampleStep Step between scanlines used to compute statistics.
5987
 *                    When nSampleStep is equal to 1, all scanlines will
5988
 *                    be processed.
5989
 * @param pdfMean undocumented.
5990
 * @param pdfStdDev undocumented.
5991
 * @param pfnProgress undocumented.
5992
 * @param pProgressData undocumented.
5993
 * @return undocumented
5994
 */
5995
CPLErr CPL_STDCALL GDALComputeBandStats(GDALRasterBandH hSrcBand,
16✔
5996
                                        int nSampleStep, double *pdfMean,
5997
                                        double *pdfStdDev,
5998
                                        GDALProgressFunc pfnProgress,
5999
                                        void *pProgressData)
6000

6001
{
6002
    VALIDATE_POINTER1(hSrcBand, "GDALComputeBandStats", CE_Failure);
16✔
6003

6004
    GDALRasterBand *poSrcBand = GDALRasterBand::FromHandle(hSrcBand);
16✔
6005

6006
    if (pfnProgress == nullptr)
16✔
6007
        pfnProgress = GDALDummyProgress;
16✔
6008

6009
    const int nWidth = poSrcBand->GetXSize();
16✔
6010
    const int nHeight = poSrcBand->GetYSize();
16✔
6011

6012
    if (nSampleStep >= nHeight || nSampleStep < 1)
16✔
6013
        nSampleStep = 1;
3✔
6014

6015
    GDALDataType eWrkType = GDT_Unknown;
16✔
6016
    float *pafData = nullptr;
16✔
6017
    GDALDataType eType = poSrcBand->GetRasterDataType();
16✔
6018
    const bool bComplex = CPL_TO_BOOL(GDALDataTypeIsComplex(eType));
16✔
6019
    if (bComplex)
16✔
6020
    {
6021
        pafData = static_cast<float *>(
6022
            VSI_MALLOC_VERBOSE(nWidth * 2 * sizeof(float)));
×
6023
        eWrkType = GDT_CFloat32;
×
6024
    }
6025
    else
6026
    {
6027
        pafData =
6028
            static_cast<float *>(VSI_MALLOC_VERBOSE(nWidth * sizeof(float)));
16✔
6029
        eWrkType = GDT_Float32;
16✔
6030
    }
6031

6032
    if (nWidth == 0 || pafData == nullptr)
16✔
6033
    {
6034
        VSIFree(pafData);
×
6035
        return CE_Failure;
×
6036
    }
6037

6038
    /* -------------------------------------------------------------------- */
6039
    /*      Loop over all sample lines.                                     */
6040
    /* -------------------------------------------------------------------- */
6041
    double dfSum = 0.0;
16✔
6042
    double dfSum2 = 0.0;
16✔
6043
    int iLine = 0;
16✔
6044
    GIntBig nSamples = 0;
16✔
6045

6046
    do
2,143✔
6047
    {
6048
        if (!pfnProgress(iLine / static_cast<double>(nHeight), nullptr,
2,159✔
6049
                         pProgressData))
6050
        {
6051
            CPLError(CE_Failure, CPLE_UserInterrupt, "User terminated");
×
6052
            CPLFree(pafData);
×
6053
            return CE_Failure;
×
6054
        }
6055

6056
        const CPLErr eErr =
6057
            poSrcBand->RasterIO(GF_Read, 0, iLine, nWidth, 1, pafData, nWidth,
2,159✔
6058
                                1, eWrkType, 0, 0, nullptr);
6059
        if (eErr != CE_None)
2,159✔
6060
        {
6061
            CPLFree(pafData);
1✔
6062
            return eErr;
1✔
6063
        }
6064

6065
        for (int iPixel = 0; iPixel < nWidth; ++iPixel)
725,204✔
6066
        {
6067
            float fValue = 0.0f;
723,046✔
6068

6069
            if (bComplex)
723,046✔
6070
            {
6071
                // Compute the magnitude of the complex value.
6072
                fValue =
6073
                    std::hypot(pafData[iPixel * 2], pafData[iPixel * 2 + 1]);
×
6074
            }
6075
            else
6076
            {
6077
                fValue = pafData[iPixel];
723,046✔
6078
            }
6079

6080
            dfSum += fValue;
723,046✔
6081
            dfSum2 += static_cast<double>(fValue) * fValue;
723,046✔
6082
        }
6083

6084
        nSamples += nWidth;
2,158✔
6085
        iLine += nSampleStep;
2,158✔
6086
    } while (iLine < nHeight);
2,158✔
6087

6088
    if (!pfnProgress(1.0, nullptr, pProgressData))
15✔
6089
    {
6090
        CPLError(CE_Failure, CPLE_UserInterrupt, "User terminated");
×
6091
        CPLFree(pafData);
×
6092
        return CE_Failure;
×
6093
    }
6094

6095
    /* -------------------------------------------------------------------- */
6096
    /*      Produce the result values.                                      */
6097
    /* -------------------------------------------------------------------- */
6098
    if (pdfMean != nullptr)
15✔
6099
        *pdfMean = dfSum / nSamples;
15✔
6100

6101
    if (pdfStdDev != nullptr)
15✔
6102
    {
6103
        const double dfMean = dfSum / nSamples;
15✔
6104

6105
        *pdfStdDev = sqrt((dfSum2 / nSamples) - (dfMean * dfMean));
15✔
6106
    }
6107

6108
    CPLFree(pafData);
15✔
6109

6110
    return CE_None;
15✔
6111
}
6112

6113
/************************************************************************/
6114
/*                  GDALOverviewMagnitudeCorrection()                   */
6115
/*                                                                      */
6116
/*      Correct the mean and standard deviation of the overviews of     */
6117
/*      the given band to match the base layer approximately.           */
6118
/************************************************************************/
6119

6120
/** Undocumented
6121
 * @param hBaseBand undocumented.
6122
 * @param nOverviewCount undocumented.
6123
 * @param pahOverviews undocumented.
6124
 * @param pfnProgress undocumented.
6125
 * @param pProgressData undocumented.
6126
 * @return undocumented
6127
 */
6128
CPLErr GDALOverviewMagnitudeCorrection(GDALRasterBandH hBaseBand,
×
6129
                                       int nOverviewCount,
6130
                                       GDALRasterBandH *pahOverviews,
6131
                                       GDALProgressFunc pfnProgress,
6132
                                       void *pProgressData)
6133

6134
{
6135
    VALIDATE_POINTER1(hBaseBand, "GDALOverviewMagnitudeCorrection", CE_Failure);
×
6136

6137
    /* -------------------------------------------------------------------- */
6138
    /*      Compute mean/stddev for source raster.                          */
6139
    /* -------------------------------------------------------------------- */
6140
    double dfOrigMean = 0.0;
×
6141
    double dfOrigStdDev = 0.0;
×
6142
    {
6143
        const CPLErr eErr =
6144
            GDALComputeBandStats(hBaseBand, 2, &dfOrigMean, &dfOrigStdDev,
×
6145
                                 pfnProgress, pProgressData);
6146

6147
        if (eErr != CE_None)
×
6148
            return eErr;
×
6149
    }
6150

6151
    /* -------------------------------------------------------------------- */
6152
    /*      Loop on overview bands.                                         */
6153
    /* -------------------------------------------------------------------- */
6154
    for (int iOverview = 0; iOverview < nOverviewCount; ++iOverview)
×
6155
    {
6156
        GDALRasterBand *poOverview =
6157
            GDALRasterBand::FromHandle(pahOverviews[iOverview]);
×
6158
        double dfOverviewMean, dfOverviewStdDev;
6159

6160
        const CPLErr eErr =
6161
            GDALComputeBandStats(pahOverviews[iOverview], 1, &dfOverviewMean,
×
6162
                                 &dfOverviewStdDev, pfnProgress, pProgressData);
6163

6164
        if (eErr != CE_None)
×
6165
            return eErr;
×
6166

6167
        double dfGain = 1.0;
×
6168
        if (dfOrigStdDev >= 0.0001)
×
6169
            dfGain = dfOrigStdDev / dfOverviewStdDev;
×
6170

6171
        /* --------------------------------------------------------------------
6172
         */
6173
        /*      Apply gain and offset. */
6174
        /* --------------------------------------------------------------------
6175
         */
6176
        const int nWidth = poOverview->GetXSize();
×
6177
        const int nHeight = poOverview->GetYSize();
×
6178

6179
        GDALDataType eWrkType = GDT_Unknown;
×
6180
        float *pafData = nullptr;
×
6181
        const GDALDataType eType = poOverview->GetRasterDataType();
×
6182
        const bool bComplex = CPL_TO_BOOL(GDALDataTypeIsComplex(eType));
×
6183
        if (bComplex)
×
6184
        {
6185
            pafData = static_cast<float *>(
6186
                VSI_MALLOC2_VERBOSE(nWidth, 2 * sizeof(float)));
×
6187
            eWrkType = GDT_CFloat32;
×
6188
        }
6189
        else
6190
        {
6191
            pafData = static_cast<float *>(
6192
                VSI_MALLOC2_VERBOSE(nWidth, sizeof(float)));
×
6193
            eWrkType = GDT_Float32;
×
6194
        }
6195

6196
        if (pafData == nullptr)
×
6197
        {
6198
            return CE_Failure;
×
6199
        }
6200

6201
        for (int iLine = 0; iLine < nHeight; ++iLine)
×
6202
        {
6203
            if (!pfnProgress(iLine / static_cast<double>(nHeight), nullptr,
×
6204
                             pProgressData))
6205
            {
6206
                CPLError(CE_Failure, CPLE_UserInterrupt, "User terminated");
×
6207
                CPLFree(pafData);
×
6208
                return CE_Failure;
×
6209
            }
6210

6211
            if (poOverview->RasterIO(GF_Read, 0, iLine, nWidth, 1, pafData,
×
6212
                                     nWidth, 1, eWrkType, 0, 0,
6213
                                     nullptr) != CE_None)
×
6214
            {
6215
                CPLFree(pafData);
×
6216
                return CE_Failure;
×
6217
            }
6218

6219
            for (int iPixel = 0; iPixel < nWidth; ++iPixel)
×
6220
            {
6221
                if (bComplex)
×
6222
                {
6223
                    pafData[iPixel * 2] *= static_cast<float>(dfGain);
×
6224
                    pafData[iPixel * 2 + 1] *= static_cast<float>(dfGain);
×
6225
                }
6226
                else
6227
                {
6228
                    pafData[iPixel] = static_cast<float>(
×
6229
                        (pafData[iPixel] - dfOverviewMean) * dfGain +
×
6230
                        dfOrigMean);
6231
                }
6232
            }
6233

6234
            if (poOverview->RasterIO(GF_Write, 0, iLine, nWidth, 1, pafData,
×
6235
                                     nWidth, 1, eWrkType, 0, 0,
6236
                                     nullptr) != CE_None)
×
6237
            {
6238
                CPLFree(pafData);
×
6239
                return CE_Failure;
×
6240
            }
6241
        }
6242

6243
        if (!pfnProgress(1.0, nullptr, pProgressData))
×
6244
        {
6245
            CPLError(CE_Failure, CPLE_UserInterrupt, "User terminated");
×
6246
            CPLFree(pafData);
×
6247
            return CE_Failure;
×
6248
        }
6249

6250
        CPLFree(pafData);
×
6251
    }
6252

6253
    return CE_None;
×
6254
}
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2025 Coveralls, Inc