• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

OSGeo / gdal / 13836648005

13 Mar 2025 02:09PM UTC coverage: 70.436% (-0.01%) from 70.446%
13836648005

push

github

web-flow
New Transform type: Homography (#11949)

Add new transform type, Homography.
Add functions to compute homography from a list of GCPs.
Add functions to serialize and deserialize a homography.
Automatically select homography transform when there are 4 or 5 GCPs present.

Fixes #11940

231 of 274 new or added lines in 2 files covered. (84.31%)

16257 existing lines in 42 files now uncovered.

553736 of 786159 relevant lines covered (70.44%)

221595.72 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

86.91
/gcore/overview.cpp
1

2
/******************************************************************************
3
 *
4
 * Project:  GDAL Core
5
 * Purpose:  Helper code to implement overview support in different drivers.
6
 * Author:   Frank Warmerdam, warmerdam@pobox.com
7
 *
8
 ******************************************************************************
9
 * Copyright (c) 2000, Frank Warmerdam
10
 * Copyright (c) 2007-2010, Even Rouault <even dot rouault at spatialys.com>
11
 *
12
 * SPDX-License-Identifier: MIT
13
 ****************************************************************************/
14

15
#include "cpl_port.h"
16
#include "gdal_priv.h"
17

18
#include <cmath>
19
#include <cstddef>
20
#include <cstdlib>
21

22
#include <algorithm>
23
#include <complex>
24
#include <condition_variable>
25
#include <limits>
26
#include <list>
27
#include <memory>
28
#include <mutex>
29
#include <vector>
30

31
#include "cpl_conv.h"
32
#include "cpl_error.h"
33
#include "cpl_float.h"
34
#include "cpl_progress.h"
35
#include "cpl_vsi.h"
36
#include "gdal.h"
37
#include "gdal_thread_pool.h"
38
#include "gdalwarper.h"
39

40
#ifdef USE_NEON_OPTIMIZATIONS
41
#include "include_sse2neon.h"
42
#define USE_SSE2
43

44
#include "gdalsse_priv.h"
45

46
// Restrict to 64bit processors because they are guaranteed to have SSE2,
47
// or if __AVX2__ is defined.
48
#elif defined(__x86_64) || defined(_M_X64) || defined(__AVX2__)
49
#define USE_SSE2
50

51
#include "gdalsse_priv.h"
52

53
#ifdef __SSE3__
54
#include <pmmintrin.h>
55
#endif
56
#ifdef __SSSE3__
57
#include <tmmintrin.h>
58
#endif
59
#ifdef __SSE4_1__
60
#include <smmintrin.h>
61
#endif
62
#ifdef __AVX2__
63
#include <immintrin.h>
64
#endif
65

66
#endif
67

68
// To be included after above USE_SSE2 and include gdalsse_priv.h
69
// to avoid build issue on Windows x86
70
#include "gdal_priv_templates.hpp"
71

72
/************************************************************************/
73
/*                      GDALResampleChunk_Near()                        */
74
/************************************************************************/
75

76
template <class T>
77
static CPLErr GDALResampleChunk_NearT(const GDALOverviewResampleArgs &args,
6,095✔
78
                                      const T *pChunk, T **ppDstBuffer)
79

80
{
81
    const double dfXRatioDstToSrc = args.dfXRatioDstToSrc;
6,095✔
82
    const double dfYRatioDstToSrc = args.dfYRatioDstToSrc;
6,095✔
83
    const GDALDataType eWrkDataType = args.eWrkDataType;
6,095✔
84
    const int nChunkXOff = args.nChunkXOff;
6,095✔
85
    const int nChunkXSize = args.nChunkXSize;
6,095✔
86
    const int nChunkYOff = args.nChunkYOff;
6,095✔
87
    const int nDstXOff = args.nDstXOff;
6,095✔
88
    const int nDstXOff2 = args.nDstXOff2;
6,095✔
89
    const int nDstYOff = args.nDstYOff;
6,095✔
90
    const int nDstYOff2 = args.nDstYOff2;
6,095✔
91
    const int nDstXWidth = nDstXOff2 - nDstXOff;
6,095✔
92

93
    /* -------------------------------------------------------------------- */
94
    /*      Allocate buffers.                                               */
95
    /* -------------------------------------------------------------------- */
96
    *ppDstBuffer = static_cast<T *>(
6,095✔
97
        VSI_MALLOC3_VERBOSE(nDstXWidth, nDstYOff2 - nDstYOff,
6,095✔
98
                            GDALGetDataTypeSizeBytes(eWrkDataType)));
99
    if (*ppDstBuffer == nullptr)
6,095✔
100
    {
101
        return CE_Failure;
×
102
    }
103
    T *const pDstBuffer = *ppDstBuffer;
6,095✔
104

105
    int *panSrcXOff =
106
        static_cast<int *>(VSI_MALLOC_VERBOSE(nDstXWidth * sizeof(int)));
6,095✔
107

108
    if (panSrcXOff == nullptr)
6,095✔
109
    {
110
        VSIFree(panSrcXOff);
×
111
        return CE_Failure;
×
112
    }
113

114
    /* ==================================================================== */
115
    /*      Precompute inner loop constants.                                */
116
    /* ==================================================================== */
117
    for (int iDstPixel = nDstXOff; iDstPixel < nDstXOff2; ++iDstPixel)
592,860✔
118
    {
119
        int nSrcXOff = static_cast<int>(0.5 + iDstPixel * dfXRatioDstToSrc);
586,765✔
120
        if (nSrcXOff < nChunkXOff)
586,765✔
121
            nSrcXOff = nChunkXOff;
×
122

123
        panSrcXOff[iDstPixel - nDstXOff] = nSrcXOff;
586,765✔
124
    }
125

126
    /* ==================================================================== */
127
    /*      Loop over destination scanlines.                                */
128
    /* ==================================================================== */
129
    for (int iDstLine = nDstYOff; iDstLine < nDstYOff2; ++iDstLine)
216,591✔
130
    {
131
        int nSrcYOff = static_cast<int>(0.5 + iDstLine * dfYRatioDstToSrc);
210,496✔
132
        if (nSrcYOff < nChunkYOff)
210,496✔
133
            nSrcYOff = nChunkYOff;
×
134

135
        const T *const pSrcScanline =
210,496✔
136
            pChunk +
137
            (static_cast<GPtrDiff_t>(nSrcYOff - nChunkYOff) * nChunkXSize) -
210,496✔
138
            nChunkXOff;
208,026✔
139

140
        /* --------------------------------------------------------------------
141
         */
142
        /*      Loop over destination pixels */
143
        /* --------------------------------------------------------------------
144
         */
145
        T *pDstScanline = pDstBuffer + (iDstLine - nDstYOff) * nDstXWidth;
210,496✔
146
        for (int iDstPixel = 0; iDstPixel < nDstXWidth; ++iDstPixel)
119,221,034✔
147
        {
148
            pDstScanline[iDstPixel] = pSrcScanline[panSrcXOff[iDstPixel]];
119,010,564✔
149
        }
150
    }
151

152
    CPLFree(panSrcXOff);
6,095✔
153

154
    return CE_None;
6,095✔
155
}
156

157
/** Type-erased entry point for nearest-neighbour chunk resampling.
 *
 * Reports args.eWrkDataType through peDstBufferDataType, then forwards to
 * GDALResampleChunk_NearT instantiated on an element type that has the same
 * byte size as the working data type. Values are only copied, so the size is
 * all that matters.
 *
 * @return The result of GDALResampleChunk_NearT, or CE_Failure for
 *         GDT_Unknown / GDT_TypeCount.
 */
static CPLErr GDALResampleChunk_Near(const GDALOverviewResampleArgs &args,
                                     const void *pChunk, void **ppDstBuffer,
                                     GDALDataType *peDstBufferDataType)
{
    const GDALDataType eType = args.eWrkDataType;
    *peDstBufferDataType = eType;

    // Forward to the typed implementation; the tag argument only selects the
    // element type used to reinterpret the untyped buffers.
    const auto resampleAs = [&args, pChunk, ppDstBuffer](auto elementTag)
    {
        using Element = decltype(elementTag);
        return GDALResampleChunk_NearT(
            args, static_cast<const Element *>(pChunk),
            reinterpret_cast<Element **>(ppDstBuffer));
    };

    switch (eType)
    {
        // 1-byte elements
        case GDT_Byte:
        case GDT_Int8:
            CPLAssert(GDALGetDataTypeSizeBytes(eType) == 1);
            return resampleAs(uint8_t{});

        // 2-byte elements
        case GDT_Int16:
        case GDT_UInt16:
        case GDT_Float16:
            CPLAssert(GDALGetDataTypeSizeBytes(eType) == 2);
            return resampleAs(uint16_t{});

        // 4-byte elements
        case GDT_CInt16:
        case GDT_CFloat16:
        case GDT_Int32:
        case GDT_UInt32:
        case GDT_Float32:
            CPLAssert(GDALGetDataTypeSizeBytes(eType) == 4);
            return resampleAs(uint32_t{});

        // 8-byte elements
        case GDT_CInt32:
        case GDT_CFloat32:
        case GDT_Int64:
        case GDT_UInt64:
        case GDT_Float64:
            CPLAssert(GDALGetDataTypeSizeBytes(eType) == 8);
            return resampleAs(uint64_t{});

        case GDT_CFloat64:
            return resampleAs(std::complex<double>{});

        case GDT_Unknown:
        case GDT_TypeCount:
            break;
    }
    CPLAssert(false);
    return CE_Failure;
}
223

224
namespace
225
{
226

227
// Find in the color table the entry whose RGB value is the closest
228
// (using quadratic distance) to the test color, ignoring transparent entries.
229
int BestColorEntry(const std::vector<GDALColorEntry> &entries,
3,837✔
230
                   const GDALColorEntry &test)
231
{
232
    int nMinDist = std::numeric_limits<int>::max();
3,837✔
233
    size_t bestEntry = 0;
3,837✔
234
    for (size_t i = 0; i < entries.size(); ++i)
986,109✔
235
    {
236
        const GDALColorEntry &entry = entries[i];
982,272✔
237
        // Ignore transparent entries
238
        if (entry.c4 == 0)
982,272✔
239
            continue;
3,237✔
240

241
        int nDist = ((test.c1 - entry.c1) * (test.c1 - entry.c1)) +
979,035✔
242
                    ((test.c2 - entry.c2) * (test.c2 - entry.c2)) +
979,035✔
243
                    ((test.c3 - entry.c3) * (test.c3 - entry.c3));
979,035✔
244
        if (nDist < nMinDist)
979,035✔
245
        {
246
            nMinDist = nDist;
15,847✔
247
            bestEntry = i;
15,847✔
248
        }
249
    }
250
    return static_cast<int>(bestEntry);
3,837✔
251
}
252

253
std::vector<GDALColorEntry> ReadColorTable(const GDALColorTable &table,
7✔
254
                                           int &transparentIdx)
255
{
256
    std::vector<GDALColorEntry> entries(table.GetColorEntryCount());
7✔
257

258
    transparentIdx = -1;
7✔
259
    int i = 0;
7✔
260
    for (auto &entry : entries)
1,799✔
261
    {
262
        table.GetColorEntryAsRGB(i, &entry);
1,792✔
263
        if (transparentIdx < 0 && entry.c4 == 0)
1,792✔
264
            transparentIdx = i;
1✔
265
        ++i;
1,792✔
266
    }
267
    return entries;
7✔
268
}
269

270
}  // unnamed  namespace
271

272
/************************************************************************/
273
/*                             SQUARE()                                 */
274
/************************************************************************/
275

276
// Return val * val. The left operand is first converted to Tsquare so that the
// product can be evaluated in a wider type than T when requested.
template <class T, class Tsquare = T> inline Tsquare SQUARE(T val)
{
    const Tsquare widened = static_cast<Tsquare>(val);
    return widened * val;
}
280

281
/************************************************************************/
282
/*                          ComputeIntegerRMS()                         */
283
/************************************************************************/
284
// Compute the integer rms that minimizes abs(rms**2 - sumSquares / weight),
// i.e. the root mean square of the samples rounded so that its square is as
// close as possible to the mean of the squares.
//
// T     : integer result type.
// Twork : integer type wide enough to hold 2 * rms * (rms + 1) + 1.
template <class T, class Twork>
inline T ComputeIntegerRMS(double sumSquares, double weight)
{
    const double sumDivWeight = sumSquares / weight;
    // Start from the floor of the exact root (truncating conversion).
    T rms = static_cast<T>(std::sqrt(sumDivWeight));

    // Is rms**2 or (rms+1)**2 closest to sumSquares / weight ?
    // (rms+1)**2 - s < s - rms**2   <=>   2 * rms * (rms + 1) + 1 < 2 * s
    if (static_cast<double>(static_cast<Twork>(2) * rms * (rms + 1) + 1) <
        2 * sumDivWeight)
        rms += 1;
    return rms;
}
300

301
template <class T, class Tsum> inline T ComputeIntegerRMS_4values(Tsum)
×
302
{
303
    CPLAssert(false);
×
304
    return 0;
305
}
306

307
// Integer RMS of 4 byte samples, given the sum of their squares.
template <> inline GByte ComputeIntegerRMS_4values<GByte, int>(int sumSquares)
{
    // With the rounding correction applied below, taking the root of
    // (sumSquares + 1) / 4 gives the same result as taking the root of
    // sumSquares * 0.25f, and the integer quotient is needed again for the
    // comparison.
    const int nQuarter = (sumSquares + 1) / 4;
    const GByte nFloorRoot =
        static_cast<GByte>(std::sqrt(static_cast<float>(nQuarter)));

    // Choose between r and r + 1, whichever square is closest to
    // sumSquares / 4. For integers and a weight of 4 the test
    //   4 * (r+1)**2 - sumSquares < sumSquares - 4 * r**2
    // simplifies to r * (r + 1) < (sumSquares + 1) / 4.
    const bool bRoundUp =
        static_cast<int>(nFloorRoot) * (nFloorRoot + 1) < nQuarter;
    return bRoundUp ? static_cast<GByte>(nFloorRoot + 1) : nFloorRoot;
}
324

325
template <>
326
inline GUInt16 ComputeIntegerRMS_4values<GUInt16, double>(double sumSquares)
20✔
327
{
328
    const double sumDivWeight = sumSquares * 0.25;
20✔
329
    GUInt16 rms = static_cast<GUInt16>(std::sqrt(sumDivWeight));
20✔
330

331
    // Is rms**2 or (rms+1)**2 closest to sumSquares / weight ?
332
    // Naive version:
333
    // if( weight * (rms+1)**2 - sumSquares < sumSquares - weight * rms**2 )
334
    // Optimized version for integer case and weight == 4
335
    if (static_cast<GUInt32>(rms) * (rms + 1) <
20✔
336
        static_cast<GUInt32>(sumDivWeight + 0.25))
20✔
337
        rms += 1;
4✔
338
    return rms;
20✔
339
}
340

341
#ifdef USE_SSE2
342

343
/************************************************************************/
344
/*                   QuadraticMeanByteSSE2OrAVX2()                      */
345
/************************************************************************/
346

347
#if defined(__SSE4_1__) || defined(__AVX__) || defined(USE_NEON_OPTIMIZATIONS)
348
#define sse2_packus_epi32 _mm_packus_epi32
349
#else
350
// Replacement for _mm_packus_epi32 built from SSE2 instructions only: packs
// the 32-bit lanes of a and b into 16-bit lanes.
inline __m128i sse2_packus_epi32(__m128i a, __m128i b)
{
    const __m128i bias32 = _mm_set1_epi32(-32768);
    const __m128i bias16 = _mm_set1_epi16(-32768);
    // Shift the inputs down by 32768 so that the signed pack can be used,
    // then shift the 16-bit results back up by the same amount.
    const __m128i packedBiased = _mm_packs_epi32(_mm_add_epi32(a, bias32),
                                                 _mm_add_epi32(b, bias32));
    return _mm_sub_epi16(packedBiased, bias16);
}
360
#endif
361

362
#if defined(__SSSE3__) || defined(USE_NEON_OPTIMIZATIONS)
363
#define sse2_hadd_epi16 _mm_hadd_epi16
364
#else
365
// SSE2-only stand-in used where _mm_hadd_epi16 is not available: adds each
// pair of adjacent 16-bit lanes of a, then of b, and packs the sums.
inline __m128i sse2_hadd_epi16(__m128i a, __m128i b)
{
    const __m128i low16Mask = _mm_set1_epi32(0xFFFF);

    // Within every 32-bit lane, add the low 16 bits to the high 16 bits.
    const __m128i evenA = _mm_and_si128(a, low16Mask);
    const __m128i oddA = _mm_srli_epi32(a, 16);
    const __m128i pairSumsA = _mm_add_epi32(evenA, oddA);

    const __m128i evenB = _mm_and_si128(b, low16Mask);
    const __m128i oddB = _mm_srli_epi32(b, 16);
    const __m128i pairSumsB = _mm_add_epi32(evenB, oddB);

    // Narrow the 32-bit sums back to 16 bits: a's sums first, then b's.
    return _mm_packs_epi32(pairSumsA, pairSumsB);
}
377
#endif
378

379
#ifdef __AVX2__
380

381
// AVX2 build: the width-neutral names used by the byte kernels below map to
// 256-bit intrinsics. DEST_ELTS is the number of destination pixels those
// kernels produce per loop iteration.
#define DEST_ELTS 16
382
#define set1_epi16 _mm256_set1_epi16
383
#define set1_epi32 _mm256_set1_epi32
384
#define setzero _mm256_setzero_si256
385
#define set1_ps _mm256_set1_ps
386
#define loadu_int(x) _mm256_loadu_si256(reinterpret_cast<__m256i const *>(x))
387
#define unpacklo_epi8 _mm256_unpacklo_epi8
388
#define unpackhi_epi8 _mm256_unpackhi_epi8
389
#define madd_epi16 _mm256_madd_epi16
390
#define add_epi32 _mm256_add_epi32
391
#define mul_ps _mm256_mul_ps
392
#define cvtepi32_ps _mm256_cvtepi32_ps
393
#define sqrt_ps _mm256_sqrt_ps
394
#define cvttps_epi32 _mm256_cvttps_epi32
395
#define packs_epi32 _mm256_packs_epi32
396
#define packus_epi32 _mm256_packus_epi32
397
#define srli_epi32 _mm256_srli_epi32
398
#define mullo_epi16 _mm256_mullo_epi16
399
#define srli_epi16 _mm256_srli_epi16
400
#define cmpgt_epi16 _mm256_cmpgt_epi16
401
#define add_epi16 _mm256_add_epi16
402
#define sub_epi16 _mm256_sub_epi16
403
#define packus_epi16 _mm256_packus_epi16
404
/* AVX2 operates on 2 separate 128-bit lanes, so we have to do shuffling */
405
/* to get the lower 128 bits of what would be a true 256-bit vector register
406
 */
// store_lo(x, y): gather 64-bit elements 0 and 2 of y into the low 128 bits
// and store those 16 bytes, unaligned, at x.
#define store_lo(x, y)                                                         \
408
    _mm_storeu_si128(reinterpret_cast<__m128i *>(x),                           \
409
                     _mm256_extracti128_si256(                                 \
410
                         _mm256_permute4x64_epi64((y), 0 | (2 << 2)), 0))
411
#define hadd_epi16 _mm256_hadd_epi16
412
#define zeroupper() _mm256_zeroupper()
413
#else
414
// Non-AVX2 build: the same width-neutral names map to 128-bit SSE2
// intrinsics, so the byte kernels below produce 8 destination pixels per loop
// iteration.
#define DEST_ELTS 8
415
#define set1_epi16 _mm_set1_epi16
416
#define set1_epi32 _mm_set1_epi32
417
#define setzero _mm_setzero_si128
418
#define set1_ps _mm_set1_ps
419
#define loadu_int(x) _mm_loadu_si128(reinterpret_cast<__m128i const *>(x))
420
#define unpacklo_epi8 _mm_unpacklo_epi8
421
#define unpackhi_epi8 _mm_unpackhi_epi8
422
#define madd_epi16 _mm_madd_epi16
423
#define add_epi32 _mm_add_epi32
424
#define mul_ps _mm_mul_ps
425
#define cvtepi32_ps _mm_cvtepi32_ps
426
#define sqrt_ps _mm_sqrt_ps
427
#define cvttps_epi32 _mm_cvttps_epi32
428
#define packs_epi32 _mm_packs_epi32
// sse2_packus_epi32 is either _mm_packus_epi32 itself or its SSE2 emulation,
// depending on the instruction sets enabled at compile time.
429
#define packus_epi32 sse2_packus_epi32
430
#define srli_epi32 _mm_srli_epi32
431
#define mullo_epi16 _mm_mullo_epi16
432
#define srli_epi16 _mm_srli_epi16
433
#define cmpgt_epi16 _mm_cmpgt_epi16
434
#define add_epi16 _mm_add_epi16
435
#define sub_epi16 _mm_sub_epi16
436
#define packus_epi16 _mm_packus_epi16
437
// store_lo(x, y): store the low 64 bits of y at x.
#define store_lo(x, y) _mm_storel_epi64(reinterpret_cast<__m128i *>(x), (y))
438
#define hadd_epi16 sse2_hadd_epi16
439
// No upper register halves to clear without AVX2: expands to a no-op.
#define zeroupper() (void)0
440
#endif
441

442
#if defined(__GNUC__) && defined(__AVX2__)
443
// Disabling inlining works around a bug with gcc 9.3 (Ubuntu 20.04) in
444
// -O2 -mavx2 mode in QuadraticMeanFloatSSE2(),
445
// where the register that contains minus_zero is correctly
446
// loaded the first time the function is called (looking at the disassembly,
447
// one sees it is loaded much earlier than the function), but gets corrupted
448
// (zeroed) in following iterations.
449
// It appears the bug is due to the explicit zeroupper() call at the end of
450
// the function.
451
// The bug is at least solved in gcc 10.2.
452
// Inlining doesn't bring much here to performance.
453
// This is also needed with gcc 9.3 on QuadraticMeanByteSSE2OrAVX2() in
454
// -O3 -mavx2 mode
455
#define NOINLINE __attribute__((noinline))
456
#else
457
#define NOINLINE
458
#endif
459

460
template <class T>
461
static int NOINLINE
462
QuadraticMeanByteSSE2OrAVX2(int nDstXWidth, int nChunkXSize,
5,385✔
463
                            const T *&CPL_RESTRICT pSrcScanlineShiftedInOut,
464
                            T *CPL_RESTRICT pDstScanline)
465
{
466
    // Optimized implementation for RMS on Byte by
467
    // processing by group of 8 output pixels, so as to use
468
    // a single _mm_sqrt_ps() call for 4 output pixels
469
    const T *CPL_RESTRICT pSrcScanlineShifted = pSrcScanlineShiftedInOut;
5,385✔
470

471
    int iDstPixel = 0;
5,385✔
472
    const auto one16 = set1_epi16(1);
5,385✔
473
    const auto one32 = set1_epi32(1);
5,385✔
474
    const auto zero = setzero();
5,385✔
475
    const auto minus32768 = set1_epi16(-32768);
5,385✔
476

477
    for (; iDstPixel < nDstXWidth - (DEST_ELTS - 1); iDstPixel += DEST_ELTS)
521,496✔
478
    {
479
        // Load 2 * DEST_ELTS bytes from each line
480
        auto firstLine = loadu_int(pSrcScanlineShifted);
516,111✔
481
        auto secondLine = loadu_int(pSrcScanlineShifted + nChunkXSize);
1,032,220✔
482
        // Extend those Bytes as UInt16s
483
        auto firstLineLo = unpacklo_epi8(firstLine, zero);
516,111✔
484
        auto firstLineHi = unpackhi_epi8(firstLine, zero);
516,111✔
485
        auto secondLineLo = unpacklo_epi8(secondLine, zero);
516,111✔
486
        auto secondLineHi = unpackhi_epi8(secondLine, zero);
516,111✔
487

488
        // Multiplication of 16 bit values and horizontal
489
        // addition of 32 bit results
490
        // [ src[2*i+0]^2 + src[2*i+1]^2 for i in range(4) ]
491
        firstLineLo = madd_epi16(firstLineLo, firstLineLo);
516,111✔
492
        firstLineHi = madd_epi16(firstLineHi, firstLineHi);
516,111✔
493
        secondLineLo = madd_epi16(secondLineLo, secondLineLo);
516,111✔
494
        secondLineHi = madd_epi16(secondLineHi, secondLineHi);
516,111✔
495

496
        // Vertical addition
497
        const auto sumSquaresLo = add_epi32(firstLineLo, secondLineLo);
516,111✔
498
        const auto sumSquaresHi = add_epi32(firstLineHi, secondLineHi);
516,111✔
499

500
        const auto sumSquaresPlusOneDiv4Lo =
501
            srli_epi32(add_epi32(sumSquaresLo, one32), 2);
1,032,220✔
502
        const auto sumSquaresPlusOneDiv4Hi =
503
            srli_epi32(add_epi32(sumSquaresHi, one32), 2);
1,032,220✔
504

505
        // Take square root and truncate/floor to int32
506
        const auto rmsLo =
507
            cvttps_epi32(sqrt_ps(cvtepi32_ps(sumSquaresPlusOneDiv4Lo)));
1,548,330✔
508
        const auto rmsHi =
509
            cvttps_epi32(sqrt_ps(cvtepi32_ps(sumSquaresPlusOneDiv4Hi)));
1,548,330✔
510

511
        // Merge back low and high registers with each RMS value
512
        // as a 16 bit value.
513
        auto rms = packs_epi32(rmsLo, rmsHi);
516,111✔
514

515
        // Round to upper value if it minimizes the
516
        // error |rms^2 - sumSquares/4|
517
        // if( 2 * (2 * rms * (rms + 1) + 1) < sumSquares )
518
        //    rms += 1;
519
        // which is equivalent to:
520
        // if( rms * (rms + 1) < (sumSquares+1) / 4 )
521
        //    rms += 1;
522
        // And both left and right parts fit on 16 (unsigned) bits
523
        const auto sumSquaresPlusOneDiv4 =
524
            packus_epi32(sumSquaresPlusOneDiv4Lo, sumSquaresPlusOneDiv4Hi);
516,111✔
525
        // cmpgt_epi16 operates on signed int16, but here
526
        // we have unsigned values, so shift them by -32768 before
527
        auto mask = cmpgt_epi16(
2,580,560✔
528
            add_epi16(sumSquaresPlusOneDiv4, minus32768),
529
            add_epi16(mullo_epi16(rms, add_epi16(rms, one16)), minus32768));
530
        // The value of the mask will be -1 when the correction needs to be
531
        // applied
532
        rms = sub_epi16(rms, mask);
516,111✔
533

534
        // Pack each 16 bit RMS value to 8 bits
535
        rms = packus_epi16(rms, rms /* could be anything */);
516,111✔
536
        store_lo(&pDstScanline[iDstPixel], rms);
516,111✔
537
        pSrcScanlineShifted += 2 * DEST_ELTS;
516,111✔
538
    }
539
    zeroupper();
540

541
    pSrcScanlineShiftedInOut = pSrcScanlineShifted;
5,385✔
542
    return iDstPixel;
5,385✔
543
}
544

545
/************************************************************************/
546
/*                      AverageByteSSE2OrAVX2()                         */
547
/************************************************************************/
548

549
template <class T>
550
static int
551
AverageByteSSE2OrAVX2(int nDstXWidth, int nChunkXSize,
107,181✔
552
                      const T *&CPL_RESTRICT pSrcScanlineShiftedInOut,
553
                      T *CPL_RESTRICT pDstScanline)
554
{
555
    // Optimized implementation for average on Byte by
556
    // processing by group of 8 output pixels.
557

558
    const auto zero = setzero();
107,181✔
559
    const auto two16 = set1_epi16(2);
107,181✔
560
    const T *CPL_RESTRICT pSrcScanlineShifted = pSrcScanlineShiftedInOut;
107,181✔
561

562
    int iDstPixel = 0;
107,181✔
563
    for (; iDstPixel < nDstXWidth - (DEST_ELTS - 1); iDstPixel += DEST_ELTS)
3,808,020✔
564
    {
565
        // Load 2 * DEST_ELTS bytes from each line
566
        const auto firstLine = loadu_int(pSrcScanlineShifted);
3,700,830✔
567
        const auto secondLine = loadu_int(pSrcScanlineShifted + nChunkXSize);
7,401,670✔
568
        // Extend those Bytes as UInt16s
569
        const auto firstLineLo = unpacklo_epi8(firstLine, zero);
3,700,830✔
570
        const auto firstLineHi = unpackhi_epi8(firstLine, zero);
3,700,830✔
571
        const auto secondLineLo = unpacklo_epi8(secondLine, zero);
3,700,830✔
572
        const auto secondLineHi = unpackhi_epi8(secondLine, zero);
3,700,830✔
573

574
        // Vertical addition
575
        const auto sumLo = add_epi16(firstLineLo, secondLineLo);
3,700,830✔
576
        const auto sumHi = add_epi16(firstLineHi, secondLineHi);
3,700,830✔
577

578
        // Horizontal addition of adjacent pairs, and recombine low and high
579
        // parts
580
        const auto sum = hadd_epi16(sumLo, sumHi);
3,700,830✔
581

582
        // average = (sum + 2) / 4
583
        auto average = srli_epi16(add_epi16(sum, two16), 2);
7,401,670✔
584

585
        // Pack each 16 bit average value to 8 bits
586
        average = packus_epi16(average, average /* could be anything */);
3,700,830✔
587
        store_lo(&pDstScanline[iDstPixel], average);
3,700,830✔
588
        pSrcScanlineShifted += 2 * DEST_ELTS;
3,700,830✔
589
    }
590
    zeroupper();
591

592
    pSrcScanlineShiftedInOut = pSrcScanlineShifted;
107,181✔
593
    return iDstPixel;
107,181✔
594
}
595

596
/************************************************************************/
597
/*                     QuadraticMeanUInt16SSE2()                        */
598
/************************************************************************/
599

600
#ifdef __SSE3__
601
#define sse2_hadd_pd _mm_hadd_pd
602
#else
603
// SSE2-only stand-in used where _mm_hadd_pd is not available.
// Returns (aLo + aHi, bLo + bHi).
inline __m128d sse2_hadd_pd(__m128d a, __m128d b)
{
    const __m128 aAsPs = _mm_castpd_ps(a);
    const __m128 bAsPs = _mm_castpd_ps(b);
    // (aLo, bLo)
    const __m128d lows = _mm_castps_pd(_mm_movelh_ps(aAsPs, bAsPs));
    // (aHi, bHi)
    const __m128d highs = _mm_castps_pd(_mm_movehl_ps(bAsPs, aAsPs));
    return _mm_add_pd(lows, highs);
}
611
#endif
612

613
// Lane-wise square of the two doubles in x.
inline __m128d SQUARE(__m128d x)
{
    const __m128d squared = _mm_mul_pd(x, x);
    return squared;
}
617

618
#ifdef __AVX2__
619

620
// Lane-wise square of the four doubles in x.
inline __m256d SQUARE(__m256d x)
{
    const __m256d squared = _mm256_mul_pd(x, x);
    return squared;
}
624

625
// Reorder the four 64-bit elements of x as (x0, x2, x1, x3), i.e. swap the
// two middle elements.
inline __m256d FIXUP_LANES(__m256d x)
{
    const __m256d reordered =
        _mm256_permute4x64_pd(x, _MM_SHUFFLE(3, 1, 2, 0));
    return reordered;
}
629

630
// Single-precision overload: reinterprets x as four 64-bit elements, applies
// the double-precision FIXUP_LANES, and reinterprets the result back.
inline __m256 FIXUP_LANES(__m256 x)
{
    const __m256d asDouble = _mm256_castps_pd(x);
    const __m256d reordered = FIXUP_LANES(asDouble);
    return _mm256_castpd_ps(reordered);
}
634

635
#endif
636

637
template <class T>
638
static int
639
QuadraticMeanUInt16SSE2(int nDstXWidth, int nChunkXSize,
10✔
640
                        const T *&CPL_RESTRICT pSrcScanlineShiftedInOut,
641
                        T *CPL_RESTRICT pDstScanline)
642
{
643
    // Optimized implementation for RMS on UInt16 by
644
    // processing by group of 4 output pixels.
645
    const T *CPL_RESTRICT pSrcScanlineShifted = pSrcScanlineShiftedInOut;
10✔
646

647
    int iDstPixel = 0;
10✔
648
    const auto zero = _mm_setzero_si128();
10✔
649

650
#ifdef __AVX2__
651
    const auto zeroDot25 = _mm256_set1_pd(0.25);
652
    const auto zeroDot5 = _mm256_set1_pd(0.5);
653

654
    // The first four 0's could be anything, as we only take the bottom
655
    // 128 bits.
656
    const auto permutation = _mm256_set_epi32(0, 0, 0, 0, 6, 4, 2, 0);
657
#else
658
    const auto zeroDot25 = _mm_set1_pd(0.25);
10✔
659
    const auto zeroDot5 = _mm_set1_pd(0.5);
10✔
660
#endif
661

662
    for (; iDstPixel < nDstXWidth - 3; iDstPixel += 4)
40✔
663
    {
664
        // Load 8 UInt16 from each line
665
        const auto firstLine = _mm_loadu_si128(
30✔
666
            reinterpret_cast<__m128i const *>(pSrcScanlineShifted));
667
        const auto secondLine =
668
            _mm_loadu_si128(reinterpret_cast<__m128i const *>(
30✔
669
                pSrcScanlineShifted + nChunkXSize));
30✔
670

671
        // Detect if all of the source values fit in 14 bits.
672
        // because if x < 2^14, then 4 * x^2 < 2^30 which fits in a signed int32
673
        // and we can do a much faster implementation.
674
        const auto maskTmp =
675
            _mm_srli_epi16(_mm_or_si128(firstLine, secondLine), 14);
60✔
676
#if defined(__i386__) || defined(_M_IX86)
677
        uint64_t nMaskFitsIn14Bits = 0;
678
        _mm_storel_epi64(
679
            reinterpret_cast<__m128i *>(&nMaskFitsIn14Bits),
680
            _mm_packus_epi16(maskTmp, maskTmp /* could be anything */));
681
#else
682
        const auto nMaskFitsIn14Bits = _mm_cvtsi128_si64(
30✔
683
            _mm_packus_epi16(maskTmp, maskTmp /* could be anything */));
684
#endif
685
        if (nMaskFitsIn14Bits == 0)
30✔
686
        {
687
            // Multiplication of 16 bit values and horizontal
688
            // addition of 32 bit results
689
            const auto firstLineHSumSquare =
690
                _mm_madd_epi16(firstLine, firstLine);
26✔
691
            const auto secondLineHSumSquare =
692
                _mm_madd_epi16(secondLine, secondLine);
26✔
693
            // Vertical addition
694
            const auto sumSquares =
695
                _mm_add_epi32(firstLineHSumSquare, secondLineHSumSquare);
26✔
696
            // In theory we should take sqrt(sumSquares * 0.25f)
697
            // but given the rounding we do, this is equivalent to
698
            // sqrt((sumSquares + 1)/4). This has been verified exhaustively for
699
            // sumSquares <= 4 * 16383^2
700
            const auto one32 = _mm_set1_epi32(1);
26✔
701
            const auto sumSquaresPlusOneDiv4 =
702
                _mm_srli_epi32(_mm_add_epi32(sumSquares, one32), 2);
52✔
703
            // Take square root and truncate/floor to int32
704
            auto rms = _mm_cvttps_epi32(
78✔
705
                _mm_sqrt_ps(_mm_cvtepi32_ps(sumSquaresPlusOneDiv4)));
706

707
            // Round to upper value if it minimizes the
708
            // error |rms^2 - sumSquares/4|
709
            // if( 2 * (2 * rms * (rms + 1) + 1) < sumSquares )
710
            //    rms += 1;
711
            // which is equivalent to:
712
            // if( rms * rms + rms < (sumSquares+1) / 4 )
713
            //    rms += 1;
714
            auto mask =
715
                _mm_cmpgt_epi32(sumSquaresPlusOneDiv4,
78✔
716
                                _mm_add_epi32(_mm_madd_epi16(rms, rms), rms));
717
            rms = _mm_sub_epi32(rms, mask);
26✔
718
            // Pack each 32 bit RMS value to 16 bits
719
            rms = _mm_packs_epi32(rms, rms /* could be anything */);
26✔
720
            _mm_storel_epi64(
721
                reinterpret_cast<__m128i *>(&pDstScanline[iDstPixel]), rms);
26✔
722
            pSrcScanlineShifted += 8;
26✔
723
            continue;
26✔
724
        }
725

726
        // An approach using _mm_mullo_epi16, _mm_mulhi_epu16 before extending
727
        // to 32 bit would result in 4 multiplications instead of 8, but
728
        // mullo/mulhi have a worse throughput than mul_pd.
729

730
        // Extend those UInt16s as UInt32s
731
        const auto firstLineLo = _mm_unpacklo_epi16(firstLine, zero);
4✔
732
        const auto firstLineHi = _mm_unpackhi_epi16(firstLine, zero);
4✔
733
        const auto secondLineLo = _mm_unpacklo_epi16(secondLine, zero);
4✔
734
        const auto secondLineHi = _mm_unpackhi_epi16(secondLine, zero);
4✔
735

736
#ifdef __AVX2__
737
        // Multiplication of 32 bit values previously converted to 64 bit double
738
        const auto firstLineLoDbl = SQUARE(_mm256_cvtepi32_pd(firstLineLo));
739
        const auto firstLineHiDbl = SQUARE(_mm256_cvtepi32_pd(firstLineHi));
740
        const auto secondLineLoDbl = SQUARE(_mm256_cvtepi32_pd(secondLineLo));
741
        const auto secondLineHiDbl = SQUARE(_mm256_cvtepi32_pd(secondLineHi));
742

743
        // Vertical addition of squares
744
        const auto sumSquaresLo =
745
            _mm256_add_pd(firstLineLoDbl, secondLineLoDbl);
746
        const auto sumSquaresHi =
747
            _mm256_add_pd(firstLineHiDbl, secondLineHiDbl);
748

749
        // Horizontal addition of squares
750
        const auto sumSquares =
751
            FIXUP_LANES(_mm256_hadd_pd(sumSquaresLo, sumSquaresHi));
752

753
        const auto sumDivWeight = _mm256_mul_pd(sumSquares, zeroDot25);
754

755
        // Take square root and truncate/floor to int32
756
        auto rms = _mm256_cvttpd_epi32(_mm256_sqrt_pd(sumDivWeight));
757
        const auto rmsDouble = _mm256_cvtepi32_pd(rms);
758
        const auto right = _mm256_sub_pd(
759
            sumDivWeight, _mm256_add_pd(SQUARE(rmsDouble), rmsDouble));
760

761
        auto mask =
762
            _mm256_castpd_ps(_mm256_cmp_pd(zeroDot5, right, _CMP_LT_OS));
763
        // Extract 32-bit from each of the 4 64-bit masks
764
        // mask = FIXUP_LANES(_mm256_shuffle_ps(mask, mask,
765
        // _MM_SHUFFLE(2,0,2,0)));
766
        mask = _mm256_permutevar8x32_ps(mask, permutation);
767
        const auto maskI = _mm_castps_si128(_mm256_extractf128_ps(mask, 0));
768

769
        // Apply the correction
770
        rms = _mm_sub_epi32(rms, maskI);
771

772
        // Pack each 32 bit RMS value to 16 bits
773
        rms = _mm_packus_epi32(rms, rms /* could be anything */);
774
#else
775
        // Multiplication of 32 bit values previously converted to 64 bit double
776
        const auto firstLineLoLo = SQUARE(_mm_cvtepi32_pd(firstLineLo));
4✔
777
        const auto firstLineLoHi =
778
            SQUARE(_mm_cvtepi32_pd(_mm_srli_si128(firstLineLo, 8)));
8✔
779
        const auto firstLineHiLo = SQUARE(_mm_cvtepi32_pd(firstLineHi));
4✔
780
        const auto firstLineHiHi =
781
            SQUARE(_mm_cvtepi32_pd(_mm_srli_si128(firstLineHi, 8)));
8✔
782

783
        const auto secondLineLoLo = SQUARE(_mm_cvtepi32_pd(secondLineLo));
4✔
784
        const auto secondLineLoHi =
785
            SQUARE(_mm_cvtepi32_pd(_mm_srli_si128(secondLineLo, 8)));
8✔
786
        const auto secondLineHiLo = SQUARE(_mm_cvtepi32_pd(secondLineHi));
4✔
787
        const auto secondLineHiHi =
788
            SQUARE(_mm_cvtepi32_pd(_mm_srli_si128(secondLineHi, 8)));
8✔
789

790
        // Vertical addition of squares
791
        const auto sumSquaresLoLo = _mm_add_pd(firstLineLoLo, secondLineLoLo);
4✔
792
        const auto sumSquaresLoHi = _mm_add_pd(firstLineLoHi, secondLineLoHi);
4✔
793
        const auto sumSquaresHiLo = _mm_add_pd(firstLineHiLo, secondLineHiLo);
4✔
794
        const auto sumSquaresHiHi = _mm_add_pd(firstLineHiHi, secondLineHiHi);
4✔
795

796
        // Horizontal addition of squares
797
        const auto sumSquaresLo = sse2_hadd_pd(sumSquaresLoLo, sumSquaresLoHi);
4✔
798
        const auto sumSquaresHi = sse2_hadd_pd(sumSquaresHiLo, sumSquaresHiHi);
4✔
799

800
        const auto sumDivWeightLo = _mm_mul_pd(sumSquaresLo, zeroDot25);
4✔
801
        const auto sumDivWeightHi = _mm_mul_pd(sumSquaresHi, zeroDot25);
4✔
802
        // Take square root and truncate/floor to int32
803
        const auto rmsLo = _mm_cvttpd_epi32(_mm_sqrt_pd(sumDivWeightLo));
8✔
804
        const auto rmsHi = _mm_cvttpd_epi32(_mm_sqrt_pd(sumDivWeightHi));
8✔
805

806
        // Correctly round rms to minimize | rms^2 - sumSquares / 4 |
807
        // if( 0.5 < sumDivWeight - (rms * rms + rms) )
808
        //     rms += 1;
809
        const auto rmsLoDouble = _mm_cvtepi32_pd(rmsLo);
4✔
810
        const auto rmsHiDouble = _mm_cvtepi32_pd(rmsHi);
4✔
811
        const auto rightLo = _mm_sub_pd(
8✔
812
            sumDivWeightLo, _mm_add_pd(SQUARE(rmsLoDouble), rmsLoDouble));
813
        const auto rightHi = _mm_sub_pd(
12✔
814
            sumDivWeightHi, _mm_add_pd(SQUARE(rmsHiDouble), rmsHiDouble));
815

816
        const auto maskLo = _mm_castpd_ps(_mm_cmplt_pd(zeroDot5, rightLo));
8✔
817
        const auto maskHi = _mm_castpd_ps(_mm_cmplt_pd(zeroDot5, rightHi));
4✔
818
        // The value of the mask will be -1 when the correction needs to be
819
        // applied
820
        const auto mask = _mm_castps_si128(_mm_shuffle_ps(
8✔
821
            maskLo, maskHi, (0 << 0) | (2 << 2) | (0 << 4) | (2 << 6)));
822

823
        auto rms = _mm_castps_si128(
16✔
824
            _mm_movelh_ps(_mm_castsi128_ps(rmsLo), _mm_castsi128_ps(rmsHi)));
825
        // Apply the correction
826
        rms = _mm_sub_epi32(rms, mask);
4✔
827

828
        // Pack each 32 bit RMS value to 16 bits
829
        rms = sse2_packus_epi32(rms, rms /* could be anything */);
4✔
830
#endif
831

832
        _mm_storel_epi64(reinterpret_cast<__m128i *>(&pDstScanline[iDstPixel]),
4✔
833
                         rms);
834
        pSrcScanlineShifted += 8;
4✔
835
    }
836

837
    zeroupper();
838

839
    pSrcScanlineShiftedInOut = pSrcScanlineShifted;
10✔
840
    return iDstPixel;
10✔
841
}
842

843
/************************************************************************/
844
/*                         AverageUInt16SSE2()                          */
845
/************************************************************************/
846

847
template <class T>
848
static int AverageUInt16SSE2(int nDstXWidth, int nChunkXSize,
9✔
849
                             const T *&CPL_RESTRICT pSrcScanlineShiftedInOut,
850
                             T *CPL_RESTRICT pDstScanline)
851
{
852
    // Optimized implementation for average on UInt16 by
853
    // processing by group of 8 output pixels.
854

855
    const auto mask = _mm_set1_epi32(0xFFFF);
9✔
856
    const auto two = _mm_set1_epi32(2);
9✔
857
    const T *CPL_RESTRICT pSrcScanlineShifted = pSrcScanlineShiftedInOut;
9✔
858

859
    int iDstPixel = 0;
9✔
860
    for (; iDstPixel < nDstXWidth - 7; iDstPixel += 8)
13✔
861
    {
862
        __m128i averageLow;
863
        // Load 8 UInt16 from each line
864
        {
865
            const auto firstLine = _mm_loadu_si128(
4✔
866
                reinterpret_cast<__m128i const *>(pSrcScanlineShifted));
867
            const auto secondLine =
868
                _mm_loadu_si128(reinterpret_cast<__m128i const *>(
4✔
869
                    pSrcScanlineShifted + nChunkXSize));
4✔
870

871
            // Horizontal addition and extension to 32 bit
872
            const auto horizAddFirstLine = _mm_add_epi32(
12✔
873
                _mm_and_si128(firstLine, mask), _mm_srli_epi32(firstLine, 16));
874
            const auto horizAddSecondLine =
875
                _mm_add_epi32(_mm_and_si128(secondLine, mask),
12✔
876
                              _mm_srli_epi32(secondLine, 16));
877

878
            // Vertical addition and average computation
879
            // average = (sum + 2) >> 2
880
            const auto sum = _mm_add_epi32(
8✔
881
                _mm_add_epi32(horizAddFirstLine, horizAddSecondLine), two);
882
            averageLow = _mm_srli_epi32(sum, 2);
4✔
883
        }
884
        // Load 8 UInt16 from each line
885
        __m128i averageHigh;
886
        {
887
            const auto firstLine = _mm_loadu_si128(
4✔
888
                reinterpret_cast<__m128i const *>(pSrcScanlineShifted + 8));
4✔
889
            const auto secondLine =
890
                _mm_loadu_si128(reinterpret_cast<__m128i const *>(
4✔
891
                    pSrcScanlineShifted + 8 + nChunkXSize));
4✔
892

893
            // Horizontal addition and extension to 32 bit
894
            const auto horizAddFirstLine = _mm_add_epi32(
12✔
895
                _mm_and_si128(firstLine, mask), _mm_srli_epi32(firstLine, 16));
896
            const auto horizAddSecondLine =
897
                _mm_add_epi32(_mm_and_si128(secondLine, mask),
12✔
898
                              _mm_srli_epi32(secondLine, 16));
899

900
            // Vertical addition and average computation
901
            // average = (sum + 2) >> 2
902
            const auto sum = _mm_add_epi32(
8✔
903
                _mm_add_epi32(horizAddFirstLine, horizAddSecondLine), two);
904
            averageHigh = _mm_srli_epi32(sum, 2);
4✔
905
        }
906

907
        // Pack each 32 bit average value to 16 bits
908
        auto average = sse2_packus_epi32(averageLow, averageHigh);
4✔
909
        _mm_storeu_si128(reinterpret_cast<__m128i *>(&pDstScanline[iDstPixel]),
4✔
910
                         average);
911
        pSrcScanlineShifted += 16;
4✔
912
    }
913

914
    pSrcScanlineShiftedInOut = pSrcScanlineShifted;
9✔
915
    return iDstPixel;
9✔
916
}
917

918
/************************************************************************/
919
/*                      QuadraticMeanFloatSSE2()                        */
920
/************************************************************************/
921

922
#ifdef __AVX2__

// 256-bit flavour: the generic *_ps names below map onto the _mm256_
// intrinsics, and RMS_FLOAT_ELTS is the number of floats per register.
#define RMS_FLOAT_ELTS 8

// Load / store / broadcast
#define set1_ps _mm256_set1_ps
#define loadu_ps _mm256_loadu_ps
#define storeu_ps _mm256_storeu_ps

// Bitwise operations and comparison
#define and_ps _mm256_and_ps
#define andnot_ps _mm256_andnot_ps
#define or_ps _mm256_or_ps
#define cmpeq_ps(x, y) _mm256_cmp_ps(x, y, _CMP_EQ_OQ)

// Arithmetic
#define add_ps _mm256_add_ps
#define hadd_ps _mm256_hadd_ps
#define mul_ps _mm256_mul_ps
#define div_ps _mm256_div_ps
#define sqrt_ps _mm256_sqrt_ps
#define max_ps _mm256_max_ps

// Lane rearrangement
#define shuffle_ps _mm256_shuffle_ps
#define unpacklo_ps _mm256_unpacklo_ps
#define unpackhi_ps _mm256_unpackhi_ps

// Element-wise square of 8 floats.
inline __m256 SQUARE(__m256 x)
{
    return _mm256_mul_ps(x, x);
}

#else

#ifdef __SSE3__
#define sse2_hadd_ps _mm_hadd_ps
#else
// SSE2-only replacement for _mm_hadd_ps():
// returns (a0 + a1, a2 + a3, b0 + b1, b2 + b3).
inline __m128 sse2_hadd_ps(__m128 a, __m128 b)
{
    // (a0, a2, b0, b2)
    const __m128 evenLanes = _mm_shuffle_ps(a, b, _MM_SHUFFLE(2, 0, 2, 0));
    // (a1, a3, b1, b3)
    const __m128 oddLanes = _mm_shuffle_ps(a, b, _MM_SHUFFLE(3, 1, 3, 1));
    return _mm_add_ps(evenLanes, oddLanes);
}
#endif

// 128-bit flavour: the generic *_ps names below map onto the _mm_
// intrinsics, and RMS_FLOAT_ELTS is the number of floats per register.
#define RMS_FLOAT_ELTS 4

// Load / store / broadcast
#define set1_ps _mm_set1_ps
#define loadu_ps _mm_loadu_ps
#define storeu_ps _mm_storeu_ps

// Bitwise operations and comparison
#define and_ps _mm_and_ps
#define andnot_ps _mm_andnot_ps
#define or_ps _mm_or_ps
#define cmpeq_ps _mm_cmpeq_ps

// Arithmetic
#define add_ps _mm_add_ps
#define hadd_ps sse2_hadd_ps
#define mul_ps _mm_mul_ps
#define div_ps _mm_div_ps
#define sqrt_ps _mm_sqrt_ps
#define max_ps _mm_max_ps

// Lane rearrangement
#define shuffle_ps _mm_shuffle_ps
#define unpacklo_ps _mm_unpacklo_ps
#define unpackhi_ps _mm_unpackhi_ps

// Element-wise square of 4 floats.
inline __m128 SQUARE(__m128 x)
{
    return _mm_mul_ps(x, x);
}

// With a single 128-bit lane there is nothing to reorder: identity.
inline __m128 FIXUP_LANES(__m128 x)
{
    return x;
}

#endif
988

989
template <class T>
990
static int NOINLINE
991
QuadraticMeanFloatSSE2(int nDstXWidth, int nChunkXSize,
34✔
992
                       const T *&CPL_RESTRICT pSrcScanlineShiftedInOut,
993
                       T *CPL_RESTRICT pDstScanline)
994
{
995
    // Optimized implementation for RMS on Float32 by
996
    // processing by group of RMS_FLOAT_ELTS output pixels.
997
    const T *CPL_RESTRICT pSrcScanlineShifted = pSrcScanlineShiftedInOut;
34✔
998

999
    int iDstPixel = 0;
34✔
1000
    const auto minus_zero = set1_ps(-0.0f);
34✔
1001
    const auto zeroDot25 = set1_ps(0.25f);
34✔
1002
    const auto one = set1_ps(1.0f);
34✔
1003
    const auto infv = set1_ps(std::numeric_limits<float>::infinity());
68✔
1004

1005
    for (; iDstPixel < nDstXWidth - (RMS_FLOAT_ELTS - 1);
102✔
1006
         iDstPixel += RMS_FLOAT_ELTS)
1007
    {
1008
        // Load 2*RMS_FLOAT_ELTS Float32 from each line
1009
        auto firstLineLo =
1010
            loadu_ps(reinterpret_cast<float const *>(pSrcScanlineShifted));
68✔
1011
        auto firstLineHi = loadu_ps(reinterpret_cast<float const *>(
68✔
1012
            pSrcScanlineShifted + RMS_FLOAT_ELTS));
68✔
1013
        auto secondLineLo = loadu_ps(
68✔
1014
            reinterpret_cast<float const *>(pSrcScanlineShifted + nChunkXSize));
68✔
1015
        auto secondLineHi = loadu_ps(reinterpret_cast<float const *>(
68✔
1016
            pSrcScanlineShifted + RMS_FLOAT_ELTS + nChunkXSize));
68✔
1017

1018
        // Take the absolute value
1019
        firstLineLo = andnot_ps(minus_zero, firstLineLo);
68✔
1020
        firstLineHi = andnot_ps(minus_zero, firstLineHi);
68✔
1021
        secondLineLo = andnot_ps(minus_zero, secondLineLo);
68✔
1022
        secondLineHi = andnot_ps(minus_zero, secondLineHi);
68✔
1023

1024
        auto firstLineEven =
1025
            shuffle_ps(firstLineLo, firstLineHi, _MM_SHUFFLE(2, 0, 2, 0));
68✔
1026
        auto firstLineOdd =
1027
            shuffle_ps(firstLineLo, firstLineHi, _MM_SHUFFLE(3, 1, 3, 1));
68✔
1028
        auto secondLineEven =
1029
            shuffle_ps(secondLineLo, secondLineHi, _MM_SHUFFLE(2, 0, 2, 0));
68✔
1030
        auto secondLineOdd =
1031
            shuffle_ps(secondLineLo, secondLineHi, _MM_SHUFFLE(3, 1, 3, 1));
68✔
1032

1033
        // Compute the maximum of each RMS_FLOAT_ELTS value to RMS-average
1034
        const auto maxV = max_ps(max_ps(firstLineEven, firstLineOdd),
204✔
1035
                                 max_ps(secondLineEven, secondLineEven));
1036

1037
        // Normalize each value by the maximum of the RMS_FLOAT_ELTS ones.
1038
        // This step is important to avoid that the square evaluates to infinity
1039
        // for sufficiently big input.
1040
        auto invMax = div_ps(one, maxV);
68✔
1041
        // Deal with 0 being the maximum to correct division by zero
1042
        // note: comparing to -0 leads to identical results as to comparing with
1043
        // 0
1044
        invMax = andnot_ps(cmpeq_ps(maxV, minus_zero), invMax);
136✔
1045

1046
        firstLineEven = mul_ps(firstLineEven, invMax);
68✔
1047
        firstLineOdd = mul_ps(firstLineOdd, invMax);
68✔
1048
        secondLineEven = mul_ps(secondLineEven, invMax);
68✔
1049
        secondLineOdd = mul_ps(secondLineOdd, invMax);
68✔
1050

1051
        // Compute squares
1052
        firstLineEven = SQUARE(firstLineEven);
68✔
1053
        firstLineOdd = SQUARE(firstLineOdd);
68✔
1054
        secondLineEven = SQUARE(secondLineEven);
68✔
1055
        secondLineOdd = SQUARE(secondLineOdd);
68✔
1056

1057
        const auto sumSquares = add_ps(add_ps(firstLineEven, firstLineOdd),
204✔
1058
                                       add_ps(secondLineEven, secondLineOdd));
1059

1060
        auto rms = mul_ps(maxV, sqrt_ps(mul_ps(sumSquares, zeroDot25)));
204✔
1061

1062
        // Deal with infinity being the maximum
1063
        const auto maskIsInf = cmpeq_ps(maxV, infv);
68✔
1064
        rms = or_ps(andnot_ps(maskIsInf, rms), and_ps(maskIsInf, infv));
136✔
1065

1066
        rms = FIXUP_LANES(rms);
68✔
1067

1068
        // coverity[incompatible_cast]
1069
        storeu_ps(reinterpret_cast<float *>(&pDstScanline[iDstPixel]), rms);
68✔
1070
        pSrcScanlineShifted += RMS_FLOAT_ELTS * 2;
68✔
1071
    }
1072

1073
    zeroupper();
1074

1075
    pSrcScanlineShiftedInOut = pSrcScanlineShifted;
34✔
1076
    return iDstPixel;
34✔
1077
}
1078

1079
/************************************************************************/
1080
/*                        AverageFloatSSE2()                            */
1081
/************************************************************************/
1082

1083
template <class T>
1084
static int AverageFloatSSE2(int nDstXWidth, int nChunkXSize,
14✔
1085
                            const T *&CPL_RESTRICT pSrcScanlineShiftedInOut,
1086
                            T *CPL_RESTRICT pDstScanline)
1087
{
1088
    // Optimized implementation for average on Float32 by
1089
    // processing by group of 4 output pixels.
1090
    const T *CPL_RESTRICT pSrcScanlineShifted = pSrcScanlineShiftedInOut;
14✔
1091

1092
    int iDstPixel = 0;
14✔
1093
    const auto zeroDot25 = _mm_set1_ps(0.25f);
14✔
1094

1095
    for (; iDstPixel < nDstXWidth - 3; iDstPixel += 4)
32✔
1096
    {
1097
        // Load 8 Float32 from each line
1098
        const auto firstLineLo =
1099
            _mm_loadu_ps(reinterpret_cast<float const *>(pSrcScanlineShifted));
18✔
1100
        const auto firstLineHi = _mm_loadu_ps(
18✔
1101
            reinterpret_cast<float const *>(pSrcScanlineShifted + 4));
18✔
1102
        const auto secondLineLo = _mm_loadu_ps(
18✔
1103
            reinterpret_cast<float const *>(pSrcScanlineShifted + nChunkXSize));
18✔
1104
        const auto secondLineHi = _mm_loadu_ps(reinterpret_cast<float const *>(
18✔
1105
            pSrcScanlineShifted + 4 + nChunkXSize));
18✔
1106

1107
        // Vertical addition
1108
        const auto sumLo = _mm_add_ps(firstLineLo, secondLineLo);
18✔
1109
        const auto sumHi = _mm_add_ps(firstLineHi, secondLineHi);
18✔
1110

1111
        // Horizontal addition
1112
        const auto A =
1113
            _mm_shuffle_ps(sumLo, sumHi, 0 | (2 << 2) | (0 << 4) | (2 << 6));
18✔
1114
        const auto B =
1115
            _mm_shuffle_ps(sumLo, sumHi, 1 | (3 << 2) | (1 << 4) | (3 << 6));
18✔
1116
        const auto sum = _mm_add_ps(A, B);
18✔
1117

1118
        const auto average = _mm_mul_ps(sum, zeroDot25);
18✔
1119

1120
        // coverity[incompatible_cast]
1121
        _mm_storeu_ps(reinterpret_cast<float *>(&pDstScanline[iDstPixel]),
18✔
1122
                      average);
1123
        pSrcScanlineShifted += 8;
18✔
1124
    }
1125

1126
    pSrcScanlineShiftedInOut = pSrcScanlineShifted;
14✔
1127
    return iDstPixel;
14✔
1128
}
1129

1130
#endif
1131

1132
/************************************************************************/
1133
/*                    GDALResampleChunk_AverageOrRMS()                  */
1134
/************************************************************************/
1135

1136
template <class T, class Tsum, GDALDataType eWrkDataType>
1137
static CPLErr
1138
GDALResampleChunk_AverageOrRMS_T(const GDALOverviewResampleArgs &args,
10,384✔
1139
                                 const T *pChunk, void **ppDstBuffer)
1140
{
1141
    const double dfXRatioDstToSrc = args.dfXRatioDstToSrc;
10,384✔
1142
    const double dfYRatioDstToSrc = args.dfYRatioDstToSrc;
10,384✔
1143
    const double dfSrcXDelta = args.dfSrcXDelta;
10,384✔
1144
    const double dfSrcYDelta = args.dfSrcYDelta;
10,384✔
1145
    const GByte *pabyChunkNodataMask = args.pabyChunkNodataMask;
10,384✔
1146
    const int nChunkXOff = args.nChunkXOff;
10,384✔
1147
    const int nChunkYOff = args.nChunkYOff;
10,384✔
1148
    const int nChunkXSize = args.nChunkXSize;
10,384✔
1149
    const int nChunkYSize = args.nChunkYSize;
10,384✔
1150
    const int nDstXOff = args.nDstXOff;
10,384✔
1151
    const int nDstXOff2 = args.nDstXOff2;
10,384✔
1152
    const int nDstYOff = args.nDstYOff;
10,384✔
1153
    const int nDstYOff2 = args.nDstYOff2;
10,384✔
1154
    const char *pszResampling = args.pszResampling;
10,384✔
1155
    bool bHasNoData = args.bHasNoData;
10,384✔
1156
    const double dfNoDataValue = args.dfNoDataValue;
10,384✔
1157
    const GDALColorTable *poColorTable = args.poColorTable;
10,384✔
1158
    const bool bPropagateNoData = args.bPropagateNoData;
10,384✔
1159

1160
    // AVERAGE_BIT2GRAYSCALE
1161
    const bool bBit2Grayscale =
1162
        CPL_TO_BOOL(STARTS_WITH_CI(pszResampling, "AVERAGE_BIT2G"));
10,384✔
1163
    const bool bQuadraticMean = CPL_TO_BOOL(EQUAL(pszResampling, "RMS"));
10,383✔
1164
    if (bBit2Grayscale)
10,384✔
1165
        poColorTable = nullptr;
9✔
1166

1167
    T tNoDataValue;
1168
    if (!bHasNoData)
10,384✔
1169
        tNoDataValue = 0;
10,333✔
1170
    else
1171
        tNoDataValue = static_cast<T>(dfNoDataValue);
51✔
1172
    const T tReplacementVal =
10,384✔
1173
        bHasNoData ? static_cast<T>(GDALGetNoDataReplacementValue(
107✔
1174
                         args.eOvrDataType, dfNoDataValue))
51✔
1175
                   : 0;
1176

1177
    int nChunkRightXOff = nChunkXOff + nChunkXSize;
10,384✔
1178
    int nChunkBottomYOff = nChunkYOff + nChunkYSize;
10,384✔
1179
    int nDstXWidth = nDstXOff2 - nDstXOff;
10,384✔
1180

1181
    /* -------------------------------------------------------------------- */
1182
    /*      Allocate buffers.                                               */
1183
    /* -------------------------------------------------------------------- */
1184
    *ppDstBuffer = static_cast<T *>(
10,384✔
1185
        VSI_MALLOC3_VERBOSE(nDstXWidth, nDstYOff2 - nDstYOff,
10,384✔
1186
                            GDALGetDataTypeSizeBytes(eWrkDataType)));
1187
    if (*ppDstBuffer == nullptr)
10,384✔
1188
    {
1189
        return CE_Failure;
×
1190
    }
1191
    T *const pDstBuffer = static_cast<T *>(*ppDstBuffer);
10,384✔
1192

1193
    struct PrecomputedXValue
1194
    {
1195
        int nLeftXOffShifted;
1196
        int nRightXOffShifted;
1197
        double dfLeftWeight;
1198
        double dfRightWeight;
1199
        double dfTotalWeightFullLine;
1200
    };
1201

1202
    PrecomputedXValue *pasSrcX = static_cast<PrecomputedXValue *>(
1203
        VSI_MALLOC_VERBOSE(nDstXWidth * sizeof(PrecomputedXValue)));
10,384✔
1204

1205
    if (pasSrcX == nullptr)
10,382✔
1206
    {
1207
        VSIFree(pasSrcX);
×
1208
        return CE_Failure;
×
1209
    }
1210

1211
    int nTransparentIdx = -1;
10,382✔
1212
    std::vector<GDALColorEntry> colorEntries;
10,382✔
1213
    if (poColorTable)
10,381✔
1214
        colorEntries = ReadColorTable(*poColorTable, nTransparentIdx);
5✔
1215

1216
    // Force c4 of nodata entry to 0 so that GDALFindBestEntry() identifies
1217
    // it as nodata value
1218
    if (bHasNoData && dfNoDataValue >= 0.0f &&
10,411✔
1219
        tNoDataValue < colorEntries.size())
27✔
1220
        colorEntries[static_cast<int>(tNoDataValue)].c4 = 0;
1✔
1221

1222
    // Or if we have no explicit nodata, but a color table entry that is
1223
    // transparent, consider it as the nodata value
1224
    else if (!bHasNoData && nTransparentIdx >= 0)
10,383✔
1225
    {
1226
        bHasNoData = true;
×
1227
        tNoDataValue = static_cast<T>(nTransparentIdx);
×
1228
    }
1229

1230
    /* ==================================================================== */
1231
    /*      Precompute inner loop constants.                                */
1232
    /* ==================================================================== */
1233
    bool bSrcXSpacingIsTwo = true;
10,384✔
1234
    int nLastSrcXOff2 = -1;
10,384✔
1235
    for (int iDstPixel = nDstXOff; iDstPixel < nDstXOff2; ++iDstPixel)
837,082✔
1236
    {
1237
        double dfSrcXOff = dfSrcXDelta + iDstPixel * dfXRatioDstToSrc;
826,698✔
1238
        // Apply some epsilon to avoid numerical precision issues
1239
        int nSrcXOff = static_cast<int>(dfSrcXOff + 1e-8);
826,698✔
1240
        double dfSrcXOff2 = dfSrcXDelta + (iDstPixel + 1) * dfXRatioDstToSrc;
826,698✔
1241
        int nSrcXOff2 = static_cast<int>(ceil(dfSrcXOff2 - 1e-8));
826,698✔
1242

1243
        if (nSrcXOff < nChunkXOff)
826,698✔
1244
            nSrcXOff = nChunkXOff;
×
1245
        if (nSrcXOff2 == nSrcXOff)
826,698✔
1246
            nSrcXOff2++;
×
1247
        if (nSrcXOff2 > nChunkRightXOff)
826,698✔
1248
            nSrcXOff2 = nChunkRightXOff;
1✔
1249

1250
        pasSrcX[iDstPixel - nDstXOff].nLeftXOffShifted = nSrcXOff - nChunkXOff;
826,698✔
1251
        pasSrcX[iDstPixel - nDstXOff].nRightXOffShifted =
826,698✔
1252
            nSrcXOff2 - nChunkXOff;
826,698✔
1253
        pasSrcX[iDstPixel - nDstXOff].dfLeftWeight =
21✔
1254
            (nSrcXOff2 == nSrcXOff + 1) ? 1.0 : 1 - (dfSrcXOff - nSrcXOff);
826,698✔
1255
        pasSrcX[iDstPixel - nDstXOff].dfRightWeight =
826,698✔
1256
            1 - (nSrcXOff2 - dfSrcXOff2);
826,698✔
1257
        pasSrcX[iDstPixel - nDstXOff].dfTotalWeightFullLine =
826,698✔
1258
            pasSrcX[iDstPixel - nDstXOff].dfLeftWeight;
826,698✔
1259
        if (nSrcXOff + 1 < nSrcXOff2)
826,698✔
1260
        {
1261
            pasSrcX[iDstPixel - nDstXOff].dfTotalWeightFullLine +=
826,677✔
1262
                nSrcXOff2 - nSrcXOff - 2;
826,677✔
1263
            pasSrcX[iDstPixel - nDstXOff].dfTotalWeightFullLine +=
826,677✔
1264
                pasSrcX[iDstPixel - nDstXOff].dfRightWeight;
826,677✔
1265
        }
1266

1267
        if (nSrcXOff2 - nSrcXOff != 2 ||
826,698✔
1268
            (nLastSrcXOff2 >= 0 && nLastSrcXOff2 != nSrcXOff))
697,223✔
1269
        {
1270
            bSrcXSpacingIsTwo = false;
120,592✔
1271
        }
1272
        nLastSrcXOff2 = nSrcXOff2;
826,698✔
1273
    }
1274

1275
    /* ==================================================================== */
1276
    /*      Loop over destination scanlines.                                */
1277
    /* ==================================================================== */
1278
    for (int iDstLine = nDstYOff; iDstLine < nDstYOff2; ++iDstLine)
749,004✔
1279
    {
1280
        double dfSrcYOff = dfSrcYDelta + iDstLine * dfYRatioDstToSrc;
738,621✔
1281
        int nSrcYOff = static_cast<int>(dfSrcYOff + 1e-8);
738,621✔
1282
        if (nSrcYOff < nChunkYOff)
738,621✔
1283
            nSrcYOff = nChunkYOff;
×
1284

1285
        double dfSrcYOff2 = dfSrcYDelta + (iDstLine + 1) * dfYRatioDstToSrc;
738,621✔
1286
        int nSrcYOff2 = static_cast<int>(ceil(dfSrcYOff2 - 1e-8));
738,621✔
1287
        if (nSrcYOff2 == nSrcYOff)
738,621✔
1288
            ++nSrcYOff2;
×
1289
        if (nSrcYOff2 > nChunkBottomYOff)
738,621✔
1290
            nSrcYOff2 = nChunkBottomYOff;
3✔
1291

1292
        T *const pDstScanline = pDstBuffer + (iDstLine - nDstYOff) * nDstXWidth;
738,621✔
1293

1294
        /* --------------------------------------------------------------------
1295
         */
1296
        /*      Loop over destination pixels */
1297
        /* --------------------------------------------------------------------
1298
         */
1299
        if (poColorTable == nullptr)
738,621✔
1300
        {
1301
            if (bSrcXSpacingIsTwo && nSrcYOff2 == nSrcYOff + 2 &&
738,509✔
1302
                pabyChunkNodataMask == nullptr)
1303
            {
1304
                if (eWrkDataType == GDT_Byte || eWrkDataType == GDT_UInt16)
1305
                {
1306
                    // Optimized case : no nodata, overview by a factor of 2 and
1307
                    // regular x and y src spacing.
1308
                    const T *pSrcScanlineShifted =
112,585✔
1309
                        pChunk + pasSrcX[0].nLeftXOffShifted +
112,585✔
1310
                        static_cast<GPtrDiff_t>(nSrcYOff - nChunkYOff) *
112,585✔
1311
                            nChunkXSize;
112,585✔
1312
                    int iDstPixel = 0;
112,585✔
1313
#ifdef USE_SSE2
1314
                    if (bQuadraticMean && eWrkDataType == GDT_Byte)
112,566✔
1315
                    {
1316
                        iDstPixel = QuadraticMeanByteSSE2OrAVX2(
5,385✔
1317
                            nDstXWidth, nChunkXSize, pSrcScanlineShifted,
1318
                            pDstScanline);
1319
                    }
1320
                    else if (bQuadraticMean /* && eWrkDataType == GDT_UInt16 */)
107,200✔
1321
                    {
1322
                        iDstPixel = QuadraticMeanUInt16SSE2(
10✔
1323
                            nDstXWidth, nChunkXSize, pSrcScanlineShifted,
1324
                            pDstScanline);
1325
                    }
1326
                    else if (/* !bQuadraticMean && */ eWrkDataType == GDT_Byte)
1327
                    {
1328
                        iDstPixel = AverageByteSSE2OrAVX2(
107,181✔
1329
                            nDstXWidth, nChunkXSize, pSrcScanlineShifted,
1330
                            pDstScanline);
1331
                    }
1332
                    else /* if( !bQuadraticMean && eWrkDataType == GDT_UInt16 )
1333
                          */
1334
                    {
1335
                        iDstPixel = AverageUInt16SSE2(nDstXWidth, nChunkXSize,
9✔
1336
                                                      pSrcScanlineShifted,
1337
                                                      pDstScanline);
1338
                    }
1339
#endif
1340
                    for (; iDstPixel < nDstXWidth; ++iDstPixel)
274,941✔
1341
                    {
1342
                        Tsum nTotal = 0;
162,356✔
1343
                        T nVal;
1344
                        if (bQuadraticMean)
162,356✔
1345
                            nTotal =
44✔
1346
                                SQUARE<Tsum>(pSrcScanlineShifted[0]) +
44✔
1347
                                SQUARE<Tsum>(pSrcScanlineShifted[1]) +
44✔
1348
                                SQUARE<Tsum>(pSrcScanlineShifted[nChunkXSize]) +
44✔
1349
                                SQUARE<Tsum>(
44✔
1350
                                    pSrcScanlineShifted[1 + nChunkXSize]);
44✔
1351
                        else
1352
                            nTotal = pSrcScanlineShifted[0] +
162,312✔
1353
                                     pSrcScanlineShifted[1] +
162,312✔
1354
                                     pSrcScanlineShifted[nChunkXSize] +
162,312✔
1355
                                     pSrcScanlineShifted[1 + nChunkXSize];
162,312✔
1356

1357
                        constexpr int nTotalWeight = 4;
162,356✔
1358
                        if (bQuadraticMean)
162,356✔
1359
                            nVal = ComputeIntegerRMS_4values<T>(nTotal);
44✔
1360
                        else
1361
                            nVal = static_cast<T>((nTotal + nTotalWeight / 2) /
162,312✔
1362
                                                  nTotalWeight);
1363

1364
                        // No need to compare nVal against tNoDataValue as we
1365
                        // are in a case where pabyChunkNodataMask == nullptr
1366
                        // implies the absence of nodata value.
1367
                        pDstScanline[iDstPixel] = nVal;
162,356✔
1368
                        pSrcScanlineShifted += 2;
162,356✔
1369
                    }
1370
                }
1371
                else
1372
                {
1373
                    CPLAssert(eWrkDataType == GDT_Float32 ||
1374
                              eWrkDataType == GDT_Float64);
1375
                    const T *pSrcScanlineShifted =
70✔
1376
                        pChunk + pasSrcX[0].nLeftXOffShifted +
70✔
1377
                        static_cast<GPtrDiff_t>(nSrcYOff - nChunkYOff) *
70✔
1378
                            nChunkXSize;
70✔
1379
                    int iDstPixel = 0;
70✔
1380
#ifdef USE_SSE2
1381
                    if (eWrkDataType == GDT_Float32)
1382
                    {
1383
                        if (bQuadraticMean)
48✔
1384
                        {
1385
                            iDstPixel = QuadraticMeanFloatSSE2(
34✔
1386
                                nDstXWidth, nChunkXSize, pSrcScanlineShifted,
1387
                                pDstScanline);
1388
                        }
1389
                        else
1390
                        {
1391
                            iDstPixel = AverageFloatSSE2(
14✔
1392
                                nDstXWidth, nChunkXSize, pSrcScanlineShifted,
1393
                                pDstScanline);
1394
                        }
1395
                    }
1396
#endif
1397

1398
                    for (; iDstPixel < nDstXWidth; ++iDstPixel)
268✔
1399
                    {
1400
                        T nVal;
1401
                        if (bQuadraticMean)
198✔
1402
                        {
1403
                            // Cast to double to avoid overflows
1404
                            // (using std::hypot() is much slower)
1405
                            nVal = static_cast<T>(std::sqrt(
100✔
1406
                                0.25 *
1407
                                (SQUARE<double>(pSrcScanlineShifted[0]) +
100✔
1408
                                 SQUARE<double>(pSrcScanlineShifted[1]) +
100✔
1409
                                 SQUARE<double>(
100✔
1410
                                     pSrcScanlineShifted[nChunkXSize]) +
200✔
1411
                                 SQUARE<double>(
100✔
1412
                                     pSrcScanlineShifted[1 + nChunkXSize]))));
100✔
1413
                        }
1414
                        else
1415
                        {
1416
                            nVal = static_cast<T>(
98✔
1417
                                0.25f * (pSrcScanlineShifted[0] +
98✔
1418
                                         pSrcScanlineShifted[1] +
98✔
1419
                                         pSrcScanlineShifted[nChunkXSize] +
98✔
1420
                                         pSrcScanlineShifted[1 + nChunkXSize]));
98✔
1421
                        }
1422

1423
                        // No need to compare nVal against tNoDataValue as we
1424
                        // are in a case where pabyChunkNodataMask == nullptr
1425
                        // implies the absence of nodata value.
1426
                        pDstScanline[iDstPixel] = nVal;
198✔
1427
                        pSrcScanlineShifted += 2;
198✔
1428
                    }
1429
                }
112,655✔
1430
            }
1431
            else
1432
            {
1433
                const double dfBottomWeight =
24✔
1434
                    (nSrcYOff + 1 == nSrcYOff2) ? 1.0
625,854✔
1435
                                                : 1.0 - (dfSrcYOff - nSrcYOff);
625,830✔
1436
                const double dfTopWeight = 1.0 - (nSrcYOff2 - dfSrcYOff2);
625,854✔
1437
                nSrcYOff -= nChunkYOff;
625,854✔
1438
                nSrcYOff2 -= nChunkYOff;
625,854✔
1439

1440
                double dfTotalWeightFullColumn = dfBottomWeight;
625,854✔
1441
                if (nSrcYOff + 1 < nSrcYOff2)
625,854✔
1442
                {
1443
                    dfTotalWeightFullColumn += nSrcYOff2 - nSrcYOff - 2;
625,832✔
1444
                    dfTotalWeightFullColumn += dfTopWeight;
625,832✔
1445
                }
1446

1447
                for (int iDstPixel = 0; iDstPixel < nDstXWidth; ++iDstPixel)
18,585,056✔
1448
                {
1449
                    const int nSrcXOff = pasSrcX[iDstPixel].nLeftXOffShifted;
17,946,481✔
1450
                    const int nSrcXOff2 = pasSrcX[iDstPixel].nRightXOffShifted;
17,946,481✔
1451

1452
                    double dfTotal = 0;
17,946,481✔
1453
                    double dfTotalWeight = 0;
17,946,481✔
1454
                    if (pabyChunkNodataMask == nullptr)
17,946,481✔
1455
                    {
1456
                        auto pChunkShifted =
1,746,435✔
1457
                            pChunk +
115✔
1458
                            static_cast<GPtrDiff_t>(nSrcYOff) * nChunkXSize;
1,746,435✔
1459
                        int nCounterY = nSrcYOff2 - nSrcYOff - 1;
1,746,435✔
1460
                        double dfWeightY = dfBottomWeight;
1,746,435✔
1461
                        while (true)
3,493,427✔
1462
                        {
1463
                            double dfTotalLine;
1464
                            if (bQuadraticMean)
5,239,852✔
1465
                            {
1466
                                // Left pixel
1467
                                {
1468
                                    const T val = pChunkShifted[nSrcXOff];
104✔
1469
                                    dfTotalLine =
104✔
1470
                                        SQUARE<double>(val) *
104✔
1471
                                        pasSrcX[iDstPixel].dfLeftWeight;
104✔
1472
                                }
1473

1474
                                if (nSrcXOff + 1 < nSrcXOff2)
104✔
1475
                                {
1476
                                    // Middle pixels
1477
                                    for (int iX = nSrcXOff + 1;
104✔
1478
                                         iX + 1 < nSrcXOff2; ++iX)
424✔
1479
                                    {
1480
                                        const T val = pChunkShifted[iX];
320✔
1481
                                        dfTotalLine += SQUARE<double>(val);
320✔
1482
                                    }
1483

1484
                                    // Right pixel
1485
                                    {
1486
                                        const T val =
104✔
1487
                                            pChunkShifted[nSrcXOff2 - 1];
104✔
1488
                                        dfTotalLine +=
104✔
1489
                                            SQUARE<double>(val) *
104✔
1490
                                            pasSrcX[iDstPixel].dfRightWeight;
104✔
1491
                                    }
1492
                                }
1493
                            }
1494
                            else
1495
                            {
1496
                                // Left pixel
1497
                                {
1498
                                    const T val = pChunkShifted[nSrcXOff];
5,239,756✔
1499
                                    dfTotalLine =
5,239,756✔
1500
                                        val * pasSrcX[iDstPixel].dfLeftWeight;
5,239,756✔
1501
                                }
1502

1503
                                if (nSrcXOff + 1 < nSrcXOff2)
5,239,756✔
1504
                                {
1505
                                    // Middle pixels
1506
                                    for (int iX = nSrcXOff + 1;
4,239,330✔
1507
                                         iX + 1 < nSrcXOff2; ++iX)
64,183,126✔
1508
                                    {
1509
                                        const T val = pChunkShifted[iX];
59,943,836✔
1510
                                        dfTotalLine += val;
59,943,836✔
1511
                                    }
1512

1513
                                    // Right pixel
1514
                                    {
1515
                                        const T val =
4,239,330✔
1516
                                            pChunkShifted[nSrcXOff2 - 1];
4,239,330✔
1517
                                        dfTotalLine +=
4,239,330✔
1518
                                            val *
4,239,330✔
1519
                                            pasSrcX[iDstPixel].dfRightWeight;
4,239,330✔
1520
                                    }
1521
                                }
1522
                            }
1523

1524
                            dfTotal += dfTotalLine * dfWeightY;
5,239,852✔
1525
                            --nCounterY;
5,239,852✔
1526
                            if (nCounterY < 0)
5,239,852✔
1527
                                break;
1,746,435✔
1528
                            pChunkShifted += nChunkXSize;
3,493,427✔
1529
                            dfWeightY = (nCounterY == 0) ? dfTopWeight : 1.0;
3,493,427✔
1530
                        }
1531

1532
                        dfTotalWeight =
1,746,435✔
1533
                            pasSrcX[iDstPixel].dfTotalWeightFullLine *
1,746,435✔
1534
                            dfTotalWeightFullColumn;
1535
                    }
1536
                    else
1537
                    {
1538
                        GPtrDiff_t nCount = 0;
16,200,066✔
1539
                        for (int iY = nSrcYOff; iY < nSrcYOff2; ++iY)
71,183,998✔
1540
                        {
1541
                            const auto pChunkShifted =
54,971,732✔
1542
                                pChunk +
132✔
1543
                                static_cast<GPtrDiff_t>(iY) * nChunkXSize;
54,971,732✔
1544

1545
                            double dfTotalLine = 0;
54,971,732✔
1546
                            double dfTotalWeightLine = 0;
54,971,732✔
1547
                            // Left pixel
1548
                            {
1549
                                const int iX = nSrcXOff;
54,971,732✔
1550
                                const T val = pChunkShifted[iX];
54,971,732✔
1551
                                if (pabyChunkNodataMask[iX + iY * nChunkXSize])
54,971,732✔
1552
                                {
1553
                                    nCount++;
23,418,181✔
1554
                                    const double dfWeightX =
23,418,181✔
1555
                                        pasSrcX[iDstPixel].dfLeftWeight;
23,418,181✔
1556
                                    dfTotalWeightLine = dfWeightX;
23,418,181✔
1557
                                    if (bQuadraticMean)
23,418,181✔
1558
                                        dfTotalLine =
60✔
1559
                                            SQUARE<double>(val) * dfWeightX;
60✔
1560
                                    else
1561
                                        dfTotalLine = val * dfWeightX;
23,418,081✔
1562
                                }
1563
                            }
1564

1565
                            if (nSrcXOff + 1 < nSrcXOff2)
54,971,732✔
1566
                            {
1567
                                // Middle pixels
1568
                                for (int iX = nSrcXOff + 1; iX + 1 < nSrcXOff2;
145,167,132✔
1569
                                     ++iX)
1570
                                {
1571
                                    const T val = pChunkShifted[iX];
90,194,300✔
1572
                                    if (pabyChunkNodataMask[iX +
90,194,300✔
1573
                                                            iY * nChunkXSize])
90,194,300✔
1574
                                    {
1575
                                        nCount++;
39,727,900✔
1576
                                        dfTotalWeightLine += 1;
39,727,900✔
1577
                                        if (bQuadraticMean)
39,727,900✔
1578
                                            dfTotalLine += SQUARE<double>(val);
×
1579
                                        else
1580
                                            dfTotalLine += val;
39,727,900✔
1581
                                    }
1582
                                }
1583

1584
                                // Right pixel
1585
                                {
1586
                                    const int iX = nSrcXOff2 - 1;
54,972,332✔
1587
                                    const T val = pChunkShifted[iX];
54,972,332✔
1588
                                    if (pabyChunkNodataMask[iX +
54,972,332✔
1589
                                                            iY * nChunkXSize])
54,972,332✔
1590
                                    {
1591
                                        nCount++;
23,417,247✔
1592
                                        const double dfWeightX =
23,417,247✔
1593
                                            pasSrcX[iDstPixel].dfRightWeight;
23,417,247✔
1594
                                        dfTotalWeightLine += dfWeightX;
23,417,247✔
1595
                                        if (bQuadraticMean)
23,417,247✔
1596
                                            dfTotalLine +=
10,910✔
1597
                                                SQUARE<double>(val) * dfWeightX;
61✔
1598
                                        else
1599
                                            dfTotalLine += val * dfWeightX;
23,417,246✔
1600
                                    }
1601
                                }
1602
                            }
1603

1604
                            const double dfWeightY =
93,745,398✔
1605
                                (iY == nSrcYOff)        ? dfBottomWeight
1606
                                : (iY + 1 == nSrcYOff2) ? dfTopWeight
38,761,566✔
1607
                                                        : 1.0;
1608
                            dfTotal += dfTotalLine * dfWeightY;
54,983,932✔
1609
                            dfTotalWeight += dfTotalWeightLine * dfWeightY;
54,983,932✔
1610
                        }
1611

1612
                        if (nCount == 0 ||
16,212,166✔
1613
                            (bPropagateNoData &&
8✔
1614
                             nCount <
1615
                                 static_cast<GPtrDiff_t>(nSrcYOff2 - nSrcYOff) *
8✔
1616
                                     (nSrcXOff2 - nSrcXOff)))
8✔
1617
                        {
1618
                            pDstScanline[iDstPixel] = tNoDataValue;
9,461,262✔
1619
                            continue;
9,461,262✔
1620
                        }
1621
                    }
1622
                    if (eWrkDataType == GDT_Byte)
1623
                    {
1624
                        T nVal;
1625
                        if (bQuadraticMean)
8,497,210✔
1626
                            nVal = ComputeIntegerRMS<T, int>(dfTotal,
38✔
1627
                                                             dfTotalWeight);
1628
                        else
1629
                            nVal =
8,497,170✔
1630
                                static_cast<T>(dfTotal / dfTotalWeight + 0.5);
8,497,170✔
1631
                        if (bHasNoData && nVal == tNoDataValue)
8,497,760✔
1632
                            nVal = tReplacementVal;
×
1633
                        pDstScanline[iDstPixel] = nVal;
8,497,760✔
1634
                    }
1635
                    else if (eWrkDataType == GDT_UInt16)
1636
                    {
1637
                        T nVal;
1638
                        if (bQuadraticMean)
8✔
1639
                            nVal = ComputeIntegerRMS<T, uint64_t>(
4✔
1640
                                dfTotal, dfTotalWeight);
1641
                        else
1642
                            nVal =
4✔
1643
                                static_cast<T>(dfTotal / dfTotalWeight + 0.5);
4✔
1644
                        if (bHasNoData && nVal == tNoDataValue)
8✔
1645
                            nVal = tReplacementVal;
×
1646
                        pDstScanline[iDstPixel] = nVal;
8✔
1647
                    }
1648
                    else
1649
                    {
1650
                        T nVal;
1651
                        if (bQuadraticMean)
151✔
1652
                            nVal =
20✔
1653
                                static_cast<T>(sqrt(dfTotal / dfTotalWeight));
25✔
1654
                        else
1655
                            nVal = static_cast<T>(dfTotal / dfTotalWeight);
126✔
1656
                        if (bHasNoData && nVal == tNoDataValue)
151✔
1657
                            nVal = tReplacementVal;
2✔
1658
                        pDstScanline[iDstPixel] = nVal;
151✔
1659
                    }
1660
                }
1661
            }
1662
        }
1663
        else
1664
        {
1665
            nSrcYOff -= nChunkYOff;
112✔
1666
            nSrcYOff2 -= nChunkYOff;
112✔
1667

UNCOV
1668
            for (int iDstPixel = 0; iDstPixel < nDstXWidth; ++iDstPixel)
×
1669
            {
1670
                const int nSrcXOff = pasSrcX[iDstPixel].nLeftXOffShifted;
6,475✔
1671
                const int nSrcXOff2 = pasSrcX[iDstPixel].nRightXOffShifted;
6,475✔
1672

1673
                GPtrDiff_t nTotalR = 0;
6,475✔
1674
                GPtrDiff_t nTotalG = 0;
6,475✔
1675
                GPtrDiff_t nTotalB = 0;
6,475✔
1676
                GPtrDiff_t nCount = 0;
6,475✔
1677

1678
                for (int iY = nSrcYOff; iY < nSrcYOff2; ++iY)
19,425✔
1679
                {
1680
                    for (int iX = nSrcXOff; iX < nSrcXOff2; ++iX)
38,850✔
1681
                    {
1682
                        const T val = pChunk[iX + static_cast<GPtrDiff_t>(iY) *
25,900✔
1683
                                                      nChunkXSize];
25,900✔
1684
                        // cppcheck-suppress unsignedLessThanZero
1685
                        if (val < 0 || val >= colorEntries.size())
25,900✔
1686
                            continue;
×
1687
                        size_t idx = static_cast<size_t>(val);
25,900✔
1688
                        const auto &entry = colorEntries[idx];
25,900✔
1689
                        if (entry.c4)
25,900✔
1690
                        {
1691
                            if (bQuadraticMean)
14,128✔
1692
                            {
1693
                                nTotalR += SQUARE<int>(entry.c1);
800✔
1694
                                nTotalG += SQUARE<int>(entry.c2);
800✔
1695
                                nTotalB += SQUARE<int>(entry.c3);
800✔
1696
                                ++nCount;
800✔
1697
                            }
1698
                            else
1699
                            {
1700
                                nTotalR += entry.c1;
13,328✔
1701
                                nTotalG += entry.c2;
13,328✔
1702
                                nTotalB += entry.c3;
13,328✔
1703
                                ++nCount;
13,328✔
1704
                            }
1705
                        }
1706
                    }
1707
                }
1708

1709
                if (nCount == 0 ||
6,475✔
1710
                    (bPropagateNoData &&
×
1711
                     nCount < static_cast<GPtrDiff_t>(nSrcYOff2 - nSrcYOff) *
×
1712
                                  (nSrcXOff2 - nSrcXOff)))
×
1713
                {
1714
                    pDstScanline[iDstPixel] = tNoDataValue;
2,838✔
1715
                }
1716
                else
1717
                {
1718
                    GDALColorEntry color;
1719
                    if (bQuadraticMean)
3,637✔
1720
                    {
1721
                        color.c1 =
200✔
1722
                            static_cast<short>(sqrt(nTotalR / nCount) + 0.5);
200✔
1723
                        color.c2 =
200✔
1724
                            static_cast<short>(sqrt(nTotalG / nCount) + 0.5);
200✔
1725
                        color.c3 =
200✔
1726
                            static_cast<short>(sqrt(nTotalB / nCount) + 0.5);
200✔
1727
                    }
1728
                    else
1729
                    {
1730
                        color.c1 =
3,437✔
1731
                            static_cast<short>((nTotalR + nCount / 2) / nCount);
3,437✔
1732
                        color.c2 =
3,437✔
1733
                            static_cast<short>((nTotalG + nCount / 2) / nCount);
3,437✔
1734
                        color.c3 =
3,437✔
1735
                            static_cast<short>((nTotalB + nCount / 2) / nCount);
3,437✔
1736
                    }
UNCOV
1737
                    pDstScanline[iDstPixel] =
×
1738
                        static_cast<T>(BestColorEntry(colorEntries, color));
3,637✔
1739
                }
1740
            }
1741
        }
1742
    }
1743

1744
    CPLFree(pasSrcX);
10,383✔
1745

1746
    return CE_None;
10,384✔
1747
}
1748

1749
/** Dispatch AVERAGE / RMS resampling of a chunk on the working data type.
 *
 * The destination buffer data type reported through peDstBufferDataType is
 * always args.eWrkDataType. Byte, UInt16, Float32 and Float64 working types
 * are forwarded to the matching GDALResampleChunk_AverageOrRMS_T
 * instantiation; any other type triggers CPLAssert(false) and yields
 * CE_Failure.
 *
 * @param args                 resampling arguments (working type, method, ...)
 * @param pChunk               source chunk, reinterpreted per working type
 * @param ppDstBuffer          forwarded to the typed implementation
 * @param peDstBufferDataType  set to args.eWrkDataType
 * @return the typed implementation's result, or CE_Failure if the working
 *         data type is not handled.
 */
static CPLErr
GDALResampleChunk_AverageOrRMS(const GDALOverviewResampleArgs &args,
                               const void *pChunk, void **ppDstBuffer,
                               GDALDataType *peDstBufferDataType)
{
    const GDALDataType eType = args.eWrkDataType;
    *peDstBufferDataType = eType;

    if (eType == GDT_Byte)
    {
        const GByte *pabySrc = static_cast<const GByte *>(pChunk);
        return GDALResampleChunk_AverageOrRMS_T<GByte, int, GDT_Byte>(
            args, pabySrc, ppDstBuffer);
    }

    if (eType == GDT_UInt16)
    {
        const GUInt16 *panSrc = static_cast<const GUInt16 *>(pChunk);
        if (!EQUAL(args.pszResampling, "RMS"))
        {
            return GDALResampleChunk_AverageOrRMS_T<GUInt16, GUInt32,
                                                    GDT_UInt16>(
                args, panSrc, ppDstBuffer);
        }
        // RMS sums squared samples: accumulate in double, because a UInt32
        // accumulator could overflow.
        return GDALResampleChunk_AverageOrRMS_T<GUInt16, double, GDT_UInt16>(
            args, panSrc, ppDstBuffer);
    }

    if (eType == GDT_Float32)
    {
        const float *pafSrc = static_cast<const float *>(pChunk);
        return GDALResampleChunk_AverageOrRMS_T<float, double, GDT_Float32>(
            args, pafSrc, ppDstBuffer);
    }

    if (eType == GDT_Float64)
    {
        const double *padfSrc = static_cast<const double *>(pChunk);
        return GDALResampleChunk_AverageOrRMS_T<double, double, GDT_Float64>(
            args, padfSrc, ppDstBuffer);
    }

    // Unsupported working data type.
    CPLAssert(false);
    return CE_Failure;
}
1800

1801
/************************************************************************/
1802
/*                     GDALResampleChunk_Gauss()                        */
1803
/************************************************************************/
1804

1805
static CPLErr GDALResampleChunk_Gauss(const GDALOverviewResampleArgs &args,
86✔
1806
                                      const void *pChunk, void **ppDstBuffer,
1807
                                      GDALDataType *peDstBufferDataType)
1808

1809
{
1810
    const double dfXRatioDstToSrc = args.dfXRatioDstToSrc;
86✔
1811
    const double dfYRatioDstToSrc = args.dfYRatioDstToSrc;
86✔
1812
    const GByte *pabyChunkNodataMask = args.pabyChunkNodataMask;
86✔
1813
    const int nChunkXOff = args.nChunkXOff;
86✔
1814
    const int nChunkXSize = args.nChunkXSize;
86✔
1815
    const int nChunkYOff = args.nChunkYOff;
86✔
1816
    const int nChunkYSize = args.nChunkYSize;
86✔
1817
    const int nDstXOff = args.nDstXOff;
86✔
1818
    const int nDstXOff2 = args.nDstXOff2;
86✔
1819
    const int nDstYOff = args.nDstYOff;
86✔
1820
    const int nDstYOff2 = args.nDstYOff2;
86✔
1821
    const bool bHasNoData = args.bHasNoData;
86✔
1822
    double dfNoDataValue = args.dfNoDataValue;
86✔
1823
    const GDALColorTable *poColorTable = args.poColorTable;
86✔
1824

1825
    const double *const padfChunk = static_cast<const double *>(pChunk);
86✔
1826

1827
    *ppDstBuffer =
86✔
1828
        VSI_MALLOC3_VERBOSE(nDstXOff2 - nDstXOff, nDstYOff2 - nDstYOff,
86✔
1829
                            GDALGetDataTypeSizeBytes(GDT_Float64));
1830
    if (*ppDstBuffer == nullptr)
86✔
1831
    {
1832
        return CE_Failure;
×
1833
    }
1834
    *peDstBufferDataType = GDT_Float64;
86✔
1835
    double *const padfDstBuffer = static_cast<double *>(*ppDstBuffer);
86✔
1836

1837
    /* -------------------------------------------------------------------- */
1838
    /*      Create the filter kernel and allocate scanline buffer.          */
1839
    /* -------------------------------------------------------------------- */
1840
    int nGaussMatrixDim = 3;
86✔
1841
    const int *panGaussMatrix;
1842
    constexpr int anGaussMatrix3x3[] = {1, 2, 1, 2, 4, 2, 1, 2, 1};
86✔
1843
    constexpr int anGaussMatrix5x5[] = {1,  4, 6,  4,  1,  4, 16, 24, 16,
86✔
1844
                                        4,  6, 24, 36, 24, 6, 4,  16, 24,
1845
                                        16, 4, 1,  4,  6,  4, 1};
1846
    constexpr int anGaussMatrix7x7[] = {
86✔
1847
        1,   6,  15, 20,  15,  6,   1,   6,  36, 90,  120, 90,  36,
1848
        6,   15, 90, 225, 300, 225, 90,  15, 20, 120, 300, 400, 300,
1849
        120, 20, 15, 90,  225, 300, 225, 90, 15, 6,   36,  90,  120,
1850
        90,  36, 6,  1,   6,   15,  20,  15, 6,  1};
1851

1852
    const int nOXSize = args.nOvrXSize;
86✔
1853
    const int nOYSize = args.nOvrYSize;
86✔
1854
    const int nResYFactor = static_cast<int>(0.5 + dfYRatioDstToSrc);
86✔
1855

1856
    // matrix for gauss filter
1857
    if (nResYFactor <= 2)
86✔
1858
    {
1859
        panGaussMatrix = anGaussMatrix3x3;
85✔
1860
        nGaussMatrixDim = 3;
85✔
1861
    }
1862
    else if (nResYFactor <= 4)
1✔
1863
    {
1864
        panGaussMatrix = anGaussMatrix5x5;
×
1865
        nGaussMatrixDim = 5;
×
1866
    }
1867
    else
1868
    {
1869
        panGaussMatrix = anGaussMatrix7x7;
1✔
1870
        nGaussMatrixDim = 7;
1✔
1871
    }
1872

1873
#ifdef DEBUG_OUT_OF_BOUND_ACCESS
1874
    int *panGaussMatrixDup = static_cast<int *>(
1875
        CPLMalloc(sizeof(int) * nGaussMatrixDim * nGaussMatrixDim));
1876
    memcpy(panGaussMatrixDup, panGaussMatrix,
1877
           sizeof(int) * nGaussMatrixDim * nGaussMatrixDim);
1878
    panGaussMatrix = panGaussMatrixDup;
1879
#endif
1880

1881
    if (!bHasNoData)
86✔
1882
        dfNoDataValue = 0.0;
79✔
1883

1884
    std::vector<GDALColorEntry> colorEntries;
86✔
1885
    int nTransparentIdx = -1;
86✔
1886
    if (poColorTable)
86✔
1887
        colorEntries = ReadColorTable(*poColorTable, nTransparentIdx);
2✔
1888

1889
    // Force c4 of nodata entry to 0 so that GDALFindBestEntry() identifies
1890
    // it as nodata value.
1891
    if (bHasNoData && dfNoDataValue >= 0.0f &&
92✔
1892
        dfNoDataValue < colorEntries.size())
6✔
1893
        colorEntries[static_cast<int>(dfNoDataValue)].c4 = 0;
×
1894

1895
    // Or if we have no explicit nodata, but a color table entry that is
1896
    // transparent, consider it as the nodata value.
1897
    else if (!bHasNoData && nTransparentIdx >= 0)
86✔
1898
    {
1899
        dfNoDataValue = nTransparentIdx;
×
1900
    }
1901

1902
    const int nChunkRightXOff = nChunkXOff + nChunkXSize;
86✔
1903
    const int nChunkBottomYOff = nChunkYOff + nChunkYSize;
86✔
1904
    const int nDstXWidth = nDstXOff2 - nDstXOff;
86✔
1905

1906
    /* ==================================================================== */
1907
    /*      Loop over destination scanlines.                                */
1908
    /* ==================================================================== */
1909
    for (int iDstLine = nDstYOff; iDstLine < nDstYOff2; ++iDstLine)
16,488✔
1910
    {
1911
        int nSrcYOff = static_cast<int>(0.5 + iDstLine * dfYRatioDstToSrc);
16,402✔
1912
        int nSrcYOff2 =
16,402✔
1913
            static_cast<int>(0.5 + (iDstLine + 1) * dfYRatioDstToSrc) + 1;
16,402✔
1914

1915
        if (nSrcYOff < nChunkYOff)
16,402✔
1916
        {
1917
            nSrcYOff = nChunkYOff;
×
1918
            nSrcYOff2++;
×
1919
        }
1920

1921
        const int iSizeY = nSrcYOff2 - nSrcYOff;
16,402✔
1922
        nSrcYOff = nSrcYOff + iSizeY / 2 - nGaussMatrixDim / 2;
16,402✔
1923
        nSrcYOff2 = nSrcYOff + nGaussMatrixDim;
16,402✔
1924

1925
        if (nSrcYOff2 > nChunkBottomYOff ||
16,402✔
1926
            (dfYRatioDstToSrc > 1 && iDstLine == nOYSize - 1))
16,359✔
1927
        {
1928
            nSrcYOff2 = std::min(nChunkBottomYOff, nSrcYOff + nGaussMatrixDim);
44✔
1929
        }
1930

1931
        int nYShiftGaussMatrix = 0;
16,402✔
1932
        if (nSrcYOff < nChunkYOff)
16,402✔
1933
        {
1934
            nYShiftGaussMatrix = -(nSrcYOff - nChunkYOff);
×
1935
            nSrcYOff = nChunkYOff;
×
1936
        }
1937

1938
        const double *const padfSrcScanline =
16,402✔
1939
            padfChunk + ((nSrcYOff - nChunkYOff) * nChunkXSize);
16,402✔
1940
        const GByte *pabySrcScanlineNodataMask = nullptr;
16,402✔
1941
        if (pabyChunkNodataMask != nullptr)
16,402✔
1942
            pabySrcScanlineNodataMask =
152✔
1943
                pabyChunkNodataMask + ((nSrcYOff - nChunkYOff) * nChunkXSize);
152✔
1944

1945
        /* --------------------------------------------------------------------
1946
         */
1947
        /*      Loop over destination pixels */
1948
        /* --------------------------------------------------------------------
1949
         */
1950
        double *const padfDstScanline =
16,402✔
1951
            padfDstBuffer + (iDstLine - nDstYOff) * nDstXWidth;
16,402✔
1952
        for (int iDstPixel = nDstXOff; iDstPixel < nDstXOff2; ++iDstPixel)
4,149,980✔
1953
        {
1954
            int nSrcXOff = static_cast<int>(0.5 + iDstPixel * dfXRatioDstToSrc);
4,133,580✔
1955
            int nSrcXOff2 =
4,133,580✔
1956
                static_cast<int>(0.5 + (iDstPixel + 1) * dfXRatioDstToSrc) + 1;
4,133,580✔
1957

1958
            if (nSrcXOff < nChunkXOff)
4,133,580✔
1959
            {
1960
                nSrcXOff = nChunkXOff;
×
1961
                nSrcXOff2++;
×
1962
            }
1963

1964
            const int iSizeX = nSrcXOff2 - nSrcXOff;
4,133,580✔
1965
            nSrcXOff = nSrcXOff + iSizeX / 2 - nGaussMatrixDim / 2;
4,133,580✔
1966
            nSrcXOff2 = nSrcXOff + nGaussMatrixDim;
4,133,580✔
1967

1968
            if (nSrcXOff2 > nChunkRightXOff ||
4,133,580✔
1969
                (dfXRatioDstToSrc > 1 && iDstPixel == nOXSize - 1))
4,127,930✔
1970
            {
1971
                nSrcXOff2 =
5,650✔
1972
                    std::min(nChunkRightXOff, nSrcXOff + nGaussMatrixDim);
5,650✔
1973
            }
1974

1975
            int nXShiftGaussMatrix = 0;
4,133,580✔
1976
            if (nSrcXOff < nChunkXOff)
4,133,580✔
1977
            {
1978
                nXShiftGaussMatrix = -(nSrcXOff - nChunkXOff);
×
1979
                nSrcXOff = nChunkXOff;
×
1980
            }
1981

1982
            if (poColorTable == nullptr)
4,133,580✔
1983
            {
1984
                double dfTotal = 0.0;
4,133,380✔
1985
                GInt64 nCount = 0;
4,133,380✔
1986
                const int *panLineWeight =
4,133,380✔
1987
                    panGaussMatrix + nYShiftGaussMatrix * nGaussMatrixDim +
4,133,380✔
1988
                    nXShiftGaussMatrix;
1989

1990
                for (int j = 0, iY = nSrcYOff; iY < nSrcYOff2;
16,527,900✔
1991
                     ++iY, ++j, panLineWeight += nGaussMatrixDim)
12,394,500✔
1992
                {
1993
                    for (int i = 0, iX = nSrcXOff; iX < nSrcXOff2; ++iX, ++i)
49,561,300✔
1994
                    {
1995
                        const double val =
37,166,800✔
1996
                            padfSrcScanline[iX - nChunkXOff +
37,166,800✔
1997
                                            static_cast<GPtrDiff_t>(iY -
37,166,800✔
1998
                                                                    nSrcYOff) *
37,166,800✔
1999
                                                nChunkXSize];
37,166,800✔
2000
                        if (pabySrcScanlineNodataMask == nullptr ||
37,166,800✔
2001
                            pabySrcScanlineNodataMask[iX - nChunkXOff +
32,872✔
2002
                                                      static_cast<GPtrDiff_t>(
32,872✔
2003
                                                          iY - nSrcYOff) *
32,872✔
2004
                                                          nChunkXSize])
32,872✔
2005
                        {
2006
                            const int nWeight = panLineWeight[i];
37,146,100✔
2007
                            dfTotal += val * nWeight;
37,146,100✔
2008
                            nCount += nWeight;
37,146,100✔
2009
                        }
2010
                    }
2011
                }
2012

2013
                if (nCount == 0)
4,133,380✔
2014
                {
2015
                    padfDstScanline[iDstPixel - nDstXOff] = dfNoDataValue;
2,217✔
2016
                }
2017
                else
2018
                {
2019
                    padfDstScanline[iDstPixel - nDstXOff] = dfTotal / nCount;
4,131,160✔
2020
                }
2021
            }
2022
            else
2023
            {
2024
                GInt64 nTotalR = 0;
200✔
2025
                GInt64 nTotalG = 0;
200✔
2026
                GInt64 nTotalB = 0;
200✔
2027
                GInt64 nTotalWeight = 0;
200✔
2028
                const int *panLineWeight =
200✔
2029
                    panGaussMatrix + nYShiftGaussMatrix * nGaussMatrixDim +
200✔
2030
                    nXShiftGaussMatrix;
2031

2032
                for (int j = 0, iY = nSrcYOff; iY < nSrcYOff2;
780✔
2033
                     ++iY, ++j, panLineWeight += nGaussMatrixDim)
580✔
2034
                {
2035
                    for (int i = 0, iX = nSrcXOff; iX < nSrcXOff2; ++iX, ++i)
2,262✔
2036
                    {
2037
                        const double val =
1,682✔
2038
                            padfSrcScanline[iX - nChunkXOff +
1,682✔
2039
                                            static_cast<GPtrDiff_t>(iY -
1,682✔
2040
                                                                    nSrcYOff) *
1,682✔
2041
                                                nChunkXSize];
1,682✔
2042
                        if (val < 0 || val >= colorEntries.size())
1,682✔
2043
                            continue;
×
2044

2045
                        size_t idx = static_cast<size_t>(val);
1,682✔
2046
                        if (colorEntries[idx].c4)
1,682✔
2047
                        {
2048
                            const int nWeight = panLineWeight[i];
1,682✔
2049
                            nTotalR +=
1,682✔
2050
                                static_cast<GInt64>(colorEntries[idx].c1) *
1,682✔
2051
                                nWeight;
1,682✔
2052
                            nTotalG +=
1,682✔
2053
                                static_cast<GInt64>(colorEntries[idx].c2) *
1,682✔
2054
                                nWeight;
1,682✔
2055
                            nTotalB +=
1,682✔
2056
                                static_cast<GInt64>(colorEntries[idx].c3) *
1,682✔
2057
                                nWeight;
1,682✔
2058
                            nTotalWeight += nWeight;
1,682✔
2059
                        }
2060
                    }
2061
                }
2062

2063
                if (nTotalWeight == 0)
200✔
2064
                {
2065
                    padfDstScanline[iDstPixel - nDstXOff] = dfNoDataValue;
×
2066
                }
2067
                else
2068
                {
2069
                    GDALColorEntry color;
2070

2071
                    color.c1 = static_cast<short>((nTotalR + nTotalWeight / 2) /
200✔
2072
                                                  nTotalWeight);
2073
                    color.c2 = static_cast<short>((nTotalG + nTotalWeight / 2) /
200✔
2074
                                                  nTotalWeight);
2075
                    color.c3 = static_cast<short>((nTotalB + nTotalWeight / 2) /
200✔
2076
                                                  nTotalWeight);
2077
                    padfDstScanline[iDstPixel - nDstXOff] =
200✔
2078
                        BestColorEntry(colorEntries, color);
200✔
2079
                }
2080
            }
2081
        }
2082
    }
2083

2084
#ifdef DEBUG_OUT_OF_BOUND_ACCESS
2085
    CPLFree(panGaussMatrixDup);
2086
#endif
2087

2088
    return CE_None;
86✔
2089
}
2090

2091
/************************************************************************/
2092
/*                      GDALResampleChunk_Mode()                        */
2093
/************************************************************************/
2094

2095
/** Tell whether two samples must be tallied as the same value by the mode
 *  resampler. Generic version: plain equality.
 */
template <class T> static inline bool IsSame(T a, T b)
{
    return a == b;
}

/** float version: two NaN values are considered identical, so that NaN
 *  samples are counted together instead of each being a distinct value.
 */
template <> bool IsSame<float>(float a, float b)
{
    return a == b || (std::isnan(a) && std::isnan(b));
}

/** double version: two NaN values are considered identical. */
template <> bool IsSame<double>(double a, double b)
{
    return a == b || (std::isnan(a) && std::isnan(b));
}

/** complex<float> version: real and imaginary parts are compared
 *  independently with the NaN-aware scalar comparison, so that a value with
 *  a NaN in only one component still compares equal to itself.
 */
template <>
bool IsSame<std::complex<float>>(std::complex<float> a, std::complex<float> b)
{
    return IsSame(a.real(), b.real()) && IsSame(a.imag(), b.imag());
}

/** complex<double> version: component-wise NaN-aware comparison. */
template <>
bool IsSame<std::complex<double>>(std::complex<double> a,
                                  std::complex<double> b)
{
    return IsSame(a.real(), b.real()) && IsSame(a.imag(), b.imag());
}
2124

2125
template <class T>
2126
static CPLErr GDALResampleChunk_ModeT(const GDALOverviewResampleArgs &args,
136✔
2127
                                      const T *pChunk, T *const pDstBuffer)
2128

2129
{
2130
    const double dfXRatioDstToSrc = args.dfXRatioDstToSrc;
136✔
2131
    const double dfYRatioDstToSrc = args.dfYRatioDstToSrc;
136✔
2132
    const double dfSrcXDelta = args.dfSrcXDelta;
136✔
2133
    const double dfSrcYDelta = args.dfSrcYDelta;
136✔
2134
    const GByte *pabyChunkNodataMask = args.pabyChunkNodataMask;
136✔
2135
    const int nChunkXOff = args.nChunkXOff;
136✔
2136
    const int nChunkXSize = args.nChunkXSize;
136✔
2137
    const int nChunkYOff = args.nChunkYOff;
136✔
2138
    const int nChunkYSize = args.nChunkYSize;
136✔
2139
    const int nDstXOff = args.nDstXOff;
136✔
2140
    const int nDstXOff2 = args.nDstXOff2;
136✔
2141
    const int nDstYOff = args.nDstYOff;
136✔
2142
    const int nDstYOff2 = args.nDstYOff2;
136✔
2143
    const bool bHasNoData = args.bHasNoData;
136✔
2144
    const GDALColorTable *poColorTable = args.poColorTable;
136✔
2145
    const int nDstXSize = nDstXOff2 - nDstXOff;
136✔
2146

2147
    T tNoDataValue;
8✔
2148
    if constexpr (std::is_same<T, std::complex<float>>::value ||
2149
                  std::is_same<T, std::complex<double>>::value)
2150
    {
2151
        using BaseT = typename T::value_type;
2152
        tNoDataValue =
8✔
2153
            std::complex<BaseT>(std::numeric_limits<BaseT>::quiet_NaN(),
2154
                                std::numeric_limits<BaseT>::quiet_NaN());
2155
    }
2156
    else if (!bHasNoData || !GDALIsValueInRange<T>(args.dfNoDataValue))
128✔
2157
        tNoDataValue = 0;
127✔
2158
    else
2159
        tNoDataValue = static_cast<T>(args.dfNoDataValue);
1✔
2160

2161
    size_t nMaxNumPx = 0;
136✔
2162
    T *paVals = nullptr;
136✔
2163
    int *panSums = nullptr;
136✔
2164

2165
    const int nChunkRightXOff = nChunkXOff + nChunkXSize;
136✔
2166
    const int nChunkBottomYOff = nChunkYOff + nChunkYSize;
136✔
2167
    std::vector<int> anVals(256, 0);
272✔
2168

2169
    /* ==================================================================== */
2170
    /*      Loop over destination scanlines.                                */
2171
    /* ==================================================================== */
2172
    for (int iDstLine = nDstYOff; iDstLine < nDstYOff2; ++iDstLine)
7,531✔
2173
    {
2174
        double dfSrcYOff = dfSrcYDelta + iDstLine * dfYRatioDstToSrc;
7,395✔
2175
        int nSrcYOff = static_cast<int>(dfSrcYOff + 1e-8);
7,395✔
2176
#ifdef only_pixels_with_more_than_10_pct_participation
2177
        // When oversampling, don't take into account pixels that have a tiny
2178
        // participation in the resulting pixel
2179
        if (dfYRatioDstToSrc > 1 && dfSrcYOff - nSrcYOff > 0.9 &&
2180
            nSrcYOff < nChunkBottomYOff)
2181
            nSrcYOff++;
2182
#endif
2183
        if (nSrcYOff < nChunkYOff)
7,395✔
2184
            nSrcYOff = nChunkYOff;
×
2185

2186
        double dfSrcYOff2 = dfSrcYDelta + (iDstLine + 1) * dfYRatioDstToSrc;
7,395✔
2187
        int nSrcYOff2 = static_cast<int>(ceil(dfSrcYOff2 - 1e-8));
7,395✔
2188
#ifdef only_pixels_with_more_than_10_pct_participation
2189
        // When oversampling, don't take into account pixels that have a tiny
2190
        // participation in the resulting pixel
2191
        if (dfYRatioDstToSrc > 1 && nSrcYOff2 - dfSrcYOff2 > 0.9 &&
2192
            nSrcYOff2 > nChunkYOff)
2193
            nSrcYOff2--;
2194
#endif
2195
        if (nSrcYOff2 == nSrcYOff)
7,395✔
2196
            ++nSrcYOff2;
×
2197
        if (nSrcYOff2 > nChunkBottomYOff)
7,395✔
2198
            nSrcYOff2 = nChunkBottomYOff;
×
2199

2200
        const T *const paSrcScanline =
7,395✔
2201
            pChunk +
149✔
2202
            (static_cast<GPtrDiff_t>(nSrcYOff - nChunkYOff) * nChunkXSize);
7,395✔
2203
        const GByte *pabySrcScanlineNodataMask = nullptr;
7,395✔
2204
        if (pabyChunkNodataMask != nullptr)
7,395✔
2205
            pabySrcScanlineNodataMask =
1,810✔
2206
                pabyChunkNodataMask +
2207
                static_cast<GPtrDiff_t>(nSrcYOff - nChunkYOff) * nChunkXSize;
1,810✔
2208

2209
        T *const paDstScanline = pDstBuffer + (iDstLine - nDstYOff) * nDstXSize;
7,395✔
2210
        /* --------------------------------------------------------------------
2211
         */
2212
        /*      Loop over destination pixels */
2213
        /* --------------------------------------------------------------------
2214
         */
2215
        for (int iDstPixel = nDstXOff; iDstPixel < nDstXOff2; ++iDstPixel)
4,259,580✔
2216
        {
2217
            double dfSrcXOff = dfSrcXDelta + iDstPixel * dfXRatioDstToSrc;
4,252,187✔
2218
            // Apply some epsilon to avoid numerical precision issues
2219
            int nSrcXOff = static_cast<int>(dfSrcXOff + 1e-8);
4,252,187✔
2220
#ifdef only_pixels_with_more_than_10_pct_participation
2221
            // When oversampling, don't take into account pixels that have a
2222
            // tiny participation in the resulting pixel
2223
            if (dfXRatioDstToSrc > 1 && dfSrcXOff - nSrcXOff > 0.9 &&
2224
                nSrcXOff < nChunkRightXOff)
2225
                nSrcXOff++;
2226
#endif
2227
            if (nSrcXOff < nChunkXOff)
4,252,187✔
2228
                nSrcXOff = nChunkXOff;
×
2229

2230
            double dfSrcXOff2 =
4,252,187✔
2231
                dfSrcXDelta + (iDstPixel + 1) * dfXRatioDstToSrc;
4,252,187✔
2232
            int nSrcXOff2 = static_cast<int>(ceil(dfSrcXOff2 - 1e-8));
4,252,187✔
2233
#ifdef only_pixels_with_more_than_10_pct_participation
2234
            // When oversampling, don't take into account pixels that have a
2235
            // tiny participation in the resulting pixel
2236
            if (dfXRatioDstToSrc > 1 && nSrcXOff2 - dfSrcXOff2 > 0.9 &&
2237
                nSrcXOff2 > nChunkXOff)
2238
                nSrcXOff2--;
2239
#endif
2240
            if (nSrcXOff2 == nSrcXOff)
4,252,187✔
2241
                nSrcXOff2++;
×
2242
            if (nSrcXOff2 > nChunkRightXOff)
4,252,187✔
2243
                nSrcXOff2 = nChunkRightXOff;
×
2244

2245
            bool bRegularProcessing = false;
4,252,187✔
2246
            if constexpr (!std::is_same<T, GByte>::value)
2247
                bRegularProcessing = true;
827✔
2248
            else if (poColorTable && poColorTable->GetColorEntryCount() > 256)
4,251,360✔
2249
                bRegularProcessing = true;
×
2250

2251
            if (bRegularProcessing)
4,252,187✔
2252
            {
2253
                // Not sure how much sense it makes to run a majority
2254
                // filter on floating point data, but here it is for the sake
2255
                // of compatibility. It won't look right on RGB images by the
2256
                // nature of the filter.
2257

2258
                if (nSrcYOff2 - nSrcYOff <= 0 || nSrcXOff2 - nSrcXOff <= 0 ||
827✔
2259
                    nSrcYOff2 - nSrcYOff > INT_MAX / (nSrcXOff2 - nSrcXOff) ||
2,481✔
2260
                    static_cast<size_t>(nSrcYOff2 - nSrcYOff) *
827✔
2261
                            static_cast<size_t>(nSrcXOff2 - nSrcXOff) >
827✔
2262
                        std::numeric_limits<size_t>::max() / sizeof(float))
827✔
2263
                {
2264
                    CPLError(CE_Failure, CPLE_NotSupported,
×
2265
                             "Too big downsampling factor");
2266
                    CPLFree(paVals);
×
2267
                    CPLFree(panSums);
×
2268
                    return CE_Failure;
×
2269
                }
2270
                const size_t nNumPx =
827✔
2271
                    static_cast<size_t>(nSrcYOff2 - nSrcYOff) *
827✔
2272
                    static_cast<size_t>(nSrcXOff2 - nSrcXOff);
827✔
2273
                size_t iMaxInd = 0;
827✔
2274
                size_t iMaxVal = 0;
827✔
2275
                bool biMaxValdValid = false;
827✔
2276

2277
                if (paVals == nullptr || nNumPx > nMaxNumPx)
827✔
2278
                {
2279
                    T *paValsNew = static_cast<T *>(
2280
                        VSI_REALLOC_VERBOSE(paVals, nNumPx * sizeof(T)));
71✔
2281
                    int *panSumsNew = static_cast<int *>(
2282
                        VSI_REALLOC_VERBOSE(panSums, nNumPx * sizeof(int)));
71✔
2283
                    if (paValsNew != nullptr)
71✔
2284
                        paVals = paValsNew;
71✔
2285
                    if (panSumsNew != nullptr)
71✔
2286
                        panSums = panSumsNew;
71✔
2287
                    if (paValsNew == nullptr || panSumsNew == nullptr)
71✔
2288
                    {
2289
                        CPLFree(paVals);
×
2290
                        CPLFree(panSums);
×
2291
                        return CE_Failure;
×
2292
                    }
2293
                    nMaxNumPx = nNumPx;
71✔
2294
                }
2295

2296
                for (int iY = nSrcYOff; iY < nSrcYOff2; ++iY)
2,585✔
2297
                {
2298
                    const GPtrDiff_t iTotYOff =
1,758✔
2299
                        static_cast<GPtrDiff_t>(iY - nSrcYOff) * nChunkXSize -
1,758✔
2300
                        nChunkXOff;
1,758✔
2301
                    for (int iX = nSrcXOff; iX < nSrcXOff2; ++iX)
5,690✔
2302
                    {
2303
                        if (pabySrcScanlineNodataMask == nullptr ||
3,932✔
2304
                            pabySrcScanlineNodataMask[iX + iTotYOff])
16✔
2305
                        {
2306
                            const T val = paSrcScanline[iX + iTotYOff];
3,917✔
2307
                            size_t i = 0;  // Used after for.
3,917✔
2308

2309
                            // Check array for existing entry.
2310
                            for (; i < iMaxInd; ++i)
14,387✔
2311
                                if (IsSame(paVals[i], val) &&
17,626✔
2312
                                    ++panSums[i] > panSums[iMaxVal])
6,910✔
2313
                                {
2314
                                    iMaxVal = i;
246✔
2315
                                    biMaxValdValid = true;
246✔
2316
                                    break;
246✔
2317
                                }
2318

2319
                            // Add to arr if entry not already there.
2320
                            if (i == iMaxInd)
3,917✔
2321
                            {
2322
                                paVals[iMaxInd] = val;
3,671✔
2323
                                panSums[iMaxInd] = 1;
3,671✔
2324

2325
                                if (!biMaxValdValid)
3,671✔
2326
                                {
2327
                                    iMaxVal = iMaxInd;
824✔
2328
                                    biMaxValdValid = true;
824✔
2329
                                }
2330

2331
                                ++iMaxInd;
3,671✔
2332
                            }
2333
                        }
2334
                    }
2335
                }
2336

2337
                if (!biMaxValdValid)
827✔
2338
                    paDstScanline[iDstPixel - nDstXOff] = tNoDataValue;
3✔
2339
                else
2340
                    paDstScanline[iDstPixel - nDstXOff] = paVals[iMaxVal];
824✔
2341
            }
2342
            else if constexpr (std::is_same<T, GByte>::value)
2343
            // ( eSrcDataType == GDT_Byte && nEntryCount < 256 )
2344
            {
2345
                // So we go here for a paletted or non-paletted byte band.
2346
                // The input values are then between 0 and 255.
2347
                int nMaxVal = 0;
4,251,360✔
2348
                int iMaxInd = -1;
4,251,360✔
2349

2350
                // The cost of this zeroing might be high. Perhaps we should
2351
                // just use the above generic case, and go to this one if the
2352
                // number of source pixels is large enough
2353
                std::fill(anVals.begin(), anVals.end(), 0);
4,251,360✔
2354

2355
                for (int iY = nSrcYOff; iY < nSrcYOff2; ++iY)
12,777,700✔
2356
                {
2357
                    const GPtrDiff_t iTotYOff =
8,526,370✔
2358
                        static_cast<GPtrDiff_t>(iY - nSrcYOff) * nChunkXSize -
8,526,370✔
2359
                        nChunkXOff;
8,526,370✔
2360
                    for (int iX = nSrcXOff; iX < nSrcXOff2; ++iX)
25,649,400✔
2361
                    {
2362
                        const T val = paSrcScanline[iX + iTotYOff];
17,123,000✔
2363
                        if (!bHasNoData || val != tNoDataValue)
17,123,000✔
2364
                        {
2365
                            int nVal = static_cast<int>(val);
17,123,000✔
2366
                            if (++anVals[nVal] > nMaxVal)
17,123,000✔
2367
                            {
2368
                                // Sum the density.
2369
                                // Is it the most common value so far?
2370
                                iMaxInd = nVal;
17,006,300✔
2371
                                nMaxVal = anVals[nVal];
17,006,300✔
2372
                            }
2373
                        }
2374
                    }
2375
                }
2376

2377
                if (iMaxInd == -1)
4,251,360✔
2378
                    paDstScanline[iDstPixel - nDstXOff] = tNoDataValue;
×
2379
                else
2380
                    paDstScanline[iDstPixel - nDstXOff] =
4,251,360✔
2381
                        static_cast<T>(iMaxInd);
2382
            }
2383
        }
2384
    }
2385

2386
    CPLFree(paVals);
136✔
2387
    CPLFree(panSums);
136✔
2388

2389
    return CE_None;
136✔
2390
}
2391

2392
/**
 * Entry point of the mode resampler: allocates the destination buffer and
 * dispatches to GDALResampleChunk_ModeT() according to the working data type.
 *
 * @param args                 resampling parameters; the destination window
 *                             and args.eWrkDataType determine the size of
 *                             the buffer that is allocated.
 * @param pChunk               source samples of type args.eWrkDataType.
 * @param ppDstBuffer          receives the newly allocated output buffer
 *                             (nullptr if the allocation failed).
 * @param peDstBufferDataType  receives args.eWrkDataType once the buffer has
 *                             been allocated.
 * @return the result of the typed implementation, or CE_Failure if the
 *         buffer cannot be allocated or the data type is not handled.
 */
static CPLErr GDALResampleChunk_Mode(const GDALOverviewResampleArgs &args,
                                     const void *pChunk, void **ppDstBuffer,
                                     GDALDataType *peDstBufferDataType)
{
    const GDALDataType eType = args.eWrkDataType;
    const int nDstWidth = args.nDstXOff2 - args.nDstXOff;
    const int nDstHeight = args.nDstYOff2 - args.nDstYOff;

    void *const pDst = VSI_MALLOC3_VERBOSE(nDstWidth, nDstHeight,
                                           GDALGetDataTypeSizeBytes(eType));
    *ppDstBuffer = pDst;
    if (pDst == nullptr)
        return CE_Failure;

    CPLAssert(args.eSrcDataType == args.eWrkDataType);

    *peDstBufferDataType = eType;

    // Mode resampling does not compute anything from the samples, so most
    // types are processed through an unsigned integer of the same size.
    // Exceptions: Byte has a dedicated histogram path, and floating point
    // and complex floating point types go through their own instantiation.
    switch (eType)
    {
        case GDT_Byte:
            return GDALResampleChunk_ModeT(args,
                                           static_cast<const GByte *>(pChunk),
                                           static_cast<GByte *>(pDst));

        case GDT_Int8:
            return GDALResampleChunk_ModeT(args,
                                           static_cast<const int8_t *>(pChunk),
                                           static_cast<int8_t *>(pDst));

        // 2-byte samples
        case GDT_Int16:
        case GDT_UInt16:
        case GDT_Float16:
            CPLAssert(GDALGetDataTypeSizeBytes(args.eWrkDataType) == 2);
            return GDALResampleChunk_ModeT(
                args, static_cast<const uint16_t *>(pChunk),
                static_cast<uint16_t *>(pDst));

        // 4-byte samples without floating point semantics
        case GDT_CInt16:
        case GDT_CFloat16:
        case GDT_Int32:
        case GDT_UInt32:
            CPLAssert(GDALGetDataTypeSizeBytes(args.eWrkDataType) == 4);
            return GDALResampleChunk_ModeT(
                args, static_cast<const uint32_t *>(pChunk),
                static_cast<uint32_t *>(pDst));

        case GDT_Float32:
            CPLAssert(GDALGetDataTypeSizeBytes(args.eWrkDataType) == 4);
            return GDALResampleChunk_ModeT(args,
                                           static_cast<const float *>(pChunk),
                                           static_cast<float *>(pDst));

        // 8-byte samples without floating point semantics
        case GDT_CInt32:
        case GDT_Int64:
        case GDT_UInt64:
            CPLAssert(GDALGetDataTypeSizeBytes(args.eWrkDataType) == 8);
            return GDALResampleChunk_ModeT(
                args, static_cast<const uint64_t *>(pChunk),
                static_cast<uint64_t *>(pDst));

        case GDT_Float64:
            CPLAssert(GDALGetDataTypeSizeBytes(args.eWrkDataType) == 8);
            return GDALResampleChunk_ModeT(args,
                                           static_cast<const double *>(pChunk),
                                           static_cast<double *>(pDst));

        case GDT_CFloat32:
            return GDALResampleChunk_ModeT(
                args, static_cast<const std::complex<float> *>(pChunk),
                static_cast<std::complex<float> *>(pDst));

        case GDT_CFloat64:
            return GDALResampleChunk_ModeT(
                args, static_cast<const std::complex<double> *>(pChunk),
                static_cast<std::complex<double> *>(pDst));

        case GDT_Unknown:
        case GDT_TypeCount:
            break;
    }

    // Not a valid working data type.
    CPLAssert(false);
    return CE_Failure;
}
2495

2496
/************************************************************************/
2497
/*                  GDALResampleConvolutionHorizontal()                 */
2498
/************************************************************************/
2499

2500
/** Weighted sum of nSrcPixelCount consecutive samples of a line.
 *
 * @param pChunk         first sample.
 * @param padfWeights    one weight per sample.
 * @param nSrcPixelCount number of samples/weights to combine.
 * @return sum of pChunk[k] * padfWeights[k] for k in [0, nSrcPixelCount).
 */
template <class T>
static inline double
GDALResampleConvolutionHorizontal(const T *pChunk, const double *padfWeights,
                                  int nSrcPixelCount)
{
    // Two independent partial sums, added together at the end.
    double dfSumA = 0.0;
    double dfSumB = 0.0;
    int iPixel = 0;
    // Intel Compiler 2024.0.2.29 (maybe other versions?) crashes on this
    // manually (untypical) unrolled loop in -O2 and -O3:
    // https://github.com/OSGeo/gdal/issues/9508
#if !defined(__INTEL_CLANG_COMPILER)
    while (iPixel + 3 < nSrcPixelCount)
    {
        const T *const pSrc = pChunk + iPixel;
        const double *const pdfW = padfWeights + iPixel;
        dfSumA += pSrc[0] * pdfW[0];
        dfSumA += pSrc[1] * pdfW[1];
        dfSumB += pSrc[2] * pdfW[2];
        dfSumB += pSrc[3] * pdfW[3];
        iPixel += 4;
    }
#endif
    // Remaining samples (all of them when the unrolled loop is disabled).
    while (iPixel < nSrcPixelCount)
    {
        dfSumA += pChunk[iPixel] * padfWeights[iPixel];
        ++iPixel;
    }
    return dfSumA + dfSumB;
}
2526

2527
template <class T>
2528
static inline void GDALResampleConvolutionHorizontalWithMask(
48✔
2529
    const T *pChunk, const GByte *pabyMask, const double *padfWeights,
2530
    int nSrcPixelCount, double &dfVal, double &dfWeightSum)
2531
{
2532
    dfVal = 0;
48✔
2533
    dfWeightSum = 0;
48✔
2534
    int i = 0;
48✔
2535
    for (; i + 3 < nSrcPixelCount; i += 4)
48✔
2536
    {
2537
        const double dfWeight0 = padfWeights[i] * pabyMask[i];
×
2538
        const double dfWeight1 = padfWeights[i + 1] * pabyMask[i + 1];
×
2539
        const double dfWeight2 = padfWeights[i + 2] * pabyMask[i + 2];
×
2540
        const double dfWeight3 = padfWeights[i + 3] * pabyMask[i + 3];
×
2541
        dfVal += pChunk[i] * dfWeight0;
×
2542
        dfVal += pChunk[i + 1] * dfWeight1;
×
2543
        dfVal += pChunk[i + 2] * dfWeight2;
×
2544
        dfVal += pChunk[i + 3] * dfWeight3;
×
2545
        dfWeightSum += dfWeight0 + dfWeight1 + dfWeight2 + dfWeight3;
×
2546
    }
2547
    for (; i < nSrcPixelCount; ++i)
178✔
2548
    {
2549
        const double dfWeight = padfWeights[i] * pabyMask[i];
130✔
2550
        dfVal += pChunk[i] * dfWeight;
130✔
2551
        dfWeightSum += dfWeight;
130✔
2552
    }
2553
}
48✔
2554

2555
/** Same weighted sum as GDALResampleConvolutionHorizontal(), computed for
 *  three lines at once with a shared set of weights.
 *
 * @param pChunkRow1     first sample of the first line.
 * @param pChunkRow2     first sample of the second line.
 * @param pChunkRow3     first sample of the third line.
 * @param padfWeights    one weight per sample, common to the three lines.
 * @param nSrcPixelCount number of samples per line to combine.
 * @param[out] dfRes1    weighted sum of the first line.
 * @param[out] dfRes2    weighted sum of the second line.
 * @param[out] dfRes3    weighted sum of the third line.
 */
template <class T>
static inline void GDALResampleConvolutionHorizontal_3rows(
    const T *pChunkRow1, const T *pChunkRow2, const T *pChunkRow3,
    const double *padfWeights, int nSrcPixelCount, double &dfRes1,
    double &dfRes2, double &dfRes3)
{
    // Two partial sums per line (A: samples 0-1 of each group of four and
    // the trailing samples, B: samples 2-3 of each group).
    double dfRow1A = 0.0;
    double dfRow1B = 0.0;
    double dfRow2A = 0.0;
    double dfRow2B = 0.0;
    double dfRow3A = 0.0;
    double dfRow3B = 0.0;

    int iPixel = 0;
    while (iPixel + 3 < nSrcPixelCount)
    {
        const double dfW0 = padfWeights[iPixel];
        const double dfW1 = padfWeights[iPixel + 1];
        const double dfW2 = padfWeights[iPixel + 2];
        const double dfW3 = padfWeights[iPixel + 3];

        dfRow1A += pChunkRow1[iPixel] * dfW0;
        dfRow1A += pChunkRow1[iPixel + 1] * dfW1;
        dfRow1B += pChunkRow1[iPixel + 2] * dfW2;
        dfRow1B += pChunkRow1[iPixel + 3] * dfW3;

        dfRow2A += pChunkRow2[iPixel] * dfW0;
        dfRow2A += pChunkRow2[iPixel + 1] * dfW1;
        dfRow2B += pChunkRow2[iPixel + 2] * dfW2;
        dfRow2B += pChunkRow2[iPixel + 3] * dfW3;

        dfRow3A += pChunkRow3[iPixel] * dfW0;
        dfRow3A += pChunkRow3[iPixel + 1] * dfW1;
        dfRow3B += pChunkRow3[iPixel + 2] * dfW2;
        dfRow3B += pChunkRow3[iPixel + 3] * dfW3;

        iPixel += 4;
    }
    // Up to 3 trailing samples.
    while (iPixel < nSrcPixelCount)
    {
        const double dfW = padfWeights[iPixel];
        dfRow1A += pChunkRow1[iPixel] * dfW;
        dfRow2A += pChunkRow2[iPixel] * dfW;
        dfRow3A += pChunkRow3[iPixel] * dfW;
        ++iPixel;
    }

    dfRes1 = dfRow1A + dfRow1B;
    dfRes2 = dfRow2A + dfRow2B;
    dfRes3 = dfRow3A + dfRow3B;
}
1,330,334✔
2593

2594
/** Generic variant of the 3-line horizontal convolution: it only forwards to
 *  GDALResampleConvolutionHorizontal_3rows().
 *
 *  NOTE(review): the name suggests callers use it when nSrcPixelCount is
 *  below 8; nothing is enforced here - confirm against the callers.
 */
template <class T>
static inline void GDALResampleConvolutionHorizontalPixelCountLess8_3rows(
    const T *pChunkRow1, const T *pChunkRow2, const T *pChunkRow3,
    const double *padfWeights, int nSrcPixelCount, double &dfRes1,
    double &dfRes2, double &dfRes3)
{
    // No dedicated implementation for the generic case.
    GDALResampleConvolutionHorizontal_3rows(
        pChunkRow1, pChunkRow2, pChunkRow3,  // the three source lines
        padfWeights, nSrcPixelCount,         // shared weights
        dfRes1, dfRes2, dfRes3);             // one result per line
}
18,188✔
2604

2605
/** Generic variant of the 3-line horizontal convolution for exactly 4
 *  samples per line: forwards to GDALResampleConvolutionHorizontal_3rows()
 *  with a pixel count of 4.
 */
template <class T>
static inline void GDALResampleConvolutionHorizontalPixelCount4_3rows(
    const T *pChunkRow1, const T *pChunkRow2, const T *pChunkRow3,
    const double *padfWeights, double &dfRes1, double &dfRes2, double &dfRes3)
{
    constexpr int nSrcPixelCount = 4;
    GDALResampleConvolutionHorizontal_3rows(
        pChunkRow1, pChunkRow2, pChunkRow3,  // the three source lines
        padfWeights, nSrcPixelCount,         // shared weights
        dfRes1, dfRes2, dfRes3);             // one result per line
}
1,247,346✔
2614

2615
/************************************************************************/
2616
/*                  GDALResampleConvolutionVertical()                   */
2617
/************************************************************************/
2618

2619
/** Weighted sum of one column of samples.
 *
 * @param pChunk        sample of the column on the first line.
 * @param nStride       distance, in samples, between two consecutive lines.
 * @param padfWeights   one weight per line.
 * @param nSrcLineCount number of lines to combine.
 * @return sum of pChunk[k * nStride] * padfWeights[k] for k in
 *         [0, nSrcLineCount).
 */
template <class T>
static inline double
GDALResampleConvolutionVertical(const T *pChunk, int nStride,
                                const double *padfWeights, int nSrcLineCount)
{
    // Two independent partial sums, added together at the end.
    double dfSumA = 0.0;
    double dfSumB = 0.0;
    int iLine = 0;    // index in padfWeights
    int iSample = 0;  // index in pChunk, i.e. iLine * nStride

    // Main loop, 4 lines at a time.
    while (iLine + 3 < nSrcLineCount)
    {
        dfSumA += pChunk[iSample] * padfWeights[iLine];
        dfSumA += pChunk[iSample + nStride] * padfWeights[iLine + 1];
        dfSumB += pChunk[iSample + 2 * nStride] * padfWeights[iLine + 2];
        dfSumB += pChunk[iSample + 3 * nStride] * padfWeights[iLine + 3];
        iLine += 4;
        iSample += 4 * nStride;
    }
    // Up to 3 trailing lines.
    while (iLine < nSrcLineCount)
    {
        dfSumA += pChunk[iSample] * padfWeights[iLine];
        ++iLine;
        iSample += nStride;
    }
    return dfSumA + dfSumB;
}
2641

2642
/** Weighted sum of two adjacent columns of samples, computed together.
 *
 * @param pChunk        sample of the first column on the first line; the
 *                      second column is the sample that follows it.
 * @param nStride       distance, in samples, between two consecutive lines.
 * @param padfWeights   one weight per line, common to both columns.
 * @param nSrcLineCount number of lines to combine.
 * @param[out] dfRes1   weighted sum of the first column.
 * @param[out] dfRes2   weighted sum of the second column.
 */
template <class T>
static inline void GDALResampleConvolutionVertical_2cols(
    const T *pChunk, int nStride, const double *padfWeights, int nSrcLineCount,
    double &dfRes1, double &dfRes2)
{
    // Two partial sums per column (A: lines 0-1 of each group of four and
    // the trailing lines, B: lines 2-3 of each group).
    double dfCol1A = 0.0;
    double dfCol1B = 0.0;
    double dfCol2A = 0.0;
    double dfCol2B = 0.0;
    int iLine = 0;    // index in padfWeights
    int iSample = 0;  // index in pChunk, i.e. iLine * nStride

    while (iLine + 3 < nSrcLineCount)
    {
        const double dfW0 = padfWeights[iLine];
        const double dfW1 = padfWeights[iLine + 1];
        const double dfW2 = padfWeights[iLine + 2];
        const double dfW3 = padfWeights[iLine + 3];

        dfCol1A += pChunk[iSample] * dfW0;
        dfCol2A += pChunk[iSample + 1] * dfW0;
        dfCol1A += pChunk[iSample + nStride] * dfW1;
        dfCol2A += pChunk[iSample + 1 + nStride] * dfW1;
        dfCol1B += pChunk[iSample + 2 * nStride] * dfW2;
        dfCol2B += pChunk[iSample + 1 + 2 * nStride] * dfW2;
        dfCol1B += pChunk[iSample + 3 * nStride] * dfW3;
        dfCol2B += pChunk[iSample + 1 + 3 * nStride] * dfW3;

        iLine += 4;
        iSample += 4 * nStride;
    }
    // Up to 3 trailing lines.
    while (iLine < nSrcLineCount)
    {
        const double dfW = padfWeights[iLine];
        dfCol1A += pChunk[iSample] * dfW;
        dfCol2A += pChunk[iSample + 1] * dfW;
        ++iLine;
        iSample += nStride;
    }

    dfRes1 = dfCol1A + dfCol1B;
    dfRes2 = dfCol2A + dfCol2B;
}
2,880,000✔
2672

2673
#ifdef USE_SSE2
2674

2675
#ifdef __AVX__
2676
/************************************************************************/
2677
/*             GDALResampleConvolutionVertical_16cols<T>                */
2678
/************************************************************************/
2679

2680
template <class T>
2681
static inline void
2682
GDALResampleConvolutionVertical_16cols(const T *pChunk, int nStride,
2683
                                       const double *padfWeights,
2684
                                       int nSrcLineCount, float *afDest)
2685
{
2686
    int i = 0;
2687
    int j = 0;
2688
    XMMReg4Double v_acc0 = XMMReg4Double::Zero();
2689
    XMMReg4Double v_acc1 = XMMReg4Double::Zero();
2690
    XMMReg4Double v_acc2 = XMMReg4Double::Zero();
2691
    XMMReg4Double v_acc3 = XMMReg4Double::Zero();
2692
    for (; i + 3 < nSrcLineCount; i += 4, j += 4 * nStride)
2693
    {
2694
        XMMReg4Double w0 =
2695
            XMMReg4Double::Load1ValHighAndLow(padfWeights + i + 0);
2696
        XMMReg4Double w1 =
2697
            XMMReg4Double::Load1ValHighAndLow(padfWeights + i + 1);
2698
        XMMReg4Double w2 =
2699
            XMMReg4Double::Load1ValHighAndLow(padfWeights + i + 2);
2700
        XMMReg4Double w3 =
2701
            XMMReg4Double::Load1ValHighAndLow(padfWeights + i + 3);
2702
        v_acc0 += XMMReg4Double::Load4Val(pChunk + j + 0 + 0 * nStride) * w0;
2703
        v_acc1 += XMMReg4Double::Load4Val(pChunk + j + 4 + 0 * nStride) * w0;
2704
        v_acc2 += XMMReg4Double::Load4Val(pChunk + j + 8 + 0 * nStride) * w0;
2705
        v_acc3 += XMMReg4Double::Load4Val(pChunk + j + 12 + 0 * nStride) * w0;
2706
        v_acc0 += XMMReg4Double::Load4Val(pChunk + j + 0 + 1 * nStride) * w1;
2707
        v_acc1 += XMMReg4Double::Load4Val(pChunk + j + 4 + 1 * nStride) * w1;
2708
        v_acc2 += XMMReg4Double::Load4Val(pChunk + j + 8 + 1 * nStride) * w1;
2709
        v_acc3 += XMMReg4Double::Load4Val(pChunk + j + 12 + 1 * nStride) * w1;
2710
        v_acc0 += XMMReg4Double::Load4Val(pChunk + j + 0 + 2 * nStride) * w2;
2711
        v_acc1 += XMMReg4Double::Load4Val(pChunk + j + 4 + 2 * nStride) * w2;
2712
        v_acc2 += XMMReg4Double::Load4Val(pChunk + j + 8 + 2 * nStride) * w2;
2713
        v_acc3 += XMMReg4Double::Load4Val(pChunk + j + 12 + 2 * nStride) * w2;
2714
        v_acc0 += XMMReg4Double::Load4Val(pChunk + j + 0 + 3 * nStride) * w3;
2715
        v_acc1 += XMMReg4Double::Load4Val(pChunk + j + 4 + 3 * nStride) * w3;
2716
        v_acc2 += XMMReg4Double::Load4Val(pChunk + j + 8 + 3 * nStride) * w3;
2717
        v_acc3 += XMMReg4Double::Load4Val(pChunk + j + 12 + 3 * nStride) * w3;
2718
    }
2719
    for (; i < nSrcLineCount; ++i, j += nStride)
2720
    {
2721
        XMMReg4Double w = XMMReg4Double::Load1ValHighAndLow(padfWeights + i);
2722
        v_acc0 += XMMReg4Double::Load4Val(pChunk + j + 0) * w;
2723
        v_acc1 += XMMReg4Double::Load4Val(pChunk + j + 4) * w;
2724
        v_acc2 += XMMReg4Double::Load4Val(pChunk + j + 8) * w;
2725
        v_acc3 += XMMReg4Double::Load4Val(pChunk + j + 12) * w;
2726
    }
2727
    v_acc0.Store4Val(afDest);
2728
    v_acc1.Store4Val(afDest + 4);
2729
    v_acc2.Store4Val(afDest + 8);
2730
    v_acc3.Store4Val(afDest + 12);
2731
}
2732

2733
template <class T>
2734
static inline void GDALResampleConvolutionVertical_16cols(const T *, int,
2735
                                                          const double *, int,
2736
                                                          double *)
2737
{
2738
    // Cannot be reached
2739
    CPLAssert(false);
2740
}
2741

2742
#else
2743

2744
/************************************************************************/
2745
/*              GDALResampleConvolutionVertical_8cols<T>                */
2746
/************************************************************************/
2747

2748
template <class T>
2749
static inline void
2750
GDALResampleConvolutionVertical_8cols(const T *pChunk, int nStride,
18,850,300✔
2751
                                      const double *padfWeights,
2752
                                      int nSrcLineCount, float *afDest)
2753
{
2754
    int i = 0;
18,850,300✔
2755
    int j = 0;
18,850,300✔
2756
    XMMReg4Double v_acc0 = XMMReg4Double::Zero();
18,850,300✔
2757
    XMMReg4Double v_acc1 = XMMReg4Double::Zero();
18,717,000✔
2758
    for (; i + 3 < nSrcLineCount; i += 4, j += 4 * nStride)
34,237,400✔
2759
    {
2760
        XMMReg4Double w0 =
15,403,700✔
2761
            XMMReg4Double::Load1ValHighAndLow(padfWeights + i + 0);
15,403,700✔
2762
        XMMReg4Double w1 =
15,351,900✔
2763
            XMMReg4Double::Load1ValHighAndLow(padfWeights + i + 1);
15,351,900✔
2764
        XMMReg4Double w2 =
15,331,100✔
2765
            XMMReg4Double::Load1ValHighAndLow(padfWeights + i + 2);
15,331,100✔
2766
        XMMReg4Double w3 =
15,359,600✔
2767
            XMMReg4Double::Load1ValHighAndLow(padfWeights + i + 3);
15,359,600✔
2768
        v_acc0 += XMMReg4Double::Load4Val(pChunk + j + 0 + 0 * nStride) * w0;
15,360,700✔
2769
        v_acc1 += XMMReg4Double::Load4Val(pChunk + j + 4 + 0 * nStride) * w0;
15,359,600✔
2770
        v_acc0 += XMMReg4Double::Load4Val(pChunk + j + 0 + 1 * nStride) * w1;
15,368,600✔
2771
        v_acc1 += XMMReg4Double::Load4Val(pChunk + j + 4 + 1 * nStride) * w1;
15,396,800✔
2772
        v_acc0 += XMMReg4Double::Load4Val(pChunk + j + 0 + 2 * nStride) * w2;
15,406,800✔
2773
        v_acc1 += XMMReg4Double::Load4Val(pChunk + j + 4 + 2 * nStride) * w2;
15,386,200✔
2774
        v_acc0 += XMMReg4Double::Load4Val(pChunk + j + 0 + 3 * nStride) * w3;
15,395,900✔
2775
        v_acc1 += XMMReg4Double::Load4Val(pChunk + j + 4 + 3 * nStride) * w3;
15,405,900✔
2776
    }
2777
    for (; i < nSrcLineCount; ++i, j += nStride)
30,245,600✔
2778
    {
2779
        XMMReg4Double w = XMMReg4Double::Load1ValHighAndLow(padfWeights + i);
11,412,000✔
2780
        v_acc0 += XMMReg4Double::Load4Val(pChunk + j + 0) * w;
11,411,900✔
2781
        v_acc1 += XMMReg4Double::Load4Val(pChunk + j + 4) * w;
11,412,000✔
2782
    }
2783
    v_acc0.Store4Val(afDest);
18,833,700✔
2784
    v_acc1.Store4Val(afDest + 4);
18,805,800✔
2785
}
18,828,300✔
2786

2787
template <class T>
2788
static inline void GDALResampleConvolutionVertical_8cols(const T *, int,
2789
                                                         const double *, int,
2790
                                                         double *)
2791
{
2792
    // Cannot be reached
2793
    CPLAssert(false);
2794
}
2795

2796
#endif  // __AVX__
2797

2798
/************************************************************************/
2799
/*              GDALResampleConvolutionHorizontalSSE2<T>                */
2800
/************************************************************************/
2801

2802
template <class T>
2803
static inline double GDALResampleConvolutionHorizontalSSE2(
2,752,301✔
2804
    const T *pChunk, const double *padfWeightsAligned, int nSrcPixelCount)
2805
{
2806
    XMMReg4Double v_acc1 = XMMReg4Double::Zero();
2,752,301✔
2807
    XMMReg4Double v_acc2 = XMMReg4Double::Zero();
2,751,706✔
2808
    int i = 0;  // Used after for.
2,751,971✔
2809
    for (; i + 7 < nSrcPixelCount; i += 8)
2,828,145✔
2810
    {
2811
        // Retrieve the pixel & accumulate
2812
        const XMMReg4Double v_pixels1 = XMMReg4Double::Load4Val(pChunk + i);
76,095✔
2813
        const XMMReg4Double v_pixels2 = XMMReg4Double::Load4Val(pChunk + i + 4);
76,095✔
2814
        const XMMReg4Double v_weight1 =
76,095✔
2815
            XMMReg4Double::Load4ValAligned(padfWeightsAligned + i);
76,095✔
2816
        const XMMReg4Double v_weight2 =
76,095✔
2817
            XMMReg4Double::Load4ValAligned(padfWeightsAligned + i + 4);
76,095✔
2818

2819
        v_acc1 += v_pixels1 * v_weight1;
76,095✔
2820
        v_acc2 += v_pixels2 * v_weight2;
76,095✔
2821
    }
2822

2823
    v_acc1 += v_acc2;
2,752,053✔
2824

2825
    double dfVal = v_acc1.GetHorizSum();
2,751,752✔
2826
    for (; i < nSrcPixelCount; ++i)
9,572,570✔
2827
    {
2828
        dfVal += pChunk[i] * padfWeightsAligned[i];
6,820,810✔
2829
    }
2830
    return dfVal;
2,751,755✔
2831
}
2832

2833
/************************************************************************/
2834
/*              GDALResampleConvolutionHorizontal<GByte>                */
2835
/************************************************************************/
2836

2837
template <>
2838
inline double GDALResampleConvolutionHorizontal<GByte>(
2,203,930✔
2839
    const GByte *pChunk, const double *padfWeightsAligned, int nSrcPixelCount)
2840
{
2841
    return GDALResampleConvolutionHorizontalSSE2(pChunk, padfWeightsAligned,
2,203,930✔
2842
                                                 nSrcPixelCount);
2,203,940✔
2843
}
2844

2845
template <>
2846
inline double GDALResampleConvolutionHorizontal<GUInt16>(
548,483✔
2847
    const GUInt16 *pChunk, const double *padfWeightsAligned, int nSrcPixelCount)
2848
{
2849
    return GDALResampleConvolutionHorizontalSSE2(pChunk, padfWeightsAligned,
548,483✔
2850
                                                 nSrcPixelCount);
548,605✔
2851
}
2852

2853
/************************************************************************/
2854
/*              GDALResampleConvolutionHorizontalWithMaskSSE2<T>        */
2855
/************************************************************************/
2856

2857
template <class T>
2858
static inline void GDALResampleConvolutionHorizontalWithMaskSSE2(
5,806,833✔
2859
    const T *pChunk, const GByte *pabyMask, const double *padfWeightsAligned,
2860
    int nSrcPixelCount, double &dfVal, double &dfWeightSum)
2861
{
2862
    int i = 0;  // Used after for.
5,806,833✔
2863
    XMMReg4Double v_acc = XMMReg4Double::Zero();
5,806,833✔
2864
    XMMReg4Double v_acc_weight = XMMReg4Double::Zero();
5,806,833✔
2865
    for (; i + 3 < nSrcPixelCount; i += 4)
16,456,921✔
2866
    {
2867
        const XMMReg4Double v_pixels = XMMReg4Double::Load4Val(pChunk + i);
10,650,058✔
2868
        const XMMReg4Double v_mask = XMMReg4Double::Load4Val(pabyMask + i);
10,650,058✔
2869
        XMMReg4Double v_weight =
10,650,058✔
2870
            XMMReg4Double::Load4ValAligned(padfWeightsAligned + i);
10,650,058✔
2871
        v_weight *= v_mask;
10,650,058✔
2872
        v_acc += v_pixels * v_weight;
10,650,058✔
2873
        v_acc_weight += v_weight;
10,650,058✔
2874
    }
2875

2876
    dfVal = v_acc.GetHorizSum();
5,806,833✔
2877
    dfWeightSum = v_acc_weight.GetHorizSum();
5,806,833✔
2878
    for (; i < nSrcPixelCount; ++i)
6,005,033✔
2879
    {
2880
        const double dfWeight = padfWeightsAligned[i] * pabyMask[i];
198,202✔
2881
        dfVal += pChunk[i] * dfWeight;
198,202✔
2882
        dfWeightSum += dfWeight;
198,202✔
2883
    }
2884
}
5,806,833✔
2885

2886
/************************************************************************/
2887
/*              GDALResampleConvolutionHorizontalWithMask<GByte>        */
2888
/************************************************************************/
2889

2890
template <>
2891
inline void GDALResampleConvolutionHorizontalWithMask<GByte>(
5,806,770✔
2892
    const GByte *pChunk, const GByte *pabyMask,
2893
    const double *padfWeightsAligned, int nSrcPixelCount, double &dfVal,
2894
    double &dfWeightSum)
2895
{
2896
    GDALResampleConvolutionHorizontalWithMaskSSE2(
5,806,770✔
2897
        pChunk, pabyMask, padfWeightsAligned, nSrcPixelCount, dfVal,
2898
        dfWeightSum);
2899
}
5,806,770✔
2900

2901
template <>
2902
inline void GDALResampleConvolutionHorizontalWithMask<GUInt16>(
63✔
2903
    const GUInt16 *pChunk, const GByte *pabyMask,
2904
    const double *padfWeightsAligned, int nSrcPixelCount, double &dfVal,
2905
    double &dfWeightSum)
2906
{
2907
    GDALResampleConvolutionHorizontalWithMaskSSE2(
63✔
2908
        pChunk, pabyMask, padfWeightsAligned, nSrcPixelCount, dfVal,
2909
        dfWeightSum);
2910
}
63✔
2911

2912
/************************************************************************/
2913
/*              GDALResampleConvolutionHorizontal_3rows_SSE2<T>         */
2914
/************************************************************************/
2915

2916
template <class T>
2917
static inline void GDALResampleConvolutionHorizontal_3rows_SSE2(
10,026,430✔
2918
    const T *pChunkRow1, const T *pChunkRow2, const T *pChunkRow3,
2919
    const double *padfWeightsAligned, int nSrcPixelCount, double &dfRes1,
2920
    double &dfRes2, double &dfRes3)
2921
{
2922
    XMMReg4Double v_acc1 = XMMReg4Double::Zero(),
10,026,430✔
2923
                  v_acc2 = XMMReg4Double::Zero(),
10,026,430✔
2924
                  v_acc3 = XMMReg4Double::Zero();
10,026,430✔
2925
    int i = 0;
10,026,430✔
2926
    for (; i + 7 < nSrcPixelCount; i += 8)
19,995,066✔
2927
    {
2928
        // Retrieve the pixel & accumulate.
2929
        XMMReg4Double v_pixels1 = XMMReg4Double::Load4Val(pChunkRow1 + i);
9,968,656✔
2930
        XMMReg4Double v_pixels2 = XMMReg4Double::Load4Val(pChunkRow1 + i + 4);
9,968,656✔
2931
        const XMMReg4Double v_weight1 =
9,968,656✔
2932
            XMMReg4Double::Load4ValAligned(padfWeightsAligned + i);
9,968,656✔
2933
        const XMMReg4Double v_weight2 =
9,968,656✔
2934
            XMMReg4Double::Load4ValAligned(padfWeightsAligned + i + 4);
9,968,656✔
2935

2936
        v_acc1 += v_pixels1 * v_weight1;
9,968,656✔
2937
        v_acc1 += v_pixels2 * v_weight2;
9,968,656✔
2938

2939
        v_pixels1 = XMMReg4Double::Load4Val(pChunkRow2 + i);
9,968,656✔
2940
        v_pixels2 = XMMReg4Double::Load4Val(pChunkRow2 + i + 4);
9,968,656✔
2941
        v_acc2 += v_pixels1 * v_weight1;
9,968,656✔
2942
        v_acc2 += v_pixels2 * v_weight2;
9,968,656✔
2943

2944
        v_pixels1 = XMMReg4Double::Load4Val(pChunkRow3 + i);
9,968,656✔
2945
        v_pixels2 = XMMReg4Double::Load4Val(pChunkRow3 + i + 4);
9,968,656✔
2946
        v_acc3 += v_pixels1 * v_weight1;
9,968,656✔
2947
        v_acc3 += v_pixels2 * v_weight2;
9,968,656✔
2948
    }
2949

2950
    dfRes1 = v_acc1.GetHorizSum();
10,026,430✔
2951
    dfRes2 = v_acc2.GetHorizSum();
10,026,430✔
2952
    dfRes3 = v_acc3.GetHorizSum();
10,026,430✔
2953
    for (; i < nSrcPixelCount; ++i)
21,493,126✔
2954
    {
2955
        dfRes1 += pChunkRow1[i] * padfWeightsAligned[i];
11,466,796✔
2956
        dfRes2 += pChunkRow2[i] * padfWeightsAligned[i];
11,466,796✔
2957
        dfRes3 += pChunkRow3[i] * padfWeightsAligned[i];
11,466,796✔
2958
    }
2959
}
10,026,430✔
2960

2961
/************************************************************************/
2962
/*              GDALResampleConvolutionHorizontal_3rows<GByte>          */
2963
/************************************************************************/
2964

2965
template <>
2966
inline void GDALResampleConvolutionHorizontal_3rows<GByte>(
10,026,400✔
2967
    const GByte *pChunkRow1, const GByte *pChunkRow2, const GByte *pChunkRow3,
2968
    const double *padfWeightsAligned, int nSrcPixelCount, double &dfRes1,
2969
    double &dfRes2, double &dfRes3)
2970
{
2971
    GDALResampleConvolutionHorizontal_3rows_SSE2(
10,026,400✔
2972
        pChunkRow1, pChunkRow2, pChunkRow3, padfWeightsAligned, nSrcPixelCount,
2973
        dfRes1, dfRes2, dfRes3);
2974
}
10,026,400✔
2975

2976
template <>
2977
inline void GDALResampleConvolutionHorizontal_3rows<GUInt16>(
30✔
2978
    const GUInt16 *pChunkRow1, const GUInt16 *pChunkRow2,
2979
    const GUInt16 *pChunkRow3, const double *padfWeightsAligned,
2980
    int nSrcPixelCount, double &dfRes1, double &dfRes2, double &dfRes3)
2981
{
2982
    GDALResampleConvolutionHorizontal_3rows_SSE2(
30✔
2983
        pChunkRow1, pChunkRow2, pChunkRow3, padfWeightsAligned, nSrcPixelCount,
2984
        dfRes1, dfRes2, dfRes3);
2985
}
30✔
2986

2987
/************************************************************************/
2988
/*     GDALResampleConvolutionHorizontalPixelCountLess8_3rows_SSE2<T>   */
2989
/************************************************************************/
2990

2991
template <class T>
2992
static inline void GDALResampleConvolutionHorizontalPixelCountLess8_3rows_SSE2(
2,175,321✔
2993
    const T *pChunkRow1, const T *pChunkRow2, const T *pChunkRow3,
2994
    const double *padfWeightsAligned, int nSrcPixelCount, double &dfRes1,
2995
    double &dfRes2, double &dfRes3)
2996
{
2997
    XMMReg4Double v_acc1 = XMMReg4Double::Zero();
2,175,321✔
2998
    XMMReg4Double v_acc2 = XMMReg4Double::Zero();
2,175,130✔
2999
    XMMReg4Double v_acc3 = XMMReg4Double::Zero();
2,175,305✔
3000
    int i = 0;  // Use after for.
2,175,307✔
3001
    for (; i + 3 < nSrcPixelCount; i += 4)
2,178,625✔
3002
    {
3003
        // Retrieve the pixel & accumulate.
3004
        const XMMReg4Double v_pixels1 = XMMReg4Double::Load4Val(pChunkRow1 + i);
3,284✔
3005
        const XMMReg4Double v_pixels2 = XMMReg4Double::Load4Val(pChunkRow2 + i);
3,284✔
3006
        const XMMReg4Double v_pixels3 = XMMReg4Double::Load4Val(pChunkRow3 + i);
3,284✔
3007
        const XMMReg4Double v_weight =
3,284✔
3008
            XMMReg4Double::Load4ValAligned(padfWeightsAligned + i);
3,284✔
3009

3010
        v_acc1 += v_pixels1 * v_weight;
3,284✔
3011
        v_acc2 += v_pixels2 * v_weight;
3,284✔
3012
        v_acc3 += v_pixels3 * v_weight;
3,284✔
3013
    }
3014

3015
    dfRes1 = v_acc1.GetHorizSum();
2,175,345✔
3016
    dfRes2 = v_acc2.GetHorizSum();
2,175,164✔
3017
    dfRes3 = v_acc3.GetHorizSum();
2,175,172✔
3018

3019
    for (; i < nSrcPixelCount; ++i)
6,502,277✔
3020
    {
3021
        dfRes1 += pChunkRow1[i] * padfWeightsAligned[i];
4,327,071✔
3022
        dfRes2 += pChunkRow2[i] * padfWeightsAligned[i];
4,327,071✔
3023
        dfRes3 += pChunkRow3[i] * padfWeightsAligned[i];
4,327,071✔
3024
    }
3025
}
2,175,216✔
3026

3027
/************************************************************************/
3028
/*     GDALResampleConvolutionHorizontalPixelCountLess8_3rows<GByte>    */
3029
/************************************************************************/
3030

3031
template <>
3032
inline void GDALResampleConvolutionHorizontalPixelCountLess8_3rows<GByte>(
2,108,500✔
3033
    const GByte *pChunkRow1, const GByte *pChunkRow2, const GByte *pChunkRow3,
3034
    const double *padfWeightsAligned, int nSrcPixelCount, double &dfRes1,
3035
    double &dfRes2, double &dfRes3)
3036
{
3037
    GDALResampleConvolutionHorizontalPixelCountLess8_3rows_SSE2(
2,108,500✔
3038
        pChunkRow1, pChunkRow2, pChunkRow3, padfWeightsAligned, nSrcPixelCount,
3039
        dfRes1, dfRes2, dfRes3);
3040
}
2,108,530✔
3041

3042
template <>
3043
inline void GDALResampleConvolutionHorizontalPixelCountLess8_3rows<GUInt16>(
66,593✔
3044
    const GUInt16 *pChunkRow1, const GUInt16 *pChunkRow2,
3045
    const GUInt16 *pChunkRow3, const double *padfWeightsAligned,
3046
    int nSrcPixelCount, double &dfRes1, double &dfRes2, double &dfRes3)
3047
{
3048
    GDALResampleConvolutionHorizontalPixelCountLess8_3rows_SSE2(
66,593✔
3049
        pChunkRow1, pChunkRow2, pChunkRow3, padfWeightsAligned, nSrcPixelCount,
3050
        dfRes1, dfRes2, dfRes3);
3051
}
66,946✔
3052

3053
/************************************************************************/
3054
/*     GDALResampleConvolutionHorizontalPixelCount4_3rows_SSE2<T>       */
3055
/************************************************************************/
3056

3057
template <class T>
3058
static inline void GDALResampleConvolutionHorizontalPixelCount4_3rows_SSE2(
12,561,520✔
3059
    const T *pChunkRow1, const T *pChunkRow2, const T *pChunkRow3,
3060
    const double *padfWeightsAligned, double &dfRes1, double &dfRes2,
3061
    double &dfRes3)
3062
{
3063
    const XMMReg4Double v_weight =
12,561,520✔
3064
        XMMReg4Double::Load4ValAligned(padfWeightsAligned);
3065

3066
    // Retrieve the pixel & accumulate.
3067
    const XMMReg4Double v_pixels1 = XMMReg4Double::Load4Val(pChunkRow1);
12,457,830✔
3068
    const XMMReg4Double v_pixels2 = XMMReg4Double::Load4Val(pChunkRow2);
12,579,280✔
3069
    const XMMReg4Double v_pixels3 = XMMReg4Double::Load4Val(pChunkRow3);
12,540,410✔
3070

3071
    XMMReg4Double v_acc1 = v_pixels1 * v_weight;
12,581,900✔
3072
    XMMReg4Double v_acc2 = v_pixels2 * v_weight;
12,495,170✔
3073
    XMMReg4Double v_acc3 = v_pixels3 * v_weight;
12,496,510✔
3074

3075
    dfRes1 = v_acc1.GetHorizSum();
12,517,940✔
3076
    dfRes2 = v_acc2.GetHorizSum();
12,489,370✔
3077
    dfRes3 = v_acc3.GetHorizSum();
12,521,200✔
3078
}
12,514,330✔
3079

3080
/************************************************************************/
3081
/*       GDALResampleConvolutionHorizontalPixelCount4_3rows<GByte>      */
3082
/************************************************************************/
3083

3084
template <>
3085
inline void GDALResampleConvolutionHorizontalPixelCount4_3rows<GByte>(
6,958,540✔
3086
    const GByte *pChunkRow1, const GByte *pChunkRow2, const GByte *pChunkRow3,
3087
    const double *padfWeightsAligned, double &dfRes1, double &dfRes2,
3088
    double &dfRes3)
3089
{
3090
    GDALResampleConvolutionHorizontalPixelCount4_3rows_SSE2(
6,958,540✔
3091
        pChunkRow1, pChunkRow2, pChunkRow3, padfWeightsAligned, dfRes1, dfRes2,
3092
        dfRes3);
3093
}
6,934,960✔
3094

3095
template <>
3096
inline void GDALResampleConvolutionHorizontalPixelCount4_3rows<GUInt16>(
5,597,730✔
3097
    const GUInt16 *pChunkRow1, const GUInt16 *pChunkRow2,
3098
    const GUInt16 *pChunkRow3, const double *padfWeightsAligned, double &dfRes1,
3099
    double &dfRes2, double &dfRes3)
3100
{
3101
    GDALResampleConvolutionHorizontalPixelCount4_3rows_SSE2(
5,597,730✔
3102
        pChunkRow1, pChunkRow2, pChunkRow3, padfWeightsAligned, dfRes1, dfRes2,
3103
        dfRes3);
3104
}
5,572,030✔
3105

3106
#endif  // USE_SSE2
3107

3108
/************************************************************************/
3109
/*                    GDALResampleChunk_ConvolutionT()                  */
3110
/************************************************************************/
3111

3112
template <class T, class Twork, GDALDataType eWrkDataType>
3113
static CPLErr GDALResampleChunk_ConvolutionT(
3,733✔
3114
    const GDALOverviewResampleArgs &args, const T *pChunk, void *pDstBuffer,
3115
    FilterFuncType pfnFilterFunc, FilterFunc4ValuesType pfnFilterFunc4Values,
3116
    int nKernelRadius, bool bKernelWithNegativeWeights, float fMaxVal)
3117

3118
{
3119
    const double dfXRatioDstToSrc = args.dfXRatioDstToSrc;
3,733✔
3120
    const double dfYRatioDstToSrc = args.dfYRatioDstToSrc;
3,733✔
3121
    const double dfSrcXDelta = args.dfSrcXDelta;
3,733✔
3122
    const double dfSrcYDelta = args.dfSrcYDelta;
3,733✔
3123
    constexpr int nBands = 1;
3,733✔
3124
    const GByte *pabyChunkNodataMask = args.pabyChunkNodataMask;
3,733✔
3125
    const int nChunkXOff = args.nChunkXOff;
3,733✔
3126
    const int nChunkXSize = args.nChunkXSize;
3,733✔
3127
    const int nChunkYOff = args.nChunkYOff;
3,733✔
3128
    const int nChunkYSize = args.nChunkYSize;
3,733✔
3129
    const int nDstXOff = args.nDstXOff;
3,733✔
3130
    const int nDstXOff2 = args.nDstXOff2;
3,733✔
3131
    const int nDstYOff = args.nDstYOff;
3,733✔
3132
    const int nDstYOff2 = args.nDstYOff2;
3,733✔
3133
    const bool bHasNoData = args.bHasNoData;
3,733✔
3134
    double dfNoDataValue = args.dfNoDataValue;
3,733✔
3135

3136
    if (!bHasNoData)
3,733✔
3137
        dfNoDataValue = 0.0;
3,685✔
3138
    const auto dstDataType = args.eOvrDataType;
3,733✔
3139
    const int nDstDataTypeSize = GDALGetDataTypeSizeBytes(dstDataType);
3,733✔
3140
    const double dfReplacementVal =
3,733✔
3141
        bHasNoData ? GDALGetNoDataReplacementValue(dstDataType, dfNoDataValue)
46✔
3142
                   : dfNoDataValue;
3143
    // cppcheck-suppress unreadVariable
3144
    const int isIntegerDT = GDALDataTypeIsInteger(dstDataType);
3,733✔
3145
    const bool bNoDataValueInt64Valid =
3,717✔
3146
        isIntegerDT && GDALIsValueExactAs<GInt64>(dfNoDataValue);
3,729✔
3147
    const auto nNodataValueInt64 =
3,717✔
3148
        bNoDataValueInt64Valid ? static_cast<GInt64>(dfNoDataValue) : 0;
3149
    constexpr int nWrkDataTypeSize = static_cast<int>(sizeof(Twork));
3,717✔
3150

3151
    // TODO: we should have some generic function to do this.
3152
    Twork fDstMin = cpl::NumericLimits<Twork>::lowest();
3,717✔
3153
    Twork fDstMax = cpl::NumericLimits<Twork>::max();
3,717✔
3154
    if (dstDataType == GDT_Byte)
3,717✔
3155
    {
3156
        fDstMin = std::numeric_limits<GByte>::min();
3,009✔
3157
        fDstMax = std::numeric_limits<GByte>::max();
3,008✔
3158
    }
3159
    else if (dstDataType == GDT_Int8)
712✔
3160
    {
3161
        fDstMin = std::numeric_limits<GInt8>::min();
1✔
3162
        fDstMax = std::numeric_limits<GInt8>::max();
1✔
3163
    }
3164
    else if (dstDataType == GDT_UInt16)
711✔
3165
    {
3166
        fDstMin = std::numeric_limits<GUInt16>::min();
388✔
3167
        fDstMax = std::numeric_limits<GUInt16>::max();
384✔
3168
    }
3169
    else if (dstDataType == GDT_Int16)
329✔
3170
    {
3171
        fDstMin = std::numeric_limits<GInt16>::min();
279✔
3172
        fDstMax = std::numeric_limits<GInt16>::max();
279✔
3173
    }
3174
    else if (dstDataType == GDT_UInt32)
50✔
3175
    {
3176
        fDstMin = static_cast<Twork>(std::numeric_limits<GUInt32>::min());
1✔
3177
        fDstMax = static_cast<Twork>(std::numeric_limits<GUInt32>::max());
1✔
3178
    }
3179
    else if (dstDataType == GDT_Int32)
49✔
3180
    {
3181
        // cppcheck-suppress unreadVariable
3182
        fDstMin = static_cast<Twork>(std::numeric_limits<GInt32>::min());
2✔
3183
        // cppcheck-suppress unreadVariable
3184
        fDstMax = static_cast<Twork>(std::numeric_limits<GInt32>::max());
2✔
3185
    }
3186
    else if (dstDataType == GDT_UInt64)
47✔
3187
    {
3188
        // cppcheck-suppress unreadVariable
3189
        fDstMin = static_cast<Twork>(std::numeric_limits<uint64_t>::min());
1✔
3190
        // cppcheck-suppress unreadVariable
3191
        fDstMax = static_cast<Twork>(std::numeric_limits<uint64_t>::max());
1✔
3192
    }
3193
    else if (dstDataType == GDT_Int64)
46✔
3194
    {
3195
        // cppcheck-suppress unreadVariable
3196
        fDstMin = static_cast<Twork>(std::numeric_limits<int64_t>::min());
1✔
3197
        // cppcheck-suppress unreadVariable
3198
        fDstMax = static_cast<Twork>(std::numeric_limits<int64_t>::max());
1✔
3199
    }
3200

3201
    auto replaceValIfNodata = [bHasNoData, isIntegerDT, fDstMin, fDstMax,
30,807,166✔
3202
                               bNoDataValueInt64Valid, nNodataValueInt64,
3203
                               dfNoDataValue, dfReplacementVal](Twork fVal)
3204
    {
3205
        if (!bHasNoData)
14,670,600✔
3206
            return fVal;
11,444,200✔
3207

3208
        // Clamp value before comparing to nodata: this is only needed for
3209
        // kernels with negative weights (Lanczos)
3210
        Twork fClamped = fVal;
3,226,380✔
3211
        if (fClamped < fDstMin)
3,226,380✔
3212
            fClamped = fDstMin;
12,874✔
3213
        else if (fClamped > fDstMax)
3,213,500✔
3214
            fClamped = fDstMax;
12,852✔
3215
        if (isIntegerDT)
3,226,380✔
3216
        {
3217
            if (bNoDataValueInt64Valid &&
6,452,730✔
3218
                nNodataValueInt64 == static_cast<GInt64>(std::round(fClamped)))
3,226,370✔
3219
            {
3220
                // Do not use the nodata value
3221
                return static_cast<Twork>(dfReplacementVal);
13,869✔
3222
            }
3223
        }
3224
        else if (dfNoDataValue == fClamped)
9✔
3225
        {
3226
            // Do not use the nodata value
3227
            return static_cast<Twork>(dfReplacementVal);
1✔
3228
        }
3229
        return fClamped;
3,212,510✔
3230
    };
3231

3232
    /* -------------------------------------------------------------------- */
3233
    /*      Allocate work buffers.                                          */
3234
    /* -------------------------------------------------------------------- */
3235
    const int nDstXSize = nDstXOff2 - nDstXOff;
3,717✔
3236
    Twork *pafWrkScanline = nullptr;
3,717✔
3237
    if (dstDataType != eWrkDataType)
3,717✔
3238
    {
3239
        pafWrkScanline =
3240
            static_cast<Twork *>(VSI_MALLOC2_VERBOSE(nDstXSize, sizeof(Twork)));
3,673✔
3241
        if (pafWrkScanline == nullptr)
3,687✔
3242
            return CE_Failure;
×
3243
    }
3244

3245
    const double dfXScale = 1.0 / dfXRatioDstToSrc;
3,731✔
3246
    const double dfXScaleWeight = (dfXScale >= 1.0) ? 1.0 : dfXScale;
3,731✔
3247
    const double dfXScaledRadius = nKernelRadius / dfXScaleWeight;
3,731✔
3248
    const double dfYScale = 1.0 / dfYRatioDstToSrc;
3,731✔
3249
    const double dfYScaleWeight = (dfYScale >= 1.0) ? 1.0 : dfYScale;
3,731✔
3250
    const double dfYScaledRadius = nKernelRadius / dfYScaleWeight;
3,731✔
3251

3252
    // Temporary array to store result of horizontal filter.
3253
    double *padfHorizontalFiltered = static_cast<double *>(
3254
        VSI_MALLOC3_VERBOSE(nChunkYSize, nDstXSize, sizeof(double) * nBands));
3,731✔
3255

3256
    // To store convolution coefficients.
3257
    double *padfWeights = static_cast<double *>(VSI_MALLOC_ALIGNED_AUTO_VERBOSE(
3,735✔
3258
        static_cast<int>(2 + 2 * std::max(dfXScaledRadius, dfYScaledRadius) +
3259
                         0.5) *
3260
        sizeof(double)));
3261

3262
    GByte *pabyChunkNodataMaskHorizontalFiltered = nullptr;
3,734✔
3263
    if (pabyChunkNodataMask)
3,734✔
3264
        pabyChunkNodataMaskHorizontalFiltered =
3265
            static_cast<GByte *>(VSI_MALLOC2_VERBOSE(nChunkYSize, nDstXSize));
401✔
3266
    if (padfHorizontalFiltered == nullptr || padfWeights == nullptr ||
3,734✔
3267
        (pabyChunkNodataMask != nullptr &&
401✔
3268
         pabyChunkNodataMaskHorizontalFiltered == nullptr))
3269
    {
3270
        VSIFree(pafWrkScanline);
3✔
3271
        VSIFree(padfHorizontalFiltered);
×
3272
        VSIFreeAligned(padfWeights);
×
3273
        VSIFree(pabyChunkNodataMaskHorizontalFiltered);
×
3274
        return CE_Failure;
×
3275
    }
3276

3277
    /* ==================================================================== */
3278
    /*      First pass: horizontal filter                                   */
3279
    /* ==================================================================== */
3280
    const int nChunkRightXOff = nChunkXOff + nChunkXSize;
3,731✔
3281
#ifdef USE_SSE2
3282
    bool bSrcPixelCountLess8 = dfXScaledRadius < 4;
3,731✔
3283
#endif
3284
    for (int iDstPixel = nDstXOff; iDstPixel < nDstXOff2; ++iDstPixel)
2,741,032✔
3285
    {
3286
        const double dfSrcPixel =
2,737,296✔
3287
            (iDstPixel + 0.5) * dfXRatioDstToSrc + dfSrcXDelta;
2,737,296✔
3288
        int nSrcPixelStart =
2,737,296✔
3289
            static_cast<int>(floor(dfSrcPixel - dfXScaledRadius + 0.5));
2,737,296✔
3290
        if (nSrcPixelStart < nChunkXOff)
2,737,296✔
3291
            nSrcPixelStart = nChunkXOff;
55,221✔
3292
        int nSrcPixelStop =
2,737,296✔
3293
            static_cast<int>(dfSrcPixel + dfXScaledRadius + 0.5);
2,737,296✔
3294
        if (nSrcPixelStop > nChunkRightXOff)
2,737,296✔
3295
            nSrcPixelStop = nChunkRightXOff;
55,239✔
3296
#if 0
3297
        if( nSrcPixelStart < nChunkXOff && nChunkXOff > 0 )
3298
        {
3299
            printf( "truncated iDstPixel = %d\n", iDstPixel );/*ok*/
3300
        }
3301
        if( nSrcPixelStop > nChunkRightXOff && nChunkRightXOff < nSrcWidth )
3302
        {
3303
            printf( "truncated iDstPixel = %d\n", iDstPixel );/*ok*/
3304
        }
3305
#endif
3306
        const int nSrcPixelCount = nSrcPixelStop - nSrcPixelStart;
2,737,296✔
3307
        double dfWeightSum = 0.0;
2,737,296✔
3308

3309
        // Compute convolution coefficients.
3310
        int nSrcPixel = nSrcPixelStart;
2,737,296✔
3311
        double dfX = dfXScaleWeight * (nSrcPixel - dfSrcPixel + 0.5);
2,737,296✔
3312
        for (; nSrcPixel + 3 < nSrcPixelStop; nSrcPixel += 4)
3,599,498✔
3313
        {
3314
            padfWeights[nSrcPixel - nSrcPixelStart] = dfX;
862,657✔
3315
            dfX += dfXScaleWeight;
862,657✔
3316
            padfWeights[nSrcPixel + 1 - nSrcPixelStart] = dfX;
862,657✔
3317
            dfX += dfXScaleWeight;
862,657✔
3318
            padfWeights[nSrcPixel + 2 - nSrcPixelStart] = dfX;
862,657✔
3319
            dfX += dfXScaleWeight;
862,657✔
3320
            padfWeights[nSrcPixel + 3 - nSrcPixelStart] = dfX;
862,657✔
3321
            dfX += dfXScaleWeight;
862,657✔
3322
            dfWeightSum +=
862,206✔
3323
                pfnFilterFunc4Values(padfWeights + nSrcPixel - nSrcPixelStart);
862,657✔
3324
        }
3325
        for (; nSrcPixel < nSrcPixelStop; ++nSrcPixel, dfX += dfXScaleWeight)
6,718,624✔
3326
        {
3327
            const double dfWeight = pfnFilterFunc(dfX);
3,981,933✔
3328
            padfWeights[nSrcPixel - nSrcPixelStart] = dfWeight;
3,981,784✔
3329
            dfWeightSum += dfWeight;
3,981,784✔
3330
        }
3331

3332
        const int nHeight = nChunkYSize * nBands;
2,736,691✔
3333
        if (pabyChunkNodataMask == nullptr)
2,736,691✔
3334
        {
3335
            if (dfWeightSum != 0)
2,664,702✔
3336
            {
3337
                const double dfInvWeightSum = 1.0 / dfWeightSum;
2,664,723✔
3338
                for (int i = 0; i < nSrcPixelCount; ++i)
9,536,533✔
3339
                    padfWeights[i] *= dfInvWeightSum;
6,871,795✔
3340
            }
3341
            int iSrcLineOff = 0;
2,664,702✔
3342
#ifdef USE_SSE2
3343
            if (nSrcPixelCount == 4)
2,664,702✔
3344
            {
3345
                for (; iSrcLineOff + 2 < nHeight; iSrcLineOff += 3)
14,334,496✔
3346
                {
3347
                    const GPtrDiff_t j =
13,775,086✔
3348
                        static_cast<GPtrDiff_t>(iSrcLineOff) * nChunkXSize +
13,775,086✔
3349
                        (nSrcPixelStart - nChunkXOff);
13,775,086✔
3350
                    double dfVal1 = 0.0;
13,775,086✔
3351
                    double dfVal2 = 0.0;
13,775,086✔
3352
                    double dfVal3 = 0.0;
13,775,086✔
3353
                    GDALResampleConvolutionHorizontalPixelCount4_3rows(
13,775,086✔
3354
                        pChunk + j, pChunk + j + nChunkXSize,
13,775,086✔
3355
                        pChunk + j + 2 * nChunkXSize, padfWeights, dfVal1,
13,775,086✔
3356
                        dfVal2, dfVal3);
3357
                    padfHorizontalFiltered[static_cast<size_t>(iSrcLineOff) *
13,783,256✔
3358
                                               nDstXSize +
13,783,256✔
3359
                                           iDstPixel - nDstXOff] = dfVal1;
13,783,256✔
3360
                    padfHorizontalFiltered[(static_cast<size_t>(iSrcLineOff) +
13,783,256✔
3361
                                            1) *
13,783,256✔
3362
                                               nDstXSize +
13,783,256✔
3363
                                           iDstPixel - nDstXOff] = dfVal2;
13,783,256✔
3364
                    padfHorizontalFiltered[(static_cast<size_t>(iSrcLineOff) +
13,783,256✔
3365
                                            2) *
13,783,256✔
3366
                                               nDstXSize +
13,783,256✔
3367
                                           iDstPixel - nDstXOff] = dfVal3;
13,783,256✔
3368
                }
3369
            }
3370
            else if (bSrcPixelCountLess8)
2,113,466✔
3371
            {
3372
                for (; iSrcLineOff + 2 < nHeight; iSrcLineOff += 3)
4,228,463✔
3373
                {
3374
                    const GPtrDiff_t j =
2,193,341✔
3375
                        static_cast<GPtrDiff_t>(iSrcLineOff) * nChunkXSize +
2,193,341✔
3376
                        (nSrcPixelStart - nChunkXOff);
2,193,341✔
3377
                    double dfVal1 = 0.0;
2,193,341✔
3378
                    double dfVal2 = 0.0;
2,193,341✔
3379
                    double dfVal3 = 0.0;
2,193,341✔
3380
                    GDALResampleConvolutionHorizontalPixelCountLess8_3rows(
2,193,341✔
3381
                        pChunk + j, pChunk + j + nChunkXSize,
2,193,341✔
3382
                        pChunk + j + 2 * nChunkXSize, padfWeights,
2,193,341✔
3383
                        nSrcPixelCount, dfVal1, dfVal2, dfVal3);
3384
                    padfHorizontalFiltered[static_cast<size_t>(iSrcLineOff) *
2,193,630✔
3385
                                               nDstXSize +
2,193,630✔
3386
                                           iDstPixel - nDstXOff] = dfVal1;
2,193,630✔
3387
                    padfHorizontalFiltered[(static_cast<size_t>(iSrcLineOff) +
2,193,630✔
3388
                                            1) *
2,193,630✔
3389
                                               nDstXSize +
2,193,630✔
3390
                                           iDstPixel - nDstXOff] = dfVal2;
2,193,630✔
3391
                    padfHorizontalFiltered[(static_cast<size_t>(iSrcLineOff) +
2,193,630✔
3392
                                            2) *
2,193,630✔
3393
                                               nDstXSize +
2,193,630✔
3394
                                           iDstPixel - nDstXOff] = dfVal3;
2,193,630✔
3395
                }
3396
            }
3397
            else
3398
#endif
3399
            {
3400
                for (; iSrcLineOff + 2 < nHeight; iSrcLineOff += 3)
10,169,833✔
3401
                {
3402
                    const GPtrDiff_t j =
10,091,230✔
3403
                        static_cast<GPtrDiff_t>(iSrcLineOff) * nChunkXSize +
10,091,230✔
3404
                        (nSrcPixelStart - nChunkXOff);
10,091,230✔
3405
                    double dfVal1 = 0.0;
10,091,230✔
3406
                    double dfVal2 = 0.0;
10,091,230✔
3407
                    double dfVal3 = 0.0;
10,091,230✔
3408
                    GDALResampleConvolutionHorizontal_3rows(
10,091,230✔
3409
                        pChunk + j, pChunk + j + nChunkXSize,
10,091,230✔
3410
                        pChunk + j + 2 * nChunkXSize, padfWeights,
10,091,230✔
3411
                        nSrcPixelCount, dfVal1, dfVal2, dfVal3);
3412
                    padfHorizontalFiltered[static_cast<size_t>(iSrcLineOff) *
10,091,230✔
3413
                                               nDstXSize +
10,091,230✔
3414
                                           iDstPixel - nDstXOff] = dfVal1;
10,091,230✔
3415
                    padfHorizontalFiltered[(static_cast<size_t>(iSrcLineOff) +
10,091,230✔
3416
                                            1) *
10,091,230✔
3417
                                               nDstXSize +
10,091,230✔
3418
                                           iDstPixel - nDstXOff] = dfVal2;
10,091,230✔
3419
                    padfHorizontalFiltered[(static_cast<size_t>(iSrcLineOff) +
10,091,230✔
3420
                                            2) *
10,091,230✔
3421
                                               nDstXSize +
10,091,230✔
3422
                                           iDstPixel - nDstXOff] = dfVal3;
10,091,230✔
3423
                }
3424
            }
3425
            for (; iSrcLineOff < nHeight; ++iSrcLineOff)
5,470,351✔
3426
            {
3427
                const GPtrDiff_t j =
2,797,021✔
3428
                    static_cast<GPtrDiff_t>(iSrcLineOff) * nChunkXSize +
2,797,021✔
3429
                    (nSrcPixelStart - nChunkXOff);
2,797,021✔
3430
                const double dfVal = GDALResampleConvolutionHorizontal(
5,549,582✔
3431
                    pChunk + j, padfWeights, nSrcPixelCount);
2,797,021✔
3432
                padfHorizontalFiltered[static_cast<size_t>(iSrcLineOff) *
2,797,200✔
3433
                                           nDstXSize +
2,797,200✔
3434
                                       iDstPixel - nDstXOff] = dfVal;
2,797,200✔
3435
            }
3436
        }
3437
        else
3438
        {
3439
            for (int iSrcLineOff = 0; iSrcLineOff < nHeight; ++iSrcLineOff)
18,408,372✔
3440
            {
3441
                const GPtrDiff_t j =
18,333,218✔
3442
                    static_cast<GPtrDiff_t>(iSrcLineOff) * nChunkXSize +
18,333,218✔
3443
                    (nSrcPixelStart - nChunkXOff);
18,333,218✔
3444

3445
                if (bKernelWithNegativeWeights)
18,333,218✔
3446
                {
3447
                    int nConsecutiveValid = 0;
17,852,612✔
3448
                    int nMaxConsecutiveValid = 0;
17,852,612✔
3449
                    for (int k = 0; k < nSrcPixelCount; k++)
165,500,458✔
3450
                    {
3451
                        if (pabyChunkNodataMask[j + k])
147,648,146✔
3452
                            nConsecutiveValid++;
40,762,353✔
3453
                        else if (nConsecutiveValid)
106,885,793✔
3454
                        {
3455
                            nMaxConsecutiveValid = std::max(
105,332✔
3456
                                nMaxConsecutiveValid, nConsecutiveValid);
105,332✔
3457
                            nConsecutiveValid = 0;
105,332✔
3458
                        }
3459
                    }
3460
                    nMaxConsecutiveValid =
17,852,612✔
3461
                        std::max(nMaxConsecutiveValid, nConsecutiveValid);
17,852,612✔
3462
                    if (nMaxConsecutiveValid < nSrcPixelCount / 2)
17,852,612✔
3463
                    {
3464
                        const size_t nTempOffset =
12,526,307✔
3465
                            static_cast<size_t>(iSrcLineOff) * nDstXSize +
12,526,307✔
3466
                            iDstPixel - nDstXOff;
12,526,307✔
3467
                        padfHorizontalFiltered[nTempOffset] = 0.0;
12,526,307✔
3468
                        pabyChunkNodataMaskHorizontalFiltered[nTempOffset] = 0;
12,526,307✔
3469
                        continue;
12,526,307✔
3470
                    }
3471
                }
3472

3473
                double dfVal = 0.0;
5,806,881✔
3474
                GDALResampleConvolutionHorizontalWithMask(
5,806,881✔
3475
                    pChunk + j, pabyChunkNodataMask + j, padfWeights,
5,806,881✔
3476
                    nSrcPixelCount, dfVal, dfWeightSum);
3477
                const size_t nTempOffset =
5,809,858✔
3478
                    static_cast<size_t>(iSrcLineOff) * nDstXSize + iDstPixel -
5,809,858✔
3479
                    nDstXOff;
5,809,858✔
3480
                if (dfWeightSum > 0.0)
5,809,858✔
3481
                {
3482
                    padfHorizontalFiltered[nTempOffset] = dfVal / dfWeightSum;
5,762,218✔
3483
                    pabyChunkNodataMaskHorizontalFiltered[nTempOffset] = 1;
5,762,218✔
3484
                }
3485
                else
3486
                {
3487
                    padfHorizontalFiltered[nTempOffset] = 0.0;
47,701✔
3488
                    pabyChunkNodataMaskHorizontalFiltered[nTempOffset] = 0;
47,701✔
3489
                }
3490
            }
3491
        }
3492
    }
3493

3494
    /* ==================================================================== */
3495
    /*      Second pass: vertical filter                                    */
3496
    /* ==================================================================== */
3497
    const int nChunkBottomYOff = nChunkYOff + nChunkYSize;
3,736✔
3498

3499
    for (int iDstLine = nDstYOff; iDstLine < nDstYOff2; ++iDstLine)
202,263✔
3500
    {
3501
        Twork *const pafDstScanline =
198,527✔
3502
            pafWrkScanline ? pafWrkScanline
198,527✔
3503
                           : static_cast<Twork *>(pDstBuffer) +
8,421✔
3504
                                 (iDstLine - nDstYOff) * nDstXSize;
8,421✔
3505

3506
        const double dfSrcLine =
198,527✔
3507
            (iDstLine + 0.5) * dfYRatioDstToSrc + dfSrcYDelta;
198,527✔
3508
        int nSrcLineStart =
198,527✔
3509
            static_cast<int>(floor(dfSrcLine - dfYScaledRadius + 0.5));
198,527✔
3510
        int nSrcLineStop = static_cast<int>(dfSrcLine + dfYScaledRadius + 0.5);
198,527✔
3511
        if (nSrcLineStart < nChunkYOff)
198,527✔
3512
            nSrcLineStart = nChunkYOff;
2,359✔
3513
        if (nSrcLineStop > nChunkBottomYOff)
198,527✔
3514
            nSrcLineStop = nChunkBottomYOff;
2,403✔
3515
#if 0
3516
        if( nSrcLineStart < nChunkYOff &&
3517
            nChunkYOff > 0 )
3518
        {
3519
            printf( "truncated iDstLine = %d\n", iDstLine );/*ok*/
3520
        }
3521
        if( nSrcLineStop > nChunkBottomYOff && nChunkBottomYOff < nSrcHeight )
3522
        {
3523
            printf( "truncated iDstLine = %d\n", iDstLine );/*ok*/
3524
        }
3525
#endif
3526
        const int nSrcLineCount = nSrcLineStop - nSrcLineStart;
198,527✔
3527
        double dfWeightSum = 0.0;
198,527✔
3528

3529
        // Compute convolution coefficients.
3530
        int nSrcLine = nSrcLineStart;  // Used after for.
198,527✔
3531
        double dfY = dfYScaleWeight * (nSrcLine - dfSrcLine + 0.5);
198,527✔
3532
        for (; nSrcLine + 3 < nSrcLineStop;
440,198✔
3533
             nSrcLine += 4, dfY += 4 * dfYScaleWeight)
241,671✔
3534
        {
3535
            padfWeights[nSrcLine - nSrcLineStart] = dfY;
241,619✔
3536
            padfWeights[nSrcLine + 1 - nSrcLineStart] = dfY + dfYScaleWeight;
241,619✔
3537
            padfWeights[nSrcLine + 2 - nSrcLineStart] =
241,619✔
3538
                dfY + 2 * dfYScaleWeight;
241,619✔
3539
            padfWeights[nSrcLine + 3 - nSrcLineStart] =
241,619✔
3540
                dfY + 3 * dfYScaleWeight;
241,619✔
3541
            dfWeightSum +=
241,671✔
3542
                pfnFilterFunc4Values(padfWeights + nSrcLine - nSrcLineStart);
241,619✔
3543
        }
3544
        for (; nSrcLine < nSrcLineStop; ++nSrcLine, dfY += dfYScaleWeight)
231,876✔
3545
        {
3546
            const double dfWeight = pfnFilterFunc(dfY);
33,364✔
3547
            padfWeights[nSrcLine - nSrcLineStart] = dfWeight;
33,297✔
3548
            dfWeightSum += dfWeight;
33,297✔
3549
        }
3550

3551
        if (pabyChunkNodataMask == nullptr)
198,512✔
3552
        {
3553
            if (dfWeightSum != 0)
163,958✔
3554
            {
3555
                const double dfInvWeightSum = 1.0 / dfWeightSum;
163,959✔
3556
                for (int i = 0; i < nSrcLineCount; ++i)
921,339✔
3557
                    padfWeights[i] *= dfInvWeightSum;
757,380✔
3558
            }
3559
        }
3560

3561
        if (pabyChunkNodataMask == nullptr)
198,512✔
3562
        {
3563
            int iFilteredPixelOff = 0;  // Used after for.
163,960✔
3564
            // j used after for.
3565
            size_t j =
163,960✔
3566
                (nSrcLineStart - nChunkYOff) * static_cast<size_t>(nDstXSize);
163,960✔
3567
#ifdef USE_SSE2
3568
            if constexpr (eWrkDataType == GDT_Float32)
3569
            {
3570
#ifdef __AVX__
3571
                for (; iFilteredPixelOff + 15 < nDstXSize;
3572
                     iFilteredPixelOff += 16, j += 16)
3573
                {
3574
                    GDALResampleConvolutionVertical_16cols(
3575
                        padfHorizontalFiltered + j, nDstXSize, padfWeights,
3576
                        nSrcLineCount, pafDstScanline + iFilteredPixelOff);
3577
                    if (bHasNoData)
3578
                    {
3579
                        for (int k = 0; k < 16; k++)
3580
                        {
3581
                            pafDstScanline[iFilteredPixelOff + k] =
3582
                                replaceValIfNodata(
3583
                                    pafDstScanline[iFilteredPixelOff + k]);
3584
                        }
3585
                    }
3586
                }
3587
#else
3588
                for (; iFilteredPixelOff + 7 < nDstXSize;
19,017,426✔
3589
                     iFilteredPixelOff += 8, j += 8)
3590
                {
3591
                    GDALResampleConvolutionVertical_8cols(
18,863,000✔
3592
                        padfHorizontalFiltered + j, nDstXSize, padfWeights,
18,863,000✔
3593
                        nSrcLineCount, pafDstScanline + iFilteredPixelOff);
18,863,000✔
3594
                    if (bHasNoData)
18,860,750✔
3595
                    {
3596
                        for (int k = 0; k < 8; k++)
17,820✔
3597
                        {
3598
                            pafDstScanline[iFilteredPixelOff + k] =
15,840✔
3599
                                replaceValIfNodata(
15,840✔
3600
                                    pafDstScanline[iFilteredPixelOff + k]);
15,840✔
3601
                        }
3602
                    }
3603
                }
3604
#endif
3605

3606
                for (; iFilteredPixelOff < nDstXSize; iFilteredPixelOff++, j++)
617,996✔
3607
                {
3608
                    const Twork fVal =
463,630✔
3609
                        static_cast<Twork>(GDALResampleConvolutionVertical(
463,530✔
3610
                            padfHorizontalFiltered + j, nDstXSize, padfWeights,
463,530✔
3611
                            nSrcLineCount));
3612
                    pafDstScanline[iFilteredPixelOff] =
463,542✔
3613
                        replaceValIfNodata(fVal);
463,630✔
3614
                }
3615
            }
3616
            else
3617
#endif
3618
            {
3619
                for (; iFilteredPixelOff + 1 < nDstXSize;
2,887,210✔
3620
                     iFilteredPixelOff += 2, j += 2)
3621
                {
3622
                    double dfVal1 = 0.0;
2,880,000✔
3623
                    double dfVal2 = 0.0;
2,880,000✔
3624
                    GDALResampleConvolutionVertical_2cols(
2,880,000✔
3625
                        padfHorizontalFiltered + j, nDstXSize, padfWeights,
2,880,000✔
3626
                        nSrcLineCount, dfVal1, dfVal2);
3627
                    pafDstScanline[iFilteredPixelOff] =
5,760,010✔
3628
                        replaceValIfNodata(static_cast<Twork>(dfVal1));
2,880,000✔
3629
                    pafDstScanline[iFilteredPixelOff + 1] =
2,880,000✔
3630
                        replaceValIfNodata(static_cast<Twork>(dfVal2));
2,880,000✔
3631
                }
3632
                if (iFilteredPixelOff < nDstXSize)
7,206✔
3633
                {
3634
                    const double dfVal = GDALResampleConvolutionVertical(
2✔
3635
                        padfHorizontalFiltered + j, nDstXSize, padfWeights,
2✔
3636
                        nSrcLineCount);
3637
                    pafDstScanline[iFilteredPixelOff] =
2✔
3638
                        replaceValIfNodata(static_cast<Twork>(dfVal));
2✔
3639
                }
3640
            }
3641
        }
3642
        else
3643
        {
3644
            for (int iFilteredPixelOff = 0; iFilteredPixelOff < nDstXSize;
17,349,041✔
3645
                 ++iFilteredPixelOff)
3646
            {
3647
                double dfVal = 0.0;
17,314,505✔
3648
                dfWeightSum = 0.0;
17,314,505✔
3649
                size_t j = (nSrcLineStart - nChunkYOff) *
17,314,505✔
3650
                               static_cast<size_t>(nDstXSize) +
17,314,505✔
3651
                           iFilteredPixelOff;
17,314,505✔
3652
                if (bKernelWithNegativeWeights)
17,314,505✔
3653
                {
3654
                    int nConsecutiveValid = 0;
17,089,601✔
3655
                    int nMaxConsecutiveValid = 0;
17,089,601✔
3656
                    for (int i = 0; i < nSrcLineCount; ++i, j += nDstXSize)
121,806,321✔
3657
                    {
3658
                        const double dfWeight =
104,717,020✔
3659
                            padfWeights[i] *
104,717,020✔
3660
                            pabyChunkNodataMaskHorizontalFiltered[j];
3661
                        if (pabyChunkNodataMaskHorizontalFiltered[j])
104,717,020✔
3662
                        {
3663
                            nConsecutiveValid++;
42,068,237✔
3664
                        }
3665
                        else if (nConsecutiveValid)
62,648,683✔
3666
                        {
3667
                            nMaxConsecutiveValid = std::max(
203,800✔
3668
                                nMaxConsecutiveValid, nConsecutiveValid);
203,800✔
3669
                            nConsecutiveValid = 0;
203,800✔
3670
                        }
3671
                        dfVal += padfHorizontalFiltered[j] * dfWeight;
104,717,020✔
3672
                        dfWeightSum += dfWeight;
104,717,020✔
3673
                    }
3674
                    nMaxConsecutiveValid =
17,089,601✔
3675
                        std::max(nMaxConsecutiveValid, nConsecutiveValid);
17,089,601✔
3676
                    if (nMaxConsecutiveValid < nSrcLineCount / 2)
17,089,601✔
3677
                    {
3678
                        pafDstScanline[iFilteredPixelOff] =
8,867,341✔
3679
                            static_cast<Twork>(dfNoDataValue);
8,867,249✔
3680
                        continue;
8,867,341✔
3681
                    }
3682
                }
3683
                else
3684
                {
3685
                    for (int i = 0; i < nSrcLineCount; ++i, j += nDstXSize)
1,130,262✔
3686
                    {
3687
                        const double dfWeight =
905,432✔
3688
                            padfWeights[i] *
905,432✔
3689
                            pabyChunkNodataMaskHorizontalFiltered[j];
3690
                        dfVal += padfHorizontalFiltered[j] * dfWeight;
905,432✔
3691
                        dfWeightSum += dfWeight;
905,432✔
3692
                    }
3693
                }
3694
                if (dfWeightSum > 0.0)
8,447,134✔
3695
                {
3696
                    pafDstScanline[iFilteredPixelOff] = replaceValIfNodata(
8,431,093✔
3697
                        static_cast<Twork>(dfVal / dfWeightSum));
8,431,081✔
3698
                }
3699
                else
3700
                {
3701
                    pafDstScanline[iFilteredPixelOff] =
16,045✔
3702
                        static_cast<Twork>(dfNoDataValue);
16,021✔
3703
                }
3704
            }
3705
        }
3706

3707
        if (fMaxVal != 0.0f)
196,224✔
3708
        {
3709
            for (int i = 0; i < nDstXSize; ++i)
192,324✔
3710
            {
3711
                if (pafDstScanline[i] > fMaxVal)
192,088✔
3712
                    pafDstScanline[i] = fMaxVal;
96,022✔
3713
            }
3714
        }
3715

3716
        if (pafWrkScanline)
196,224✔
3717
        {
3718
            GDALCopyWords64(pafWrkScanline, eWrkDataType, nWrkDataTypeSize,
190,110✔
3719
                            static_cast<GByte *>(pDstBuffer) +
3720
                                static_cast<size_t>(iDstLine - nDstYOff) *
190,110✔
3721
                                    nDstXSize * nDstDataTypeSize,
190,110✔
3722
                            dstDataType, nDstDataTypeSize, nDstXSize);
3723
        }
3724
    }
3725

3726
    VSIFree(pafWrkScanline);
3,736✔
3727
    VSIFreeAligned(padfWeights);
3,736✔
3728
    VSIFree(padfHorizontalFiltered);
3,736✔
3729
    VSIFree(pabyChunkNodataMaskHorizontalFiltered);
3,736✔
3730

3731
    return CE_None;
3,736✔
3732
}
3733

3734
/** Entry point for convolution-based overview resampling of one chunk.
 *
 * Translates args.pszResampling (BILINEAR, CUBIC, CUBICSPLINE or LANCZOS)
 * into a warper kernel, allocates the destination buffer and forwards the
 * work to the GDALResampleChunk_ConvolutionT instantiation that matches
 * args.eWrkDataType.
 *
 * @param args Resampling arguments (method name, destination window, overview
 *             data type, working data type, NBITS).
 * @param pChunk Source chunk, reinterpreted according to args.eWrkDataType.
 * @param ppDstBuffer Output: receives the buffer allocated here, sized for the
 *                    destination window in args.eOvrDataType. It is not
 *                    released here when a later step fails.
 * @param peDstBufferDataType Output: set to args.eOvrDataType once the
 *                            allocation has succeeded.
 * @return CE_Failure for an unknown resampling method, a failed allocation or
 *         an unhandled working data type; otherwise the result of the typed
 *         implementation.
 */
static CPLErr
GDALResampleChunk_Convolution(const GDALOverviewResampleArgs &args,
                              const void *pChunk, void **ppDstBuffer,
                              GDALDataType *peDstBufferDataType)
{
    // Kernel selection. CUBIC and LANCZOS are the two kernels flagged as
    // having negative weights.
    const char *const pszMethod = args.pszResampling;
    GDALResampleAlg eKernel = GRA_Bilinear;
    bool bNegativeWeights = false;
    if (EQUAL(pszMethod, "BILINEAR"))
    {
        eKernel = GRA_Bilinear;
    }
    else if (EQUAL(pszMethod, "CUBIC"))
    {
        eKernel = GRA_Cubic;
        bNegativeWeights = true;
    }
    else if (EQUAL(pszMethod, "CUBICSPLINE"))
    {
        eKernel = GRA_CubicSpline;
    }
    else if (EQUAL(pszMethod, "LANCZOS"))
    {
        eKernel = GRA_Lanczos;
        bNegativeWeights = true;
    }
    else
    {
        CPLAssert(false);
        return CE_Failure;
    }

    const int nRadius = GWKGetFilterRadius(eKernel);
    FilterFuncType pfnKernel = GWKGetFilterFunc(eKernel);
    const FilterFunc4ValuesType pfnKernel4Values =
        GWKGetFilterFunc4Values(eKernel);

    // Non-bilinear kernels can overshoot: when NBITS is narrower than the
    // unsigned integer output type, results are clamped to 2^NBITS - 1.
    // A value of 0 means "no clamping".
    const auto eOvrType = args.eOvrDataType;
    const bool bUnsignedIntOutput = eOvrType == GDT_Byte ||
                                    eOvrType == GDT_UInt16 ||
                                    eOvrType == GDT_UInt32;
    float fClampMax = 0.f;
    if (eKernel != GRA_Bilinear && args.nOvrNBITS > 0 && bUnsignedIntOutput)
    {
        const int nBits = args.nOvrNBITS;
        const bool bFullWidth = (nBits == GDALGetDataTypeSize(eOvrType));
        // nBits > 0 is already guaranteed by the enclosing condition.
        if (!bFullWidth && nBits < 32)
            fClampMax = static_cast<float>((1U << nBits) - 1);
    }

    // Destination buffer: one overview-typed value per destination pixel.
    const auto nDstWidth = args.nDstXOff2 - args.nDstXOff;
    const auto nDstHeight = args.nDstYOff2 - args.nDstYOff;
    *ppDstBuffer = VSI_MALLOC3_VERBOSE(nDstWidth, nDstHeight,
                                       GDALGetDataTypeSizeBytes(eOvrType));
    if (!*ppDstBuffer)
        return CE_Failure;
    *peDstBufferDataType = eOvrType;

    // Integer working types accumulate in float, Float64 in double.
    switch (args.eWrkDataType)
    {
        case GDT_Byte:
            return GDALResampleChunk_ConvolutionT<GByte, float, GDT_Float32>(
                args, static_cast<const GByte *>(pChunk), *ppDstBuffer,
                pfnKernel, pfnKernel4Values, nRadius, bNegativeWeights,
                fClampMax);

        case GDT_UInt16:
            return GDALResampleChunk_ConvolutionT<GUInt16, float, GDT_Float32>(
                args, static_cast<const GUInt16 *>(pChunk), *ppDstBuffer,
                pfnKernel, pfnKernel4Values, nRadius, bNegativeWeights,
                fClampMax);

        case GDT_Float32:
            return GDALResampleChunk_ConvolutionT<float, float, GDT_Float32>(
                args, static_cast<const float *>(pChunk), *ppDstBuffer,
                pfnKernel, pfnKernel4Values, nRadius, bNegativeWeights,
                fClampMax);

        case GDT_Float64:
            return GDALResampleChunk_ConvolutionT<double, double, GDT_Float64>(
                args, static_cast<const double *>(pChunk), *ppDstBuffer,
                pfnKernel, pfnKernel4Values, nRadius, bNegativeWeights,
                fClampMax);

        default:
            break;
    }

    // No typed implementation exists for this working data type.
    CPLAssert(false);
    return CE_Failure;
}
3829

3830
/************************************************************************/
3831
/*                       GDALResampleChunkC32R()                        */
3832
/************************************************************************/
3833

3834
static CPLErr GDALResampleChunkC32R(const int nSrcWidth, const int nSrcHeight,
2✔
3835
                                    const float *pafChunk, const int nChunkYOff,
3836
                                    const int nChunkYSize, const int nDstYOff,
3837
                                    const int nDstYOff2, const int nOvrXSize,
3838
                                    const int nOvrYSize, void **ppDstBuffer,
3839
                                    GDALDataType *peDstBufferDataType,
3840
                                    const char *pszResampling)
3841

3842
{
3843
    enum Method
3844
    {
3845
        NEAR,
3846
        AVERAGE,
3847
        AVERAGE_MAGPHASE,
3848
        RMS,
3849
    };
3850

3851
    Method eMethod = NEAR;
2✔
3852
    if (STARTS_WITH_CI(pszResampling, "NEAR"))
2✔
3853
    {
3854
        eMethod = NEAR;
×
3855
    }
3856
    else if (EQUAL(pszResampling, "AVERAGE_MAGPHASE"))
2✔
3857
    {
3858
        eMethod = AVERAGE_MAGPHASE;
×
3859
    }
3860
    else if (EQUAL(pszResampling, "RMS"))
2✔
3861
    {
3862
        eMethod = RMS;
2✔
3863
    }
3864
    else if (STARTS_WITH_CI(pszResampling, "AVER"))
×
3865
    {
3866
        eMethod = AVERAGE;
×
3867
    }
3868
    else
3869
    {
3870
        CPLError(
×
3871
            CE_Failure, CPLE_NotSupported,
3872
            "Resampling method %s is not supported for complex data types. "
3873
            "Only NEAREST, AVERAGE, AVERAGE_MAGPHASE and RMS are supported",
3874
            pszResampling);
3875
        return CE_Failure;
×
3876
    }
3877

3878
    const int nOXSize = nOvrXSize;
2✔
3879
    *ppDstBuffer = VSI_MALLOC3_VERBOSE(nOXSize, nDstYOff2 - nDstYOff,
2✔
3880
                                       GDALGetDataTypeSizeBytes(GDT_CFloat32));
3881
    if (*ppDstBuffer == nullptr)
2✔
3882
    {
3883
        return CE_Failure;
×
3884
    }
3885
    float *const pafDstBuffer = static_cast<float *>(*ppDstBuffer);
2✔
3886
    *peDstBufferDataType = GDT_CFloat32;
2✔
3887

3888
    const int nOYSize = nOvrYSize;
2✔
3889
    const double dfXRatioDstToSrc = static_cast<double>(nSrcWidth) / nOXSize;
2✔
3890
    const double dfYRatioDstToSrc = static_cast<double>(nSrcHeight) / nOYSize;
2✔
3891

3892
    /* ==================================================================== */
3893
    /*      Loop over destination scanlines.                                */
3894
    /* ==================================================================== */
3895
    for (int iDstLine = nDstYOff; iDstLine < nDstYOff2; ++iDstLine)
8✔
3896
    {
3897
        int nSrcYOff = static_cast<int>(0.5 + iDstLine * dfYRatioDstToSrc);
6✔
3898
        if (nSrcYOff < nChunkYOff)
6✔
3899
            nSrcYOff = nChunkYOff;
×
3900

3901
        int nSrcYOff2 =
6✔
3902
            static_cast<int>(0.5 + (iDstLine + 1) * dfYRatioDstToSrc);
6✔
3903
        if (nSrcYOff2 == nSrcYOff)
6✔
3904
            nSrcYOff2++;
×
3905

3906
        if (nSrcYOff2 > nSrcHeight || iDstLine == nOYSize - 1)
6✔
3907
        {
3908
            if (nSrcYOff == nSrcHeight && nSrcHeight - 1 >= nChunkYOff)
2✔
3909
                nSrcYOff = nSrcHeight - 1;
×
3910
            nSrcYOff2 = nSrcHeight;
2✔
3911
        }
3912
        if (nSrcYOff2 > nChunkYOff + nChunkYSize)
6✔
3913
            nSrcYOff2 = nChunkYOff + nChunkYSize;
×
3914

3915
        const float *const pafSrcScanline =
6✔
3916
            pafChunk + ((nSrcYOff - nChunkYOff) * nSrcWidth) * 2;
6✔
3917
        float *const pafDstScanline =
6✔
3918
            pafDstBuffer + (iDstLine - nDstYOff) * 2 * nOXSize;
6✔
3919

3920
        /* --------------------------------------------------------------------
3921
         */
3922
        /*      Loop over destination pixels */
3923
        /* --------------------------------------------------------------------
3924
         */
3925
        for (int iDstPixel = 0; iDstPixel < nOXSize; ++iDstPixel)
18✔
3926
        {
3927
            int nSrcXOff = static_cast<int>(0.5 + iDstPixel * dfXRatioDstToSrc);
12✔
3928
            int nSrcXOff2 =
12✔
3929
                static_cast<int>(0.5 + (iDstPixel + 1) * dfXRatioDstToSrc);
12✔
3930
            if (nSrcXOff2 == nSrcXOff)
12✔
3931
                nSrcXOff2++;
×
3932
            if (nSrcXOff2 > nSrcWidth || iDstPixel == nOXSize - 1)
12✔
3933
            {
3934
                if (nSrcXOff == nSrcWidth && nSrcWidth - 1 >= 0)
6✔
3935
                    nSrcXOff = nSrcWidth - 1;
×
3936
                nSrcXOff2 = nSrcWidth;
6✔
3937
            }
3938

3939
            if (eMethod == NEAR)
12✔
3940
            {
3941
                pafDstScanline[iDstPixel * 2] = pafSrcScanline[nSrcXOff * 2];
×
3942
                pafDstScanline[iDstPixel * 2 + 1] =
×
3943
                    pafSrcScanline[nSrcXOff * 2 + 1];
×
3944
            }
3945
            else if (eMethod == AVERAGE_MAGPHASE)
12✔
3946
            {
3947
                double dfTotalR = 0.0;
×
3948
                double dfTotalI = 0.0;
×
3949
                double dfTotalM = 0.0;
×
3950
                int nCount = 0;
×
3951

3952
                for (int iY = nSrcYOff; iY < nSrcYOff2; ++iY)
×
3953
                {
3954
                    for (int iX = nSrcXOff; iX < nSrcXOff2; ++iX)
×
3955
                    {
3956
                        const double dfR =
×
3957
                            pafSrcScanline[iX * 2 + static_cast<GPtrDiff_t>(
×
3958
                                                        iY - nSrcYOff) *
×
3959
                                                        nSrcWidth * 2];
×
3960
                        const double dfI =
×
3961
                            pafSrcScanline[iX * 2 +
×
3962
                                           static_cast<GPtrDiff_t>(iY -
×
3963
                                                                   nSrcYOff) *
×
3964
                                               nSrcWidth * 2 +
×
3965
                                           1];
×
3966
                        dfTotalR += dfR;
×
3967
                        dfTotalI += dfI;
×
3968
                        dfTotalM += std::hypot(dfR, dfI);
×
3969
                        ++nCount;
×
3970
                    }
3971
                }
3972

3973
                CPLAssert(nCount > 0);
×
3974
                if (nCount == 0)
×
3975
                {
3976
                    pafDstScanline[iDstPixel * 2] = 0.0;
×
3977
                    pafDstScanline[iDstPixel * 2 + 1] = 0.0;
×
3978
                }
3979
                else
3980
                {
3981
                    pafDstScanline[iDstPixel * 2] =
×
3982
                        static_cast<float>(dfTotalR / nCount);
×
3983
                    pafDstScanline[iDstPixel * 2 + 1] =
×
3984
                        static_cast<float>(dfTotalI / nCount);
×
3985
                    const double dfM =
3986
                        std::hypot(pafDstScanline[iDstPixel * 2],
×
3987
                                   pafDstScanline[iDstPixel * 2 + 1]);
×
3988
                    const double dfDesiredM = dfTotalM / nCount;
×
3989
                    double dfRatio = 1.0;
×
3990
                    if (dfM != 0.0)
×
3991
                        dfRatio = dfDesiredM / dfM;
×
3992

3993
                    pafDstScanline[iDstPixel * 2] *=
×
3994
                        static_cast<float>(dfRatio);
×
3995
                    pafDstScanline[iDstPixel * 2 + 1] *=
×
3996
                        static_cast<float>(dfRatio);
×
3997
                }
3998
            }
3999
            else if (eMethod == RMS)
12✔
4000
            {
4001
                double dfTotalR = 0.0;
12✔
4002
                double dfTotalI = 0.0;
12✔
4003
                int nCount = 0;
12✔
4004

4005
                for (int iY = nSrcYOff; iY < nSrcYOff2; ++iY)
36✔
4006
                {
4007
                    for (int iX = nSrcXOff; iX < nSrcXOff2; ++iX)
72✔
4008
                    {
4009
                        const double dfR =
48✔
4010
                            pafSrcScanline[iX * 2 + static_cast<GPtrDiff_t>(
48✔
4011
                                                        iY - nSrcYOff) *
48✔
4012
                                                        nSrcWidth * 2];
48✔
4013
                        const double dfI =
48✔
4014
                            pafSrcScanline[iX * 2 +
48✔
4015
                                           static_cast<GPtrDiff_t>(iY -
48✔
4016
                                                                   nSrcYOff) *
48✔
4017
                                               nSrcWidth * 2 +
48✔
4018
                                           1];
48✔
4019

4020
                        dfTotalR += SQUARE(dfR);
48✔
4021
                        dfTotalI += SQUARE(dfI);
48✔
4022

4023
                        ++nCount;
48✔
4024
                    }
4025
                }
4026

4027
                CPLAssert(nCount > 0);
12✔
4028
                if (nCount == 0)
12✔
4029
                {
4030
                    pafDstScanline[iDstPixel * 2] = 0.0;
×
4031
                    pafDstScanline[iDstPixel * 2 + 1] = 0.0;
×
4032
                }
4033
                else
4034
                {
4035
                    /* compute RMS */
4036
                    pafDstScanline[iDstPixel * 2] =
12✔
4037
                        static_cast<float>(sqrt(dfTotalR / nCount));
12✔
4038
                    pafDstScanline[iDstPixel * 2 + 1] =
12✔
4039
                        static_cast<float>(sqrt(dfTotalI / nCount));
12✔
4040
                }
4041
            }
4042
            else if (eMethod == AVERAGE)
×
4043
            {
4044
                double dfTotalR = 0.0;
×
4045
                double dfTotalI = 0.0;
×
4046
                int nCount = 0;
×
4047

4048
                for (int iY = nSrcYOff; iY < nSrcYOff2; ++iY)
×
4049
                {
4050
                    for (int iX = nSrcXOff; iX < nSrcXOff2; ++iX)
×
4051
                    {
4052
                        // TODO(schwehr): Maybe use std::complex?
4053
                        dfTotalR +=
×
4054
                            pafSrcScanline[iX * 2 + static_cast<GPtrDiff_t>(
×
4055
                                                        iY - nSrcYOff) *
×
4056
                                                        nSrcWidth * 2];
×
4057
                        dfTotalI += pafSrcScanline[iX * 2 +
×
4058
                                                   static_cast<GPtrDiff_t>(
×
4059
                                                       iY - nSrcYOff) *
×
4060
                                                       nSrcWidth * 2 +
×
4061
                                                   1];
×
4062
                        ++nCount;
×
4063
                    }
4064
                }
4065

4066
                CPLAssert(nCount > 0);
×
4067
                if (nCount == 0)
×
4068
                {
4069
                    pafDstScanline[iDstPixel * 2] = 0.0;
×
4070
                    pafDstScanline[iDstPixel * 2 + 1] = 0.0;
×
4071
                }
4072
                else
4073
                {
4074
                    pafDstScanline[iDstPixel * 2] =
×
4075
                        static_cast<float>(dfTotalR / nCount);
×
4076
                    pafDstScanline[iDstPixel * 2 + 1] =
×
4077
                        static_cast<float>(dfTotalI / nCount);
×
4078
                }
4079
            }
4080
        }
4081
    }
4082

4083
    return CE_None;
2✔
4084
}
4085

4086
/************************************************************************/
4087
/*                  GDALRegenerateCascadingOverviews()                  */
4088
/*                                                                      */
4089
/*      Generate a list of overviews in order from largest to           */
4090
/*      smallest, computing each from the next larger.                  */
4091
/************************************************************************/
4092

4093
static CPLErr GDALRegenerateCascadingOverviews(
42✔
4094
    GDALRasterBand *poSrcBand, int nOverviews, GDALRasterBand **papoOvrBands,
4095
    const char *pszResampling, GDALProgressFunc pfnProgress,
4096
    void *pProgressData, CSLConstList papszOptions)
4097

4098
{
4099
    /* -------------------------------------------------------------------- */
4100
    /*      First, we must put the overviews in order from largest to       */
4101
    /*      smallest.                                                       */
4102
    /* -------------------------------------------------------------------- */
4103
    for (int i = 0; i < nOverviews - 1; ++i)
120✔
4104
    {
4105
        for (int j = 0; j < nOverviews - i - 1; ++j)
270✔
4106
        {
4107
            if (papoOvrBands[j]->GetXSize() *
192✔
4108
                    static_cast<float>(papoOvrBands[j]->GetYSize()) <
192✔
4109
                papoOvrBands[j + 1]->GetXSize() *
192✔
4110
                    static_cast<float>(papoOvrBands[j + 1]->GetYSize()))
192✔
4111
            {
4112
                GDALRasterBand *poTempBand = papoOvrBands[j];
×
4113
                papoOvrBands[j] = papoOvrBands[j + 1];
×
4114
                papoOvrBands[j + 1] = poTempBand;
×
4115
            }
4116
        }
4117
    }
4118

4119
    /* -------------------------------------------------------------------- */
4120
    /*      Count total pixels so we can prepare appropriate scaled         */
4121
    /*      progress functions.                                             */
4122
    /* -------------------------------------------------------------------- */
4123
    double dfTotalPixels = 0.0;
42✔
4124

4125
    for (int i = 0; i < nOverviews; ++i)
162✔
4126
    {
4127
        dfTotalPixels += papoOvrBands[i]->GetXSize() *
120✔
4128
                         static_cast<double>(papoOvrBands[i]->GetYSize());
120✔
4129
    }
4130

4131
    /* -------------------------------------------------------------------- */
4132
    /*      Generate all the bands.                                         */
4133
    /* -------------------------------------------------------------------- */
4134
    double dfPixelsProcessed = 0.0;
42✔
4135

4136
    for (int i = 0; i < nOverviews; ++i)
162✔
4137
    {
4138
        GDALRasterBand *poBaseBand = poSrcBand;
120✔
4139
        if (i != 0)
120✔
4140
            poBaseBand = papoOvrBands[i - 1];
78✔
4141

4142
        double dfPixels = papoOvrBands[i]->GetXSize() *
120✔
4143
                          static_cast<double>(papoOvrBands[i]->GetYSize());
120✔
4144

4145
        void *pScaledProgressData = GDALCreateScaledProgress(
240✔
4146
            dfPixelsProcessed / dfTotalPixels,
4147
            (dfPixelsProcessed + dfPixels) / dfTotalPixels, pfnProgress,
120✔
4148
            pProgressData);
4149

4150
        const CPLErr eErr = GDALRegenerateOverviewsEx(
240✔
4151
            poBaseBand, 1,
4152
            reinterpret_cast<GDALRasterBandH *>(papoOvrBands) + i,
120✔
4153
            pszResampling, GDALScaledProgress, pScaledProgressData,
4154
            papszOptions);
4155
        GDALDestroyScaledProgress(pScaledProgressData);
120✔
4156

4157
        if (eErr != CE_None)
120✔
4158
            return eErr;
×
4159

4160
        dfPixelsProcessed += dfPixels;
120✔
4161

4162
        // Only do the bit2grayscale promotion on the base band.
4163
        if (STARTS_WITH_CI(pszResampling,
120✔
4164
                           "AVERAGE_BIT2G" /* AVERAGE_BIT2GRAYSCALE */))
4165
            pszResampling = "AVERAGE";
8✔
4166
    }
4167

4168
    return CE_None;
42✔
4169
}
4170

4171
/************************************************************************/
4172
/*                    GDALGetResampleFunction()                         */
4173
/************************************************************************/
4174

4175
GDALResampleFunction GDALGetResampleFunction(const char *pszResampling,
3,903✔
4176
                                             int *pnRadius)
4177
{
4178
    if (pnRadius)
3,903✔
4179
        *pnRadius = 0;
3,903✔
4180
    if (STARTS_WITH_CI(pszResampling, "NEAR"))
3,903✔
4181
        return GDALResampleChunk_Near;
444✔
4182
    else if (STARTS_WITH_CI(pszResampling, "AVER") ||
3,459✔
4183
             EQUAL(pszResampling, "RMS"))
2,941✔
4184
        return GDALResampleChunk_AverageOrRMS;
548✔
4185
    else if (EQUAL(pszResampling, "GAUSS"))
2,911✔
4186
    {
4187
        if (pnRadius)
26✔
4188
            *pnRadius = 1;
26✔
4189
        return GDALResampleChunk_Gauss;
26✔
4190
    }
4191
    else if (EQUAL(pszResampling, "MODE"))
2,885✔
4192
        return GDALResampleChunk_Mode;
96✔
4193
    else if (EQUAL(pszResampling, "CUBIC"))
2,789✔
4194
    {
4195
        if (pnRadius)
413✔
4196
            *pnRadius = GWKGetFilterRadius(GRA_Cubic);
412✔
4197
        return GDALResampleChunk_Convolution;
412✔
4198
    }
4199
    else if (EQUAL(pszResampling, "CUBICSPLINE"))
2,376✔
4200
    {
4201
        if (pnRadius)
3✔
4202
            *pnRadius = GWKGetFilterRadius(GRA_CubicSpline);
3✔
4203
        return GDALResampleChunk_Convolution;
3✔
4204
    }
4205
    else if (EQUAL(pszResampling, "LANCZOS"))
2,373✔
4206
    {
4207
        if (pnRadius)
8✔
4208
            *pnRadius = GWKGetFilterRadius(GRA_Lanczos);
8✔
4209
        return GDALResampleChunk_Convolution;
8✔
4210
    }
4211
    else if (EQUAL(pszResampling, "BILINEAR"))
2,365✔
4212
    {
4213
        if (pnRadius)
2,367✔
4214
            *pnRadius = GWKGetFilterRadius(GRA_Bilinear);
2,367✔
4215
        return GDALResampleChunk_Convolution;
2,367✔
4216
    }
4217
    else
4218
    {
4219
        CPLError(
×
4220
            CE_Failure, CPLE_AppDefined,
4221
            "GDALGetResampleFunction: Unsupported resampling method \"%s\".",
4222
            pszResampling);
4223
        return nullptr;
×
4224
    }
4225
}
4226

4227
/************************************************************************/
4228
/*                      GDALGetOvrWorkDataType()                        */
4229
/************************************************************************/
4230

4231
GDALDataType GDALGetOvrWorkDataType(const char *pszResampling,
3,791✔
4232
                                    GDALDataType eSrcDataType)
4233
{
4234
    if (STARTS_WITH_CI(pszResampling, "NEAR") || EQUAL(pszResampling, "MODE"))
3,791✔
4235
    {
4236
        return eSrcDataType;
527✔
4237
    }
4238
    else if (eSrcDataType == GDT_Byte &&
3,264✔
4239
             (STARTS_WITH_CI(pszResampling, "AVER") ||
2,947✔
4240
              EQUAL(pszResampling, "RMS") || EQUAL(pszResampling, "CUBIC") ||
2,487✔
4241
              EQUAL(pszResampling, "CUBICSPLINE") ||
2,257✔
4242
              EQUAL(pszResampling, "LANCZOS") ||
2,254✔
4243
              EQUAL(pszResampling, "BILINEAR") || EQUAL(pszResampling, "MODE")))
2,249✔
4244
    {
4245
        return GDT_Byte;
2,939✔
4246
    }
4247
    else if (eSrcDataType == GDT_UInt16 &&
325✔
4248
             (STARTS_WITH_CI(pszResampling, "AVER") ||
120✔
4249
              EQUAL(pszResampling, "RMS") || EQUAL(pszResampling, "CUBIC") ||
107✔
4250
              EQUAL(pszResampling, "CUBICSPLINE") ||
3✔
4251
              EQUAL(pszResampling, "LANCZOS") ||
3✔
4252
              EQUAL(pszResampling, "BILINEAR") || EQUAL(pszResampling, "MODE")))
2✔
4253
    {
4254
        return GDT_UInt16;
115✔
4255
    }
4256
    else if (EQUAL(pszResampling, "GAUSS"))
210✔
4257
        return GDT_Float64;
20✔
4258

4259
    if (eSrcDataType == GDT_Byte || eSrcDataType == GDT_Int8 ||
190✔
4260
        eSrcDataType == GDT_UInt16 || eSrcDataType == GDT_Int16 ||
184✔
4261
        eSrcDataType == GDT_Float32)
4262
    {
4263
        return GDT_Float32;
154✔
4264
    }
4265
    return GDT_Float64;
36✔
4266
}
4267

4268
namespace
4269
{
4270
// Structure to hold a pointer to free with CPLFree()
4271
struct PointerHolder
4272
{
4273
    void *ptr = nullptr;
4274

4275
    explicit PointerHolder(void *ptrIn) : ptr(ptrIn)
34,735✔
4276
    {
4277
    }
34,735✔
4278

4279
    ~PointerHolder()
34,738✔
4280
    {
34,738✔
4281
        CPLFree(ptr);
34,738✔
4282
    }
34,738✔
4283

4284
    PointerHolder(const PointerHolder &) = delete;
4285
    PointerHolder &operator=(const PointerHolder &) = delete;
4286
};
4287
}  // namespace
4288

4289
/************************************************************************/
4290
/*                      GDALRegenerateOverviews()                       */
4291
/************************************************************************/
4292

4293
/**
4294
 * \brief Generate downsampled overviews.
4295
 *
4296
 * This function will generate one or more overview images from a base image
4297
 * using the requested downsampling algorithm.  Its primary use is for
4298
 * generating overviews via GDALDataset::BuildOverviews(), but it can also be
4299
 * used to generate downsampled images in one file from another outside the
4300
 * overview architecture.
4301
 *
4302
 * The output bands need to exist in advance.
4303
 *
4304
 * The full set of resampling algorithms is documented in
4305
 * GDALDataset::BuildOverviews().
4306
 *
4307
 * This function will honour properly NODATA_VALUES tuples (special dataset
4308
 * metadata) so that only a given RGB triplet (in case of a RGB image) will be
4309
 * considered as the nodata value and not each value of the triplet
4310
 * independently per band.
4311
 *
4312
 * Starting with GDAL 3.2, the GDAL_NUM_THREADS configuration option can be set
4313
 * to "ALL_CPUS" or a integer value to specify the number of threads to use for
4314
 * overview computation.
4315
 *
4316
 * @param hSrcBand the source (base level) band.
4317
 * @param nOverviewCount the number of downsampled bands being generated.
4318
 * @param pahOvrBands the list of downsampled bands to be generated.
4319
 * @param pszResampling Resampling algorithm (e.g. "AVERAGE").
4320
 * @param pfnProgress progress report function.
4321
 * @param pProgressData progress function callback data.
4322
 * @return CE_None on success or CE_Failure on failure.
4323
 */
4324
CPLErr GDALRegenerateOverviews(GDALRasterBandH hSrcBand, int nOverviewCount,
250✔
4325
                               GDALRasterBandH *pahOvrBands,
4326
                               const char *pszResampling,
4327
                               GDALProgressFunc pfnProgress,
4328
                               void *pProgressData)
4329

4330
{
4331
    return GDALRegenerateOverviewsEx(hSrcBand, nOverviewCount, pahOvrBands,
250✔
4332
                                     pszResampling, pfnProgress, pProgressData,
4333
                                     nullptr);
250✔
4334
}
4335

4336
/************************************************************************/
4337
/*                     GDALRegenerateOverviewsEx()                      */
4338
/************************************************************************/
4339

4340
/**
4341
 * \brief Generate downsampled overviews.
4342
 *
4343
 * This function will generate one or more overview images from a base image
4344
 * using the requested downsampling algorithm.  Its primary use is for
4345
 * generating overviews via GDALDataset::BuildOverviews(), but it can also be
4346
 * used to generate downsampled images in one file from another outside the
4347
 * overview architecture.
4348
 *
4349
 * The output bands need to exist in advance.
4350
 *
4351
 * The full set of resampling algorithms is documented in
4352
 * GDALDataset::BuildOverviews().
4353
 *
4354
 * This function will honour properly NODATA_VALUES tuples (special dataset
4355
 * metadata) so that only a given RGB triplet (in case of a RGB image) will be
4356
 * considered as the nodata value and not each value of the triplet
4357
 * independently per band.
4358
 *
4359
 * Starting with GDAL 3.2, the GDAL_NUM_THREADS configuration option can be set
4360
 * to "ALL_CPUS" or an integer value to specify the number of threads to use for
4361
 * overview computation.
4362
 *
4363
 * @param hSrcBand the source (base level) band.
4364
 * @param nOverviewCount the number of downsampled bands being generated.
4365
 * @param pahOvrBands the list of downsampled bands to be generated.
4366
 * @param pszResampling Resampling algorithm (e.g. "AVERAGE").
4367
 * @param pfnProgress progress report function.
4368
 * @param pProgressData progress function callback data.
4369
 * @param papszOptions NULL terminated list of options as key=value pairs, or
4370
 * NULL
4371
 * @return CE_None on success or CE_Failure on failure.
4372
 * @since GDAL 3.6
4373
 */
4374
CPLErr GDALRegenerateOverviewsEx(GDALRasterBandH hSrcBand, int nOverviewCount,
813✔
4375
                                 GDALRasterBandH *pahOvrBands,
4376
                                 const char *pszResampling,
4377
                                 GDALProgressFunc pfnProgress,
4378
                                 void *pProgressData, CSLConstList papszOptions)
4379

4380
{
4381
    GDALRasterBand *poSrcBand = GDALRasterBand::FromHandle(hSrcBand);
813✔
4382
    GDALRasterBand **papoOvrBands =
813✔
4383
        reinterpret_cast<GDALRasterBand **>(pahOvrBands);
4384

4385
    if (pfnProgress == nullptr)
813✔
4386
        pfnProgress = GDALDummyProgress;
250✔
4387

4388
    if (EQUAL(pszResampling, "NONE"))
813✔
4389
        return CE_None;
61✔
4390

4391
    int nKernelRadius = 0;
752✔
4392
    GDALResampleFunction pfnResampleFn =
4393
        GDALGetResampleFunction(pszResampling, &nKernelRadius);
752✔
4394

4395
    if (pfnResampleFn == nullptr)
752✔
4396
        return CE_Failure;
×
4397

4398
    /* -------------------------------------------------------------------- */
4399
    /*      Check color tables...                                           */
4400
    /* -------------------------------------------------------------------- */
4401
    GDALColorTable *poColorTable = nullptr;
752✔
4402

4403
    if ((STARTS_WITH_CI(pszResampling, "AVER") || EQUAL(pszResampling, "RMS") ||
391✔
4404
         EQUAL(pszResampling, "MODE") || EQUAL(pszResampling, "GAUSS")) &&
1,578✔
4405
        poSrcBand->GetColorInterpretation() == GCI_PaletteIndex)
446✔
4406
    {
4407
        poColorTable = poSrcBand->GetColorTable();
9✔
4408
        if (poColorTable != nullptr)
9✔
4409
        {
4410
            if (poColorTable->GetPaletteInterpretation() != GPI_RGB)
9✔
4411
            {
4412
                CPLError(CE_Warning, CPLE_AppDefined,
×
4413
                         "Computing overviews on palette index raster bands "
4414
                         "with a palette whose color interpretation is not RGB "
4415
                         "will probably lead to unexpected results.");
4416
                poColorTable = nullptr;
×
4417
            }
4418
            else if (poColorTable->IsIdentity())
9✔
4419
            {
4420
                poColorTable = nullptr;
×
4421
            }
4422
        }
4423
        else
4424
        {
4425
            CPLError(CE_Warning, CPLE_AppDefined,
×
4426
                     "Computing overviews on palette index raster bands "
4427
                     "without a palette will probably lead to unexpected "
4428
                     "results.");
4429
        }
4430
    }
4431
    // Not ready yet
4432
    else if ((EQUAL(pszResampling, "CUBIC") ||
2,175✔
4433
              EQUAL(pszResampling, "CUBICSPLINE") ||
689✔
4434
              EQUAL(pszResampling, "LANCZOS") ||
689✔
4435
              EQUAL(pszResampling, "BILINEAR")) &&
1,489✔
4436
             poSrcBand->GetColorInterpretation() == GCI_PaletteIndex)
57✔
4437
    {
4438
        CPLError(CE_Warning, CPLE_AppDefined,
×
4439
                 "Computing %s overviews on palette index raster bands "
4440
                 "will probably lead to unexpected results.",
4441
                 pszResampling);
4442
    }
4443

4444
    // If we have a nodata mask and we are doing something more complicated
4445
    // than nearest neighbouring, we have to fetch the nodata mask.
4446

4447
    GDALRasterBand *poMaskBand = nullptr;
752✔
4448
    bool bUseNoDataMask = false;
752✔
4449
    bool bCanUseCascaded = true;
752✔
4450

4451
    if (!STARTS_WITH_CI(pszResampling, "NEAR"))
752✔
4452
    {
4453
        // Special case if we are an alpha/mask band. We want it to be
4454
        // considered as the mask band to avoid alpha=0 to be taken into account
4455
        // in average computation.
4456
        if (poSrcBand->IsMaskBand())
503✔
4457
        {
4458
            poMaskBand = poSrcBand;
90✔
4459
            bUseNoDataMask = true;
90✔
4460
        }
4461
        else
4462
        {
4463
            poMaskBand = poSrcBand->GetMaskBand();
413✔
4464
            const int nMaskFlags = poSrcBand->GetMaskFlags();
413✔
4465
            bCanUseCascaded =
413✔
4466
                (nMaskFlags == GMF_NODATA || nMaskFlags == GMF_ALL_VALID);
413✔
4467
            bUseNoDataMask = (nMaskFlags & GMF_ALL_VALID) == 0;
413✔
4468
        }
4469
    }
4470

4471
    /* -------------------------------------------------------------------- */
4472
    /*      If we are operating on multiple overviews, and using            */
4473
    /*      averaging, lets do them in cascading order to reduce the        */
4474
    /*      amount of computation.                                          */
4475
    /* -------------------------------------------------------------------- */
4476

4477
    // In case the mask may be computed from another band of the dataset,
4478
    // we can't use cascaded generation, as the computation of the overviews
4479
    // of the band used for the mask band may not have yet occurred (#3033).
4480
    if ((STARTS_WITH_CI(pszResampling, "AVER") ||
752✔
4481
         EQUAL(pszResampling, "GAUSS") || EQUAL(pszResampling, "RMS") ||
391✔
4482
         EQUAL(pszResampling, "CUBIC") || EQUAL(pszResampling, "CUBICSPLINE") ||
360✔
4483
         EQUAL(pszResampling, "LANCZOS") || EQUAL(pszResampling, "BILINEAR") ||
306✔
4484
         EQUAL(pszResampling, "MODE")) &&
752✔
4485
        nOverviewCount > 1 && bCanUseCascaded)
42✔
4486
        return GDALRegenerateCascadingOverviews(
42✔
4487
            poSrcBand, nOverviewCount, papoOvrBands, pszResampling, pfnProgress,
4488
            pProgressData, papszOptions);
42✔
4489

4490
    /* -------------------------------------------------------------------- */
4491
    /*      Setup one horizontal swath to read from the raw buffer.         */
4492
    /* -------------------------------------------------------------------- */
4493
    int nFRXBlockSize = 0;
710✔
4494
    int nFRYBlockSize = 0;
710✔
4495
    poSrcBand->GetBlockSize(&nFRXBlockSize, &nFRYBlockSize);
710✔
4496

4497
    const GDALDataType eSrcDataType = poSrcBand->GetRasterDataType();
710✔
4498
    const bool bUseGenericResampleFn = STARTS_WITH_CI(pszResampling, "NEAR") ||
1,171✔
4499
                                       EQUAL(pszResampling, "MODE") ||
1,125✔
4500
                                       !GDALDataTypeIsComplex(eSrcDataType);
415✔
4501
    const GDALDataType eWrkDataType =
4502
        bUseGenericResampleFn
4503
            ? GDALGetOvrWorkDataType(pszResampling, eSrcDataType)
710✔
4504
            : GDT_CFloat32;
710✔
4505

4506
    const int nWidth = poSrcBand->GetXSize();
710✔
4507
    const int nHeight = poSrcBand->GetYSize();
710✔
4508

4509
    int nMaxOvrFactor = 1;
710✔
4510
    for (int iOverview = 0; iOverview < nOverviewCount; ++iOverview)
1,501✔
4511
    {
4512
        const int nDstWidth = papoOvrBands[iOverview]->GetXSize();
791✔
4513
        const int nDstHeight = papoOvrBands[iOverview]->GetYSize();
791✔
4514
        nMaxOvrFactor = std::max(
791✔
4515
            nMaxOvrFactor,
4516
            static_cast<int>(static_cast<double>(nWidth) / nDstWidth + 0.5));
791✔
4517
        nMaxOvrFactor = std::max(
791✔
4518
            nMaxOvrFactor,
4519
            static_cast<int>(static_cast<double>(nHeight) / nDstHeight + 0.5));
791✔
4520
    }
4521

4522
    int nFullResYChunk = nFRYBlockSize;
710✔
4523
    int nMaxChunkYSizeQueried = 0;
710✔
4524

4525
    const auto UpdateChunkHeightAndGetChunkSize =
4526
        [&nFullResYChunk, &nMaxChunkYSizeQueried, nKernelRadius, nMaxOvrFactor,
9,232✔
4527
         eWrkDataType, nWidth]()
27,696✔
4528
    {
4529
        // Make sure that round(nChunkYOff / nMaxOvrFactor) < round((nChunkYOff
4530
        // + nFullResYChunk) / nMaxOvrFactor)
4531
        nFullResYChunk = std::max(nFullResYChunk, 2 * nMaxOvrFactor);
9,232✔
4532
        nMaxChunkYSizeQueried =
9,232✔
4533
            nFullResYChunk + 2 * nKernelRadius * nMaxOvrFactor;
9,232✔
4534
        return static_cast<GIntBig>(GDALGetDataTypeSizeBytes(eWrkDataType)) *
9,232✔
4535
               nMaxChunkYSizeQueried * nWidth;
9,232✔
4536
    };
710✔
4537

4538
    // Only configurable for debug / testing
4539
    const char *pszChunkYSize =
4540
        CPLGetConfigOption("GDAL_OVR_CHUNKYSIZE", nullptr);
710✔
4541
    if (pszChunkYSize)
710✔
4542
    {
4543
        // coverity[tainted_data]
4544
        nFullResYChunk = atoi(pszChunkYSize);
×
4545
    }
4546

4547
    // Only configurable for debug / testing
4548
    const int nChunkMaxSize =
4549
        atoi(CPLGetConfigOption("GDAL_OVR_CHUNK_MAX_SIZE", "10485760"));
710✔
4550

4551
    auto nChunkSize = UpdateChunkHeightAndGetChunkSize();
710✔
4552
    if (nChunkSize > nChunkMaxSize)
710✔
4553
    {
4554
        if (poColorTable == nullptr && nFRXBlockSize < nWidth &&
3✔
4555
            !GDALDataTypeIsComplex(eSrcDataType) &&
9✔
4556
            (!STARTS_WITH_CI(pszResampling, "AVER") ||
3✔
4557
             EQUAL(pszResampling, "AVERAGE")))
×
4558
        {
4559
            // If this is tiled, then use GDALRegenerateOverviewsMultiBand()
4560
            // which use a block based strategy, which is much less memory
4561
            // hungry.
4562
            return GDALRegenerateOverviewsMultiBand(
3✔
4563
                1, &poSrcBand, nOverviewCount, &papoOvrBands, pszResampling,
4564
                pfnProgress, pProgressData, papszOptions);
3✔
4565
        }
4566
        else if (nOverviewCount > 1 && STARTS_WITH_CI(pszResampling, "NEAR"))
×
4567
        {
4568
            return GDALRegenerateCascadingOverviews(
×
4569
                poSrcBand, nOverviewCount, papoOvrBands, pszResampling,
4570
                pfnProgress, pProgressData, papszOptions);
×
4571
        }
4572
    }
4573
    else if (pszChunkYSize == nullptr)
707✔
4574
    {
4575
        // Try to get as close as possible to nChunkMaxSize
4576
        while (nChunkSize * 2 < nChunkMaxSize)
9,229✔
4577
        {
4578
            nFullResYChunk *= 2;
8,522✔
4579
            nChunkSize = UpdateChunkHeightAndGetChunkSize();
8,522✔
4580
        }
4581
    }
4582

4583
    int nHasNoData = 0;
707✔
4584
    const double dfNoDataValue = poSrcBand->GetNoDataValue(&nHasNoData);
707✔
4585
    const bool bHasNoData = CPL_TO_BOOL(nHasNoData);
707✔
4586
    const bool bPropagateNoData =
4587
        CPLTestBool(CPLGetConfigOption("GDAL_OVR_PROPAGATE_NODATA", "NO"));
707✔
4588

4589
    // Structure describing a resampling job
4590
    struct OvrJob
4591
    {
4592
        // Buffers to free when job is finished
4593
        std::shared_ptr<PointerHolder> oSrcMaskBufferHolder{};
4594
        std::shared_ptr<PointerHolder> oSrcBufferHolder{};
4595
        std::unique_ptr<PointerHolder> oDstBufferHolder{};
4596

4597
        GDALRasterBand *poDstBand = nullptr;
4598

4599
        // Input parameters of pfnResampleFn
4600
        GDALResampleFunction pfnResampleFn = nullptr;
4601
        int nSrcWidth = 0;
4602
        int nSrcHeight = 0;
4603
        int nDstWidth = 0;
4604
        GDALOverviewResampleArgs args{};
4605
        const void *pChunk = nullptr;
4606
        bool bUseGenericResampleFn = false;
4607

4608
        // Output values of resampling function
4609
        CPLErr eErr = CE_Failure;
4610
        void *pDstBuffer = nullptr;
4611
        GDALDataType eDstBufferDataType = GDT_Unknown;
4612

4613
        // Synchronization
4614
        bool bFinished = false;
4615
        std::mutex mutex{};
4616
        std::condition_variable cv{};
4617

4618
        void SetSrcMaskBufferHolder(
×
4619
            const std::shared_ptr<PointerHolder> &oSrcMaskBufferHolderIn)
4620
        {
4621
            oSrcMaskBufferHolder = oSrcMaskBufferHolderIn;
×
4622
        }
×
4623

4624
        void SetSrcBufferHolder(
×
4625
            const std::shared_ptr<PointerHolder> &oSrcBufferHolderIn)
4626
        {
4627
            oSrcBufferHolder = oSrcBufferHolderIn;
×
4628
        }
×
4629
    };
4630

4631
    // Thread function to resample
4632
    const auto JobResampleFunc = [](void *pData)
789✔
4633
    {
4634
        OvrJob *poJob = static_cast<OvrJob *>(pData);
789✔
4635

4636
        if (poJob->bUseGenericResampleFn)
789✔
4637
        {
4638
            poJob->eErr = poJob->pfnResampleFn(poJob->args, poJob->pChunk,
787✔
4639
                                               &(poJob->pDstBuffer),
4640
                                               &(poJob->eDstBufferDataType));
4641
        }
4642
        else
4643
        {
4644
            poJob->eErr = GDALResampleChunkC32R(
2✔
4645
                poJob->nSrcWidth, poJob->nSrcHeight,
4646
                static_cast<const float *>(poJob->pChunk),
2✔
4647
                poJob->args.nChunkYOff, poJob->args.nChunkYSize,
4648
                poJob->args.nDstYOff, poJob->args.nDstYOff2,
4649
                poJob->args.nOvrXSize, poJob->args.nOvrYSize,
4650
                &(poJob->pDstBuffer), &(poJob->eDstBufferDataType),
4651
                poJob->args.pszResampling);
4652
        }
4653

4654
        poJob->oDstBufferHolder =
4655
            std::make_unique<PointerHolder>(poJob->pDstBuffer);
789✔
4656

4657
        {
4658
            std::lock_guard<std::mutex> guard(poJob->mutex);
1,578✔
4659
            poJob->bFinished = true;
789✔
4660
            poJob->cv.notify_one();
789✔
4661
        }
4662
    };
789✔
4663

4664
    // Function to write resample data to target band
4665
    const auto WriteJobData = [](const OvrJob *poJob)
789✔
4666
    {
4667
        return poJob->poDstBand->RasterIO(
1,578✔
4668
            GF_Write, 0, poJob->args.nDstYOff, poJob->nDstWidth,
789✔
4669
            poJob->args.nDstYOff2 - poJob->args.nDstYOff, poJob->pDstBuffer,
789✔
4670
            poJob->nDstWidth, poJob->args.nDstYOff2 - poJob->args.nDstYOff,
789✔
4671
            poJob->eDstBufferDataType, 0, 0, nullptr);
789✔
4672
    };
4673

4674
    // Wait for completion of oldest job and serialize it
4675
    const auto WaitAndFinalizeOldestJob =
4676
        [WriteJobData](std::list<std::unique_ptr<OvrJob>> &jobList)
×
4677
    {
4678
        auto poOldestJob = jobList.front().get();
×
4679
        {
4680
            std::unique_lock<std::mutex> oGuard(poOldestJob->mutex);
×
4681
            // coverity[missing_lock:FALSE]
4682
            while (!poOldestJob->bFinished)
×
4683
            {
4684
                poOldestJob->cv.wait(oGuard);
×
4685
            }
4686
        }
4687
        CPLErr l_eErr = poOldestJob->eErr;
×
4688
        if (l_eErr == CE_None)
×
4689
        {
4690
            l_eErr = WriteJobData(poOldestJob);
×
4691
        }
4692

4693
        jobList.pop_front();
×
4694
        return l_eErr;
×
4695
    };
4696

4697
    // Queue of jobs
4698
    std::list<std::unique_ptr<OvrJob>> jobList;
1,414✔
4699

4700
    GByte *pabyChunkNodataMask = nullptr;
707✔
4701
    void *pChunk = nullptr;
707✔
4702

4703
    const char *pszThreads = CPLGetConfigOption("GDAL_NUM_THREADS", "1");
707✔
4704
    const int nThreads = std::max(1, std::min(128, EQUAL(pszThreads, "ALL_CPUS")
2,828✔
4705
                                                       ? CPLGetNumCPUs()
707✔
4706
                                                       : atoi(pszThreads)));
707✔
4707
    auto poThreadPool =
4708
        nThreads > 1 ? GDALGetGlobalThreadPool(nThreads) : nullptr;
707✔
4709
    auto poJobQueue = poThreadPool ? poThreadPool->CreateJobQueue()
4710
                                   : std::unique_ptr<CPLJobQueue>(nullptr);
1,414✔
4711

4712
    /* -------------------------------------------------------------------- */
4713
    /*      Loop over image operating on chunks.                            */
4714
    /* -------------------------------------------------------------------- */
4715
    int nChunkYOff = 0;
707✔
4716
    CPLErr eErr = CE_None;
707✔
4717

4718
    for (nChunkYOff = 0; nChunkYOff < nHeight && eErr == CE_None;
1,419✔
4719
         nChunkYOff += nFullResYChunk)
712✔
4720
    {
4721
        if (!pfnProgress(nChunkYOff / static_cast<double>(nHeight), nullptr,
712✔
4722
                         pProgressData))
4723
        {
4724
            CPLError(CE_Failure, CPLE_UserInterrupt, "User terminated");
×
4725
            eErr = CE_Failure;
×
4726
        }
4727

4728
        if (nFullResYChunk + nChunkYOff > nHeight)
712✔
4729
            nFullResYChunk = nHeight - nChunkYOff;
705✔
4730

4731
        int nChunkYOffQueried = nChunkYOff - nKernelRadius * nMaxOvrFactor;
712✔
4732
        int nChunkYSizeQueried =
712✔
4733
            nFullResYChunk + 2 * nKernelRadius * nMaxOvrFactor;
712✔
4734
        if (nChunkYOffQueried < 0)
712✔
4735
        {
4736
            nChunkYSizeQueried += nChunkYOffQueried;
62✔
4737
            nChunkYOffQueried = 0;
62✔
4738
        }
4739
        if (nChunkYOffQueried + nChunkYSizeQueried > nHeight)
712✔
4740
            nChunkYSizeQueried = nHeight - nChunkYOffQueried;
62✔
4741

4742
        // Avoid accumulating too many tasks and exhaust RAM
4743
        // Try to complete already finished jobs
4744
        while (eErr == CE_None && !jobList.empty())
712✔
4745
        {
4746
            auto poOldestJob = jobList.front().get();
×
4747
            {
4748
                std::lock_guard<std::mutex> oGuard(poOldestJob->mutex);
×
4749
                if (!poOldestJob->bFinished)
×
4750
                {
4751
                    break;
×
4752
                }
4753
            }
4754
            eErr = poOldestJob->eErr;
×
4755
            if (eErr == CE_None)
×
4756
            {
4757
                eErr = WriteJobData(poOldestJob);
×
4758
            }
4759

4760
            jobList.pop_front();
×
4761
        }
4762

4763
        // And in case we have saturated the number of threads,
4764
        // wait for completion of tasks to go below the threshold.
4765
        while (eErr == CE_None &&
1,424✔
4766
               jobList.size() >= static_cast<size_t>(nThreads))
712✔
4767
        {
4768
            eErr = WaitAndFinalizeOldestJob(jobList);
×
4769
        }
4770

4771
        // (Re)allocate buffers if needed
4772
        if (pChunk == nullptr)
712✔
4773
        {
4774
            pChunk = VSI_MALLOC3_VERBOSE(GDALGetDataTypeSizeBytes(eWrkDataType),
707✔
4775
                                         nMaxChunkYSizeQueried, nWidth);
4776
        }
4777
        if (bUseNoDataMask && pabyChunkNodataMask == nullptr)
712✔
4778
        {
4779
            pabyChunkNodataMask = static_cast<GByte *>(
4780
                VSI_MALLOC2_VERBOSE(nMaxChunkYSizeQueried, nWidth));
274✔
4781
        }
4782

4783
        if (pChunk == nullptr ||
712✔
4784
            (bUseNoDataMask && pabyChunkNodataMask == nullptr))
274✔
4785
        {
4786
            CPLFree(pChunk);
×
4787
            CPLFree(pabyChunkNodataMask);
×
4788
            return CE_Failure;
×
4789
        }
4790

4791
        // Read chunk.
4792
        if (eErr == CE_None)
712✔
4793
            eErr = poSrcBand->RasterIO(GF_Read, 0, nChunkYOffQueried, nWidth,
712✔
4794
                                       nChunkYSizeQueried, pChunk, nWidth,
4795
                                       nChunkYSizeQueried, eWrkDataType, 0, 0,
4796
                                       nullptr);
4797
        if (eErr == CE_None && bUseNoDataMask)
712✔
4798
            eErr = poMaskBand->RasterIO(GF_Read, 0, nChunkYOffQueried, nWidth,
274✔
4799
                                        nChunkYSizeQueried, pabyChunkNodataMask,
4800
                                        nWidth, nChunkYSizeQueried, GDT_Byte, 0,
4801
                                        0, nullptr);
4802

4803
        // Special case to promote 1bit data to 8bit 0/255 values.
4804
        if (EQUAL(pszResampling, "AVERAGE_BIT2GRAYSCALE"))
712✔
4805
        {
4806
            if (eWrkDataType == GDT_Float32)
9✔
4807
            {
4808
                float *pafChunk = static_cast<float *>(pChunk);
×
4809
                for (GPtrDiff_t i = 0;
×
4810
                     i < static_cast<GPtrDiff_t>(nChunkYSizeQueried) * nWidth;
×
4811
                     i++)
4812
                {
4813
                    if (pafChunk[i] == 1.0)
×
4814
                        pafChunk[i] = 255.0;
×
4815
                }
4816
            }
4817
            else if (eWrkDataType == GDT_Byte)
9✔
4818
            {
4819
                GByte *pabyChunk = static_cast<GByte *>(pChunk);
9✔
4820
                for (GPtrDiff_t i = 0;
168,417✔
4821
                     i < static_cast<GPtrDiff_t>(nChunkYSizeQueried) * nWidth;
168,417✔
4822
                     i++)
4823
                {
4824
                    if (pabyChunk[i] == 1)
168,408✔
4825
                        pabyChunk[i] = 255;
127,437✔
4826
                }
4827
            }
4828
            else if (eWrkDataType == GDT_UInt16)
×
4829
            {
4830
                GUInt16 *pasChunk = static_cast<GUInt16 *>(pChunk);
×
4831
                for (GPtrDiff_t i = 0;
×
4832
                     i < static_cast<GPtrDiff_t>(nChunkYSizeQueried) * nWidth;
×
4833
                     i++)
4834
                {
4835
                    if (pasChunk[i] == 1)
×
4836
                        pasChunk[i] = 255;
×
4837
                }
4838
            }
4839
            else if (eWrkDataType == GDT_Float64)
×
4840
            {
4841
                double *padfChunk = static_cast<double *>(pChunk);
×
4842
                for (GPtrDiff_t i = 0;
×
4843
                     i < static_cast<GPtrDiff_t>(nChunkYSizeQueried) * nWidth;
×
4844
                     i++)
4845
                {
4846
                    if (padfChunk[i] == 1.0)
×
4847
                        padfChunk[i] = 255.0;
×
4848
                }
4849
            }
4850
            else
4851
            {
4852
                CPLAssert(false);
×
4853
            }
4854
        }
4855
        else if (EQUAL(pszResampling, "AVERAGE_BIT2GRAYSCALE_MINISWHITE"))
703✔
4856
        {
4857
            if (eWrkDataType == GDT_Float32)
×
4858
            {
4859
                float *pafChunk = static_cast<float *>(pChunk);
×
4860
                for (GPtrDiff_t i = 0;
×
4861
                     i < static_cast<GPtrDiff_t>(nChunkYSizeQueried) * nWidth;
×
4862
                     i++)
4863
                {
4864
                    if (pafChunk[i] == 1.0)
×
4865
                        pafChunk[i] = 0.0;
×
4866
                    else if (pafChunk[i] == 0.0)
×
4867
                        pafChunk[i] = 255.0;
×
4868
                }
4869
            }
4870
            else if (eWrkDataType == GDT_Byte)
×
4871
            {
4872
                GByte *pabyChunk = static_cast<GByte *>(pChunk);
×
4873
                for (GPtrDiff_t i = 0;
×
4874
                     i < static_cast<GPtrDiff_t>(nChunkYSizeQueried) * nWidth;
×
4875
                     i++)
4876
                {
4877
                    if (pabyChunk[i] == 1)
×
4878
                        pabyChunk[i] = 0;
×
4879
                    else if (pabyChunk[i] == 0)
×
4880
                        pabyChunk[i] = 255;
×
4881
                }
4882
            }
4883
            else if (eWrkDataType == GDT_UInt16)
×
4884
            {
4885
                GUInt16 *pasChunk = static_cast<GUInt16 *>(pChunk);
×
4886
                for (GPtrDiff_t i = 0;
×
4887
                     i < static_cast<GPtrDiff_t>(nChunkYSizeQueried) * nWidth;
×
4888
                     i++)
4889
                {
4890
                    if (pasChunk[i] == 1)
×
4891
                        pasChunk[i] = 0;
×
4892
                    else if (pasChunk[i] == 0)
×
4893
                        pasChunk[i] = 255;
×
4894
                }
4895
            }
4896
            else if (eWrkDataType == GDT_Float64)
×
4897
            {
4898
                double *padfChunk = static_cast<double *>(pChunk);
×
4899
                for (GPtrDiff_t i = 0;
×
4900
                     i < static_cast<GPtrDiff_t>(nChunkYSizeQueried) * nWidth;
×
4901
                     i++)
4902
                {
4903
                    if (padfChunk[i] == 1.0)
×
4904
                        padfChunk[i] = 0.0;
×
4905
                    else if (padfChunk[i] == 0.0)
×
4906
                        padfChunk[i] = 255.0;
×
4907
                }
4908
            }
4909
            else
4910
            {
4911
                CPLAssert(false);
×
4912
            }
4913
        }
4914

4915
        auto oSrcBufferHolder =
4916
            std::make_shared<PointerHolder>(poJobQueue ? pChunk : nullptr);
1,424✔
4917
        auto oSrcMaskBufferHolder = std::make_shared<PointerHolder>(
4918
            poJobQueue ? pabyChunkNodataMask : nullptr);
1,424✔
4919

4920
        for (int iOverview = 0; iOverview < nOverviewCount && eErr == CE_None;
1,501✔
4921
             ++iOverview)
4922
        {
4923
            GDALRasterBand *poDstBand = papoOvrBands[iOverview];
789✔
4924
            const int nDstWidth = poDstBand->GetXSize();
789✔
4925
            const int nDstHeight = poDstBand->GetYSize();
789✔
4926

4927
            const double dfXRatioDstToSrc =
789✔
4928
                static_cast<double>(nWidth) / nDstWidth;
789✔
4929
            const double dfYRatioDstToSrc =
789✔
4930
                static_cast<double>(nHeight) / nDstHeight;
789✔
4931

4932
            /* --------------------------------------------------------------------
4933
             */
4934
            /*      Figure out the line to start writing to, and the first line
4935
             */
4936
            /*      to not write to.  In theory this approach should ensure that
4937
             */
4938
            /*      every output line will be written if all input chunks are */
4939
            /*      processed. */
4940
            /* --------------------------------------------------------------------
4941
             */
4942
            int nDstYOff =
789✔
4943
                static_cast<int>(0.5 + nChunkYOff / dfYRatioDstToSrc);
789✔
4944
            if (nDstYOff == nDstHeight)
789✔
4945
                continue;
×
4946
            int nDstYOff2 = static_cast<int>(
789✔
4947
                0.5 + (nChunkYOff + nFullResYChunk) / dfYRatioDstToSrc);
789✔
4948

4949
            if (nChunkYOff + nFullResYChunk == nHeight)
789✔
4950
                nDstYOff2 = nDstHeight;
782✔
4951
#if DEBUG_VERBOSE
4952
            CPLDebug("GDAL",
4953
                     "Reading (%dx%d -> %dx%d) for output (%dx%d -> %dx%d)", 0,
4954
                     nChunkYOffQueried, nWidth, nChunkYSizeQueried, 0, nDstYOff,
4955
                     nDstWidth, nDstYOff2 - nDstYOff);
4956
#endif
4957

4958
            auto poJob = std::make_unique<OvrJob>();
1,578✔
4959
            poJob->pfnResampleFn = pfnResampleFn;
789✔
4960
            poJob->bUseGenericResampleFn = bUseGenericResampleFn;
789✔
4961
            poJob->args.eOvrDataType = poDstBand->GetRasterDataType();
789✔
4962
            poJob->args.nOvrXSize = poDstBand->GetXSize();
789✔
4963
            poJob->args.nOvrYSize = poDstBand->GetYSize();
789✔
4964
            const char *pszNBITS =
4965
                poDstBand->GetMetadataItem("NBITS", "IMAGE_STRUCTURE");
789✔
4966
            poJob->args.nOvrNBITS = pszNBITS ? atoi(pszNBITS) : 0;
789✔
4967
            poJob->args.dfXRatioDstToSrc = dfXRatioDstToSrc;
789✔
4968
            poJob->args.dfYRatioDstToSrc = dfYRatioDstToSrc;
789✔
4969
            poJob->args.eWrkDataType = eWrkDataType;
789✔
4970
            poJob->pChunk = pChunk;
789✔
4971
            poJob->args.pabyChunkNodataMask = pabyChunkNodataMask;
789✔
4972
            poJob->nSrcWidth = nWidth;
789✔
4973
            poJob->nSrcHeight = nHeight;
789✔
4974
            poJob->args.nChunkXOff = 0;
789✔
4975
            poJob->args.nChunkXSize = nWidth;
789✔
4976
            poJob->args.nChunkYOff = nChunkYOffQueried;
789✔
4977
            poJob->args.nChunkYSize = nChunkYSizeQueried;
789✔
4978
            poJob->nDstWidth = nDstWidth;
789✔
4979
            poJob->args.nDstXOff = 0;
789✔
4980
            poJob->args.nDstXOff2 = nDstWidth;
789✔
4981
            poJob->args.nDstYOff = nDstYOff;
789✔
4982
            poJob->args.nDstYOff2 = nDstYOff2;
789✔
4983
            poJob->poDstBand = poDstBand;
789✔
4984
            poJob->args.pszResampling = pszResampling;
789✔
4985
            poJob->args.bHasNoData = bHasNoData;
789✔
4986
            poJob->args.dfNoDataValue = dfNoDataValue;
789✔
4987
            poJob->args.poColorTable = poColorTable;
789✔
4988
            poJob->args.eSrcDataType = eSrcDataType;
789✔
4989
            poJob->args.bPropagateNoData = bPropagateNoData;
789✔
4990

4991
            if (poJobQueue)
789✔
4992
            {
4993
                poJob->SetSrcMaskBufferHolder(oSrcMaskBufferHolder);
×
4994
                poJob->SetSrcBufferHolder(oSrcBufferHolder);
×
4995
                poJobQueue->SubmitJob(JobResampleFunc, poJob.get());
×
4996
                jobList.emplace_back(std::move(poJob));
×
4997
            }
4998
            else
4999
            {
5000
                JobResampleFunc(poJob.get());
789✔
5001
                eErr = poJob->eErr;
789✔
5002
                if (eErr == CE_None)
789✔
5003
                {
5004
                    eErr = WriteJobData(poJob.get());
789✔
5005
                }
5006
            }
5007
        }
5008

5009
        if (poJobQueue)
712✔
5010
        {
5011
            pChunk = nullptr;
×
5012
            pabyChunkNodataMask = nullptr;
×
5013
        }
5014
    }
5015

5016
    VSIFree(pChunk);
707✔
5017
    VSIFree(pabyChunkNodataMask);
707✔
5018

5019
    // Wait for all pending jobs to complete
5020
    while (!jobList.empty())
707✔
5021
    {
5022
        const auto l_eErr = WaitAndFinalizeOldestJob(jobList);
×
5023
        if (l_eErr != CE_None && eErr == CE_None)
×
5024
            eErr = l_eErr;
×
5025
    }
5026

5027
    /* -------------------------------------------------------------------- */
5028
    /*      Renormalized overview mean / stddev if needed.                  */
5029
    /* -------------------------------------------------------------------- */
5030
    if (eErr == CE_None && EQUAL(pszResampling, "AVERAGE_MP"))
707✔
5031
    {
5032
        GDALOverviewMagnitudeCorrection(
×
5033
            poSrcBand, nOverviewCount,
5034
            reinterpret_cast<GDALRasterBandH *>(papoOvrBands),
5035
            GDALDummyProgress, nullptr);
5036
    }
5037

5038
    /* -------------------------------------------------------------------- */
5039
    /*      It can be important to flush out data to overviews.             */
5040
    /* -------------------------------------------------------------------- */
5041
    for (int iOverview = 0; eErr == CE_None && iOverview < nOverviewCount;
1,489✔
5042
         ++iOverview)
5043
    {
5044
        eErr = papoOvrBands[iOverview]->FlushCache(false);
782✔
5045
    }
5046

5047
    if (eErr == CE_None)
707✔
5048
        pfnProgress(1.0, nullptr, pProgressData);
707✔
5049

5050
    return eErr;
707✔
5051
}
5052

5053
/************************************************************************/
5054
/*            GDALRegenerateOverviewsMultiBand()                        */
5055
/************************************************************************/
5056

5057
/**
5058
 * \brief Variant of GDALRegenerateOverviews, specially dedicated for generating
5059
 * compressed pixel-interleaved overviews (JPEG-IN-TIFF for example)
5060
 *
5061
 * This function will generate one or more overview images from a base
5062
 * image using the requested downsampling algorithm.  Its primary use
5063
 * is for generating overviews via GDALDataset::BuildOverviews(), but it
5064
 * can also be used to generate downsampled images in one file from another
5065
 * outside the overview architecture.
5066
 *
5067
 * The output bands need to exist in advance and share the same characteristics
5068
 * (type, dimensions)
5069
 *
5070
 * The resampling algorithms supported for the moment are "NEAREST", "AVERAGE",
5071
 * "RMS", "GAUSS", "CUBIC", "CUBICSPLINE", "LANCZOS", "BILINEAR" and "MODE"
5072
 *
5073
 * It does not support color tables or complex data types.
5074
 *
5075
 * The pseudo-algorithm used by the function is :
5076
 *    for each overview
5077
 *       iterate on lines of the source by a step of deltay
5078
 *           iterate on columns of the source by a step of deltax
5079
 *               read the source data of size deltax * deltay for all the bands
5080
 *               generate the corresponding overview block for all the bands
5081
 *
5082
 * This function will properly honour NODATA_VALUES tuples (special dataset
5083
 * metadata) so that only a given RGB triplet (in case of a RGB image) will be
5084
 * considered as the nodata value and not each value of the triplet
5085
 * independently per band.
5086
 *
5087
 * Starting with GDAL 3.2, the GDAL_NUM_THREADS configuration option can be set
5088
 * to "ALL_CPUS" or an integer value to specify the number of threads to use for
5089
 * overview computation.
5090
 *
5091
 * @param nBands the number of bands, size of papoSrcBands and size of
5092
 *               first dimension of papapoOverviewBands
5093
 * @param papoSrcBands the list of source bands to downsample
5094
 * @param nOverviews the number of downsampled overview levels being generated.
5095
 * @param papapoOverviewBands two-dimensional array of bands. First dimension is
5096
 *                            indexed by nBands. Second dimension is indexed by
5097
 *                            nOverviews.
5098
 * @param pszResampling Resampling algorithm ("NEAREST", "AVERAGE", "RMS",
5099
 * "GAUSS", "CUBIC", "CUBICSPLINE", "LANCZOS", "BILINEAR" or "MODE").
5100
 * @param pfnProgress progress report function.
5101
 * @param pProgressData progress function callback data.
5102
 * @param papszOptions (GDAL >= 3.6) NULL terminated list of options as
5103
 *                     key=value pairs, or NULL
5104
 *                     Starting with GDAL 3.8, the XOFF, YOFF, XSIZE and YSIZE
5105
 *                     options can be specified to express that overviews should
5106
 *                     be regenerated only in the specified subset of the source
5107
 *                     dataset.
5108
 * @return CE_None on success or CE_Failure on failure.
5109
 */
5110

5111
CPLErr GDALRegenerateOverviewsMultiBand(
374✔
5112
    int nBands, GDALRasterBand *const *papoSrcBands, int nOverviews,
5113
    GDALRasterBand *const *const *papapoOverviewBands,
5114
    const char *pszResampling, GDALProgressFunc pfnProgress,
5115
    void *pProgressData, CSLConstList papszOptions)
5116
{
5117
    CPL_IGNORE_RET_VAL(papszOptions);
374✔
5118

5119
    if (pfnProgress == nullptr)
374✔
5120
        pfnProgress = GDALDummyProgress;
6✔
5121

5122
    if (EQUAL(pszResampling, "NONE"))
374✔
5123
        return CE_None;
2✔
5124

5125
    // Sanity checks.
5126
    if (!STARTS_WITH_CI(pszResampling, "NEAR") &&
372✔
5127
        !EQUAL(pszResampling, "RMS") && !EQUAL(pszResampling, "AVERAGE") &&
177✔
5128
        !EQUAL(pszResampling, "GAUSS") && !EQUAL(pszResampling, "CUBIC") &&
76✔
5129
        !EQUAL(pszResampling, "CUBICSPLINE") &&
18✔
5130
        !EQUAL(pszResampling, "LANCZOS") && !EQUAL(pszResampling, "BILINEAR") &&
17✔
5131
        !EQUAL(pszResampling, "MODE"))
5✔
5132
    {
5133
        CPLError(CE_Failure, CPLE_NotSupported,
×
5134
                 "GDALRegenerateOverviewsMultiBand: pszResampling='%s' "
5135
                 "not supported",
5136
                 pszResampling);
5137
        return CE_Failure;
×
5138
    }
5139

5140
    int nKernelRadius = 0;
372✔
5141
    GDALResampleFunction pfnResampleFn =
5142
        GDALGetResampleFunction(pszResampling, &nKernelRadius);
372✔
5143
    if (pfnResampleFn == nullptr)
372✔
5144
        return CE_Failure;
×
5145

5146
    const int nToplevelSrcWidth = papoSrcBands[0]->GetXSize();
372✔
5147
    const int nToplevelSrcHeight = papoSrcBands[0]->GetYSize();
372✔
5148
    if (nToplevelSrcWidth <= 0 || nToplevelSrcHeight <= 0)
372✔
5149
        return CE_None;
×
5150
    GDALDataType eDataType = papoSrcBands[0]->GetRasterDataType();
372✔
5151
    for (int iBand = 1; iBand < nBands; ++iBand)
688✔
5152
    {
5153
        if (papoSrcBands[iBand]->GetXSize() != nToplevelSrcWidth ||
632✔
5154
            papoSrcBands[iBand]->GetYSize() != nToplevelSrcHeight)
316✔
5155
        {
5156
            CPLError(
×
5157
                CE_Failure, CPLE_NotSupported,
5158
                "GDALRegenerateOverviewsMultiBand: all the source bands must "
5159
                "have the same dimensions");
5160
            return CE_Failure;
×
5161
        }
5162
        if (papoSrcBands[iBand]->GetRasterDataType() != eDataType)
316✔
5163
        {
5164
            CPLError(
×
5165
                CE_Failure, CPLE_NotSupported,
5166
                "GDALRegenerateOverviewsMultiBand: all the source bands must "
5167
                "have the same data type");
5168
            return CE_Failure;
×
5169
        }
5170
    }
5171

5172
    for (int iOverview = 0; iOverview < nOverviews; ++iOverview)
988✔
5173
    {
5174
        const auto poOvrFirstBand = papapoOverviewBands[0][iOverview];
616✔
5175
        const int nDstWidth = poOvrFirstBand->GetXSize();
616✔
5176
        const int nDstHeight = poOvrFirstBand->GetYSize();
616✔
5177
        for (int iBand = 1; iBand < nBands; ++iBand)
1,210✔
5178
        {
5179
            const auto poOvrBand = papapoOverviewBands[iBand][iOverview];
594✔
5180
            if (poOvrBand->GetXSize() != nDstWidth ||
1,188✔
5181
                poOvrBand->GetYSize() != nDstHeight)
594✔
5182
            {
5183
                CPLError(
×
5184
                    CE_Failure, CPLE_NotSupported,
5185
                    "GDALRegenerateOverviewsMultiBand: all the overviews bands "
5186
                    "of the same level must have the same dimensions");
5187
                return CE_Failure;
×
5188
            }
5189
            if (poOvrBand->GetRasterDataType() != eDataType)
594✔
5190
            {
5191
                CPLError(
×
5192
                    CE_Failure, CPLE_NotSupported,
5193
                    "GDALRegenerateOverviewsMultiBand: all the overviews bands "
5194
                    "must have the same data type as the source bands");
5195
                return CE_Failure;
×
5196
            }
5197
        }
5198
    }
5199

5200
    // First pass to compute the total number of pixels to write.
5201
    double dfTotalPixelCount = 0;
372✔
5202
    const int nSrcXOff = atoi(CSLFetchNameValueDef(papszOptions, "XOFF", "0"));
372✔
5203
    const int nSrcYOff = atoi(CSLFetchNameValueDef(papszOptions, "YOFF", "0"));
372✔
5204
    const int nSrcXSize = atoi(CSLFetchNameValueDef(
372✔
5205
        papszOptions, "XSIZE", CPLSPrintf("%d", nToplevelSrcWidth)));
5206
    const int nSrcYSize = atoi(CSLFetchNameValueDef(
372✔
5207
        papszOptions, "YSIZE", CPLSPrintf("%d", nToplevelSrcHeight)));
5208
    for (int iOverview = 0; iOverview < nOverviews; ++iOverview)
988✔
5209
    {
5210
        dfTotalPixelCount +=
616✔
5211
            static_cast<double>(nSrcXSize) / nToplevelSrcWidth *
1,232✔
5212
            papapoOverviewBands[0][iOverview]->GetXSize() *
616✔
5213
            static_cast<double>(nSrcYSize) / nToplevelSrcHeight *
1,232✔
5214
            papapoOverviewBands[0][iOverview]->GetYSize();
616✔
5215
    }
5216

5217
    const GDALDataType eWrkDataType =
5218
        GDALGetOvrWorkDataType(pszResampling, eDataType);
372✔
5219
    const int nWrkDataTypeSize = GDALGetDataTypeSizeBytes(eWrkDataType);
372✔
5220

5221
    const bool bIsMask = papoSrcBands[0]->IsMaskBand();
372✔
5222

5223
    // If we have a nodata mask and we are doing something more complicated
5224
    // than nearest neighbouring, we have to fetch the nodata mask.
5225
    const bool bUseNoDataMask =
5226
        !STARTS_WITH_CI(pszResampling, "NEAR") &&
541✔
5227
        (bIsMask || (papoSrcBands[0]->GetMaskFlags() & GMF_ALL_VALID) == 0);
169✔
5228

5229
    bool *const pabHasNoData =
5230
        static_cast<bool *>(VSI_MALLOC_VERBOSE(nBands * sizeof(bool)));
372✔
5231
    double *const padfNoDataValue =
5232
        static_cast<double *>(VSI_MALLOC_VERBOSE(nBands * sizeof(double)));
372✔
5233
    if (pabHasNoData == nullptr || padfNoDataValue == nullptr)
372✔
5234
    {
5235
        CPLFree(pabHasNoData);
×
5236
        CPLFree(padfNoDataValue);
×
5237
        return CE_Failure;
×
5238
    }
5239

5240
    for (int iBand = 0; iBand < nBands; ++iBand)
1,060✔
5241
    {
5242
        int nHasNoData = 0;
688✔
5243
        padfNoDataValue[iBand] =
1,376✔
5244
            papoSrcBands[iBand]->GetNoDataValue(&nHasNoData);
688✔
5245
        pabHasNoData[iBand] = CPL_TO_BOOL(nHasNoData);
688✔
5246
    }
5247
    const bool bPropagateNoData =
5248
        CPLTestBool(CPLGetConfigOption("GDAL_OVR_PROPAGATE_NODATA", "NO"));
372✔
5249

5250
    const char *pszThreads = CPLGetConfigOption("GDAL_NUM_THREADS", "1");
372✔
5251
    const int nThreads = std::max(1, std::min(128, EQUAL(pszThreads, "ALL_CPUS")
1,488✔
5252
                                                       ? CPLGetNumCPUs()
372✔
5253
                                                       : atoi(pszThreads)));
372✔
5254
    auto poThreadPool =
5255
        nThreads > 1 ? GDALGetGlobalThreadPool(nThreads) : nullptr;
372✔
5256
    auto poJobQueue = poThreadPool ? poThreadPool->CreateJobQueue()
5257
                                   : std::unique_ptr<CPLJobQueue>(nullptr);
372✔
5258

5259
    // Only configurable for debug / testing
5260
    const int nChunkMaxSize = std::max(
5261
        100, atoi(CPLGetConfigOption("GDAL_OVR_CHUNK_MAX_SIZE", "10485760")));
372✔
5262

5263
    // Second pass to do the real job.
5264
    double dfCurPixelCount = 0;
372✔
5265
    CPLErr eErr = CE_None;
372✔
5266
    for (int iOverview = 0; iOverview < nOverviews && eErr == CE_None;
987✔
5267
         ++iOverview)
5268
    {
5269
        int iSrcOverview = -1;  // -1 means the source bands.
615✔
5270

5271
        const int nDstTotalWidth =
5272
            papapoOverviewBands[0][iOverview]->GetXSize();
615✔
5273
        const int nDstTotalHeight =
5274
            papapoOverviewBands[0][iOverview]->GetYSize();
615✔
5275

5276
        // Compute the coordinates of the target region to refresh
5277
        constexpr double EPS = 1e-8;
615✔
5278
        const int nDstXOffStart = static_cast<int>(
615✔
5279
            static_cast<double>(nSrcXOff) / nToplevelSrcWidth * nDstTotalWidth +
615✔
5280
            EPS);
5281
        const int nDstXOffEnd =
5282
            std::min(static_cast<int>(
1,230✔
5283
                         std::ceil(static_cast<double>(nSrcXOff + nSrcXSize) /
615✔
5284
                                       nToplevelSrcWidth * nDstTotalWidth -
615✔
5285
                                   EPS)),
5286
                     nDstTotalWidth);
615✔
5287
        const int nDstWidth = nDstXOffEnd - nDstXOffStart;
615✔
5288
        const int nDstYOffStart =
615✔
5289
            static_cast<int>(static_cast<double>(nSrcYOff) /
615✔
5290
                                 nToplevelSrcHeight * nDstTotalHeight +
615✔
5291
                             EPS);
5292
        const int nDstYOffEnd =
5293
            std::min(static_cast<int>(
1,230✔
5294
                         std::ceil(static_cast<double>(nSrcYOff + nSrcYSize) /
615✔
5295
                                       nToplevelSrcHeight * nDstTotalHeight -
615✔
5296
                                   EPS)),
5297
                     nDstTotalHeight);
615✔
5298

5299
        // Try to use previous level of overview as the source to compute
5300
        // the next level.
5301
        int nSrcWidth = nToplevelSrcWidth;
615✔
5302
        int nSrcHeight = nToplevelSrcHeight;
615✔
5303
        if (iOverview > 0 &&
858✔
5304
            papapoOverviewBands[0][iOverview - 1]->GetXSize() > nDstTotalWidth)
243✔
5305
        {
5306
            nSrcWidth = papapoOverviewBands[0][iOverview - 1]->GetXSize();
235✔
5307
            nSrcHeight = papapoOverviewBands[0][iOverview - 1]->GetYSize();
235✔
5308
            iSrcOverview = iOverview - 1;
235✔
5309
        }
5310

5311
        const double dfXRatioDstToSrc =
615✔
5312
            static_cast<double>(nSrcWidth) / nDstTotalWidth;
615✔
5313
        const double dfYRatioDstToSrc =
615✔
5314
            static_cast<double>(nSrcHeight) / nDstTotalHeight;
615✔
5315

5316
        int nOvrFactor = std::max(static_cast<int>(0.5 + dfXRatioDstToSrc),
1,230✔
5317
                                  static_cast<int>(0.5 + dfYRatioDstToSrc));
615✔
5318
        if (nOvrFactor == 0)
615✔
5319
            nOvrFactor = 1;
×
5320

5321
        int nDstChunkXSize = 0;
615✔
5322
        int nDstChunkYSize = 0;
615✔
5323
        papapoOverviewBands[0][iOverview]->GetBlockSize(&nDstChunkXSize,
615✔
5324
                                                        &nDstChunkYSize);
5325

5326
        const char *pszDST_CHUNK_X_SIZE =
5327
            CSLFetchNameValue(papszOptions, "DST_CHUNK_X_SIZE");
615✔
5328
        const char *pszDST_CHUNK_Y_SIZE =
5329
            CSLFetchNameValue(papszOptions, "DST_CHUNK_Y_SIZE");
615✔
5330
        if (pszDST_CHUNK_X_SIZE && pszDST_CHUNK_Y_SIZE)
615✔
5331
        {
5332
            nDstChunkXSize = std::max(1, atoi(pszDST_CHUNK_X_SIZE));
12✔
5333
            nDstChunkYSize = std::max(1, atoi(pszDST_CHUNK_Y_SIZE));
12✔
5334
            CPLDebug("GDAL", "Using dst chunk size %d x %d", nDstChunkXSize,
12✔
5335
                     nDstChunkYSize);
5336
        }
5337

5338
        // Try to extend the chunk size so that the memory needed to acquire
5339
        // source pixels goes up to 10 MB.
5340
        // This can help for drivers that support multi-threaded reading
5341
        const int nFullResYChunk =
615✔
5342
            2 + static_cast<int>(nDstChunkYSize * dfYRatioDstToSrc);
615✔
5343
        const int nFullResYChunkQueried =
615✔
5344
            nFullResYChunk + 2 * nKernelRadius * nOvrFactor;
615✔
5345
        while (nDstChunkXSize < nDstWidth)
857✔
5346
        {
5347
            const int nFullResXChunk =
259✔
5348
                2 + static_cast<int>(2 * nDstChunkXSize * dfXRatioDstToSrc);
259✔
5349

5350
            const int nFullResXChunkQueried =
259✔
5351
                nFullResXChunk + 2 * nKernelRadius * nOvrFactor;
259✔
5352

5353
            if (static_cast<GIntBig>(nFullResXChunkQueried) *
259✔
5354
                    nFullResYChunkQueried * nBands * nWrkDataTypeSize >
259✔
5355
                nChunkMaxSize)
259✔
5356
            {
5357
                break;
17✔
5358
            }
5359

5360
            nDstChunkXSize *= 2;
242✔
5361
        }
5362
        nDstChunkXSize = std::min(nDstChunkXSize, nDstWidth);
615✔
5363

5364
        const int nFullResXChunk =
615✔
5365
            2 + static_cast<int>(nDstChunkXSize * dfXRatioDstToSrc);
615✔
5366
        const int nFullResXChunkQueried =
615✔
5367
            nFullResXChunk + 2 * nKernelRadius * nOvrFactor;
615✔
5368

5369
        // Make sure that the RAM requirements to acquire the source data does
5370
        // not exceed nChunkMaxSize
5371
        // If so, reduce the destination chunk size, generate overviews in a
5372
        // temporary dataset, and copy that temporary dataset over the target
5373
        // overview bands (to avoid issues with lossy compression)
5374
        const auto nMemRequirement =
615✔
5375
            static_cast<GIntBig>(nFullResXChunkQueried) *
615✔
5376
            nFullResYChunkQueried * nBands * nWrkDataTypeSize;
615✔
5377
        if (nMemRequirement > nChunkMaxSize &&
615✔
5378
            !(pszDST_CHUNK_X_SIZE && pszDST_CHUNK_Y_SIZE))
10✔
5379
        {
5380
            // Compute a smaller destination chunk size
5381
            const auto nOverShootFactor = nMemRequirement / nChunkMaxSize;
12✔
5382
            const auto nSqrtOverShootFactor = std::max<GIntBig>(
5383
                4, static_cast<GIntBig>(std::ceil(
24✔
5384
                       std::sqrt(static_cast<double>(nOverShootFactor)))));
12✔
5385
            const int nReducedDstChunkXSize = std::max(
5386
                1, static_cast<int>(nDstChunkXSize / nSqrtOverShootFactor));
12✔
5387
            const int nReducedDstChunkYSize = std::max(
5388
                1, static_cast<int>(nDstChunkYSize / nSqrtOverShootFactor));
12✔
5389
            if (nReducedDstChunkXSize < nDstChunkXSize ||
12✔
5390
                nReducedDstChunkYSize < nDstChunkYSize)
×
5391
            {
5392
                CPLStringList aosOptions(papszOptions);
12✔
5393
                aosOptions.SetNameValue(
5394
                    "DST_CHUNK_X_SIZE",
5395
                    CPLSPrintf("%d", nReducedDstChunkXSize));
12✔
5396
                aosOptions.SetNameValue(
5397
                    "DST_CHUNK_Y_SIZE",
5398
                    CPLSPrintf("%d", nReducedDstChunkYSize));
12✔
5399

5400
                const auto nTmpDSMemRequirement =
5401
                    static_cast<GIntBig>(nDstTotalWidth) * nDstTotalHeight *
12✔
5402
                    nBands * GDALGetDataTypeSizeBytes(eDataType);
12✔
5403
                std::unique_ptr<GDALDataset> poTmpDS;
×
5404
                // Config option mostly/only for autotest purposes
5405
                const char *pszGDAL_OVR_TEMP_DRIVER =
5406
                    CPLGetConfigOption("GDAL_OVR_TEMP_DRIVER", "");
12✔
5407
                if ((nTmpDSMemRequirement <= nChunkMaxSize &&
12✔
5408
                     !EQUAL(pszGDAL_OVR_TEMP_DRIVER, "GTIFF")) ||
2✔
5409
                    EQUAL(pszGDAL_OVR_TEMP_DRIVER, "MEM"))
10✔
5410
                {
5411
                    auto poTmpDrv =
5412
                        GetGDALDriverManager()->GetDriverByName("MEM");
11✔
5413
                    if (!poTmpDrv)
11✔
5414
                    {
5415
                        eErr = CE_Failure;
×
5416
                        break;
×
5417
                    }
5418
                    poTmpDS.reset(poTmpDrv->Create("", nDstTotalWidth,
11✔
5419
                                                   nDstTotalHeight, nBands,
5420
                                                   eDataType, nullptr));
11✔
5421
                }
5422
                else
5423
                {
5424
                    auto poTmpDrv =
5425
                        GetGDALDriverManager()->GetDriverByName("GTiff");
1✔
5426
                    if (!poTmpDrv)
1✔
5427
                    {
5428
                        eErr = CE_Failure;
×
5429
                        break;
×
5430
                    }
5431
                    std::string osTmpFilename;
2✔
5432
                    auto poDstDS = papapoOverviewBands[0][0]->GetDataset();
1✔
5433
                    if (poDstDS)
1✔
5434
                    {
5435
                        osTmpFilename = poDstDS->GetDescription();
1✔
5436
                        VSIStatBufL sStatBuf;
5437
                        if (!osTmpFilename.empty() &&
1✔
5438
                            VSIStatL(osTmpFilename.c_str(), &sStatBuf) == 0)
×
5439
                            osTmpFilename += "_tmp_ovr.tif";
×
5440
                    }
5441
                    if (osTmpFilename.empty())
1✔
5442
                    {
5443
                        osTmpFilename = CPLGenerateTempFilenameSafe(nullptr);
1✔
5444
                        osTmpFilename += ".tif";
1✔
5445
                    }
5446
                    CPLDebug("GDAL",
1✔
5447
                             "Creating temporary file %s of %d x %d x %d",
5448
                             osTmpFilename.c_str(), nDstTotalWidth,
5449
                             nDstTotalHeight, nBands);
5450
                    CPLStringList aosCO;
2✔
5451
                    poTmpDS.reset(poTmpDrv->Create(
1✔
5452
                        osTmpFilename.c_str(), nDstTotalWidth, nDstTotalHeight,
5453
                        nBands, eDataType, aosCO.List()));
1✔
5454
                    if (poTmpDS)
1✔
5455
                    {
5456
                        poTmpDS->MarkSuppressOnClose();
1✔
5457
                        VSIUnlink(osTmpFilename.c_str());
1✔
5458
                    }
5459
                }
5460
                if (!poTmpDS)
12✔
5461
                {
5462
                    eErr = CE_Failure;
×
5463
                    break;
×
5464
                }
5465

5466
                std::vector<GDALRasterBand **> apapoOverviewBands(nBands);
12✔
5467
                for (int i = 0; i < nBands; ++i)
27✔
5468
                {
5469
                    apapoOverviewBands[i] = static_cast<GDALRasterBand **>(
30✔
5470
                        CPLMalloc(sizeof(GDALRasterBand *)));
15✔
5471
                    apapoOverviewBands[i][0] = poTmpDS->GetRasterBand(i + 1);
15✔
5472
                }
5473

5474
                const double dfExtraPixels =
5475
                    static_cast<double>(nSrcXSize) / nToplevelSrcWidth *
24✔
5476
                    papapoOverviewBands[0][iOverview]->GetXSize() *
12✔
5477
                    static_cast<double>(nSrcYSize) / nToplevelSrcHeight *
24✔
5478
                    papapoOverviewBands[0][iOverview]->GetYSize();
12✔
5479

5480
                void *pScaledProgressData = GDALCreateScaledProgress(
24✔
5481
                    dfCurPixelCount / dfTotalPixelCount,
5482
                    (dfCurPixelCount + dfExtraPixels) / dfTotalPixelCount,
12✔
5483
                    pfnProgress, pProgressData);
5484

5485
                // Generate overviews in temporary dataset
5486
                eErr = GDALRegenerateOverviewsMultiBand(
12✔
5487
                    nBands, papoSrcBands, 1, apapoOverviewBands.data(),
12✔
5488
                    pszResampling, GDALScaledProgress, pScaledProgressData,
5489
                    aosOptions.List());
12✔
5490

5491
                GDALDestroyScaledProgress(pScaledProgressData);
12✔
5492

5493
                dfCurPixelCount += dfExtraPixels;
12✔
5494

5495
                for (int i = 0; i < nBands; ++i)
27✔
5496
                {
5497
                    CPLFree(apapoOverviewBands[i]);
15✔
5498
                }
5499

5500
                // Copy temporary dataset to destination overview bands
5501

5502
                if (eErr == CE_None)
12✔
5503
                {
5504
                    // Check if all papapoOverviewBands[][iOverview] bands point
5505
                    // to the same dataset. If so, we can use
5506
                    // GDALDatasetCopyWholeRaster()
5507
                    GDALDataset *poDstOvrBandDS =
5508
                        papapoOverviewBands[0][iOverview]->GetDataset();
12✔
5509
                    if (poDstOvrBandDS)
12✔
5510
                    {
5511
                        if (poDstOvrBandDS->GetRasterCount() != nBands ||
15✔
5512
                            poDstOvrBandDS->GetRasterBand(1) !=
3✔
5513
                                papapoOverviewBands[0][iOverview])
3✔
5514
                        {
5515
                            poDstOvrBandDS = nullptr;
9✔
5516
                        }
5517
                        else
5518
                        {
5519
                            for (int i = 1; poDstOvrBandDS && i < nBands; ++i)
6✔
5520
                            {
5521
                                GDALDataset *poThisDstOvrBandDS =
5522
                                    papapoOverviewBands[i][iOverview]
3✔
5523
                                        ->GetDataset();
3✔
5524
                                if (poThisDstOvrBandDS == nullptr ||
3✔
5525
                                    poThisDstOvrBandDS != poDstOvrBandDS ||
6✔
5526
                                    poThisDstOvrBandDS->GetRasterBand(i + 1) !=
3✔
5527
                                        papapoOverviewBands[i][iOverview])
3✔
5528
                                {
5529
                                    poDstOvrBandDS = nullptr;
×
5530
                                }
5531
                            }
5532
                        }
5533
                    }
5534
                    if (poDstOvrBandDS)
12✔
5535
                    {
5536
                        eErr = GDALDatasetCopyWholeRaster(
3✔
5537
                            GDALDataset::ToHandle(poTmpDS.get()),
5538
                            GDALDataset::ToHandle(poDstOvrBandDS), nullptr,
5539
                            nullptr, nullptr);
5540
                    }
5541
                    else
5542
                    {
5543
                        for (int i = 0; eErr == CE_None && i < nBands; ++i)
18✔
5544
                        {
5545
                            eErr = GDALRasterBandCopyWholeRaster(
9✔
5546
                                GDALRasterBand::ToHandle(
5547
                                    poTmpDS->GetRasterBand(i + 1)),
5548
                                GDALRasterBand::ToHandle(
5549
                                    papapoOverviewBands[i][iOverview]),
9✔
5550
                                nullptr, nullptr, nullptr);
5551
                        }
5552
                    }
5553
                }
5554

5555
                if (eErr != CE_None)
12✔
5556
                    break;
×
5557

5558
                continue;
12✔
5559
            }
5560
        }
5561

5562
        // Structure describing a resampling job
5563
        struct OvrJob
5564
        {
5565
            // Buffers to free when job is finished
5566
            std::unique_ptr<PointerHolder> oSrcMaskBufferHolder{};
5567
            std::unique_ptr<PointerHolder> oSrcBufferHolder{};
5568
            std::unique_ptr<PointerHolder> oDstBufferHolder{};
5569

5570
            GDALRasterBand *poDstBand = nullptr;
5571

5572
            // Input parameters of pfnResampleFn
5573
            GDALResampleFunction pfnResampleFn = nullptr;
5574
            GDALOverviewResampleArgs args{};
5575
            const void *pChunk = nullptr;
5576

5577
            // Output values of resampling function
5578
            CPLErr eErr = CE_Failure;
5579
            void *pDstBuffer = nullptr;
5580
            GDALDataType eDstBufferDataType = GDT_Unknown;
5581

5582
            // Synchronization
5583
            bool bFinished = false;
5584
            std::mutex mutex{};
5585
            std::condition_variable cv{};
5586
        };
5587

5588
        // Thread function to resample
5589
        const auto JobResampleFunc = [](void *pData)
16,301✔
5590
        {
5591
            OvrJob *poJob = static_cast<OvrJob *>(pData);
16,301✔
5592

5593
            poJob->eErr = poJob->pfnResampleFn(poJob->args, poJob->pChunk,
16,301✔
5594
                                               &(poJob->pDstBuffer),
5595
                                               &(poJob->eDstBufferDataType));
5596

5597
            poJob->oDstBufferHolder.reset(new PointerHolder(poJob->pDstBuffer));
16,300✔
5598

5599
            {
5600
                std::lock_guard<std::mutex> guard(poJob->mutex);
32,601✔
5601
                poJob->bFinished = true;
16,301✔
5602
                poJob->cv.notify_one();
16,301✔
5603
            }
5604
        };
16,301✔
5605

5606
        // Function to write resample data to target band
5607
        const auto WriteJobData = [](const OvrJob *poJob)
16,301✔
5608
        {
5609
            return poJob->poDstBand->RasterIO(
32,602✔
5610
                GF_Write, poJob->args.nDstXOff, poJob->args.nDstYOff,
16,301✔
5611
                poJob->args.nDstXOff2 - poJob->args.nDstXOff,
16,301✔
5612
                poJob->args.nDstYOff2 - poJob->args.nDstYOff, poJob->pDstBuffer,
16,301✔
5613
                poJob->args.nDstXOff2 - poJob->args.nDstXOff,
16,301✔
5614
                poJob->args.nDstYOff2 - poJob->args.nDstYOff,
16,301✔
5615
                poJob->eDstBufferDataType, 0, 0, nullptr);
16,301✔
5616
        };
5617

5618
        // Wait for completion of oldest job and serialize it
5619
        const auto WaitAndFinalizeOldestJob =
5620
            [WriteJobData](std::list<std::unique_ptr<OvrJob>> &jobList)
50✔
5621
        {
5622
            auto poOldestJob = jobList.front().get();
50✔
5623
            {
5624
                std::unique_lock<std::mutex> oGuard(poOldestJob->mutex);
100✔
5625
                // coverity[missing_lock:FALSE]
5626
                while (!poOldestJob->bFinished)
71✔
5627
                {
5628
                    poOldestJob->cv.wait(oGuard);
21✔
5629
                }
5630
            }
5631
            CPLErr l_eErr = poOldestJob->eErr;
50✔
5632
            if (l_eErr == CE_None)
50✔
5633
            {
5634
                l_eErr = WriteJobData(poOldestJob);
50✔
5635
            }
5636

5637
            jobList.pop_front();
50✔
5638
            return l_eErr;
50✔
5639
        };
5640

5641
        // Queue of jobs
5642
        std::list<std::unique_ptr<OvrJob>> jobList;
1,206✔
5643

5644
        std::vector<void *> apaChunk(nBands);
1,206✔
5645
        std::vector<GByte *> apabyChunkNoDataMask(nBands);
1,206✔
5646

5647
        // Iterate on destination overview, block by block.
5648
        for (int nDstYOff = nDstYOffStart;
603✔
5649
             nDstYOff < nDstYOffEnd && eErr == CE_None;
2,277✔
5650
             nDstYOff += nDstChunkYSize)
1,674✔
5651
        {
5652
            int nDstYCount;
5653
            if (nDstYOff + nDstChunkYSize <= nDstYOffEnd)
1,674✔
5654
                nDstYCount = nDstChunkYSize;
1,256✔
5655
            else
5656
                nDstYCount = nDstYOffEnd - nDstYOff;
418✔
5657

5658
            int nChunkYOff = static_cast<int>(nDstYOff * dfYRatioDstToSrc);
1,674✔
5659
            int nChunkYOff2 = static_cast<int>(
1,674✔
5660
                ceil((nDstYOff + nDstYCount) * dfYRatioDstToSrc));
1,674✔
5661
            if (nChunkYOff2 > nSrcHeight ||
1,674✔
5662
                nDstYOff + nDstYCount == nDstTotalHeight)
1,674✔
5663
                nChunkYOff2 = nSrcHeight;
600✔
5664
            int nYCount = nChunkYOff2 - nChunkYOff;
1,674✔
5665
            CPLAssert(nYCount <= nFullResYChunk);
1,674✔
5666

5667
            int nChunkYOffQueried = nChunkYOff - nKernelRadius * nOvrFactor;
1,674✔
5668
            int nChunkYSizeQueried = nYCount + 2 * nKernelRadius * nOvrFactor;
1,674✔
5669
            if (nChunkYOffQueried < 0)
1,674✔
5670
            {
5671
                nChunkYSizeQueried += nChunkYOffQueried;
140✔
5672
                nChunkYOffQueried = 0;
140✔
5673
            }
5674
            if (nChunkYSizeQueried + nChunkYOffQueried > nSrcHeight)
1,674✔
5675
                nChunkYSizeQueried = nSrcHeight - nChunkYOffQueried;
139✔
5676
            CPLAssert(nChunkYSizeQueried <= nFullResYChunkQueried);
1,674✔
5677

5678
            if (!pfnProgress(dfCurPixelCount / dfTotalPixelCount, nullptr,
1,674✔
5679
                             pProgressData))
5680
            {
5681
                CPLError(CE_Failure, CPLE_UserInterrupt, "User terminated");
1✔
5682
                eErr = CE_Failure;
1✔
5683
            }
5684

5685
            // Iterate on destination overview, block by block.
5686
            for (int nDstXOff = nDstXOffStart;
1,674✔
5687
                 nDstXOff < nDstXOffEnd && eErr == CE_None;
10,123✔
5688
                 nDstXOff += nDstChunkXSize)
8,449✔
5689
            {
5690
                int nDstXCount = 0;
8,449✔
5691
                if (nDstXOff + nDstChunkXSize <= nDstXOffEnd)
8,449✔
5692
                    nDstXCount = nDstChunkXSize;
8,253✔
5693
                else
5694
                    nDstXCount = nDstXOffEnd - nDstXOff;
196✔
5695

5696
                dfCurPixelCount += static_cast<double>(nDstXCount) * nDstYCount;
8,449✔
5697

5698
                int nChunkXOff = static_cast<int>(nDstXOff * dfXRatioDstToSrc);
8,449✔
5699
                int nChunkXOff2 = static_cast<int>(
8,449✔
5700
                    ceil((nDstXOff + nDstXCount) * dfXRatioDstToSrc));
8,449✔
5701
                if (nChunkXOff2 > nSrcWidth ||
8,449✔
5702
                    nDstXOff + nDstXCount == nDstTotalWidth)
8,449✔
5703
                    nChunkXOff2 = nSrcWidth;
1,672✔
5704
                const int nXCount = nChunkXOff2 - nChunkXOff;
8,449✔
5705
                CPLAssert(nXCount <= nFullResXChunk);
8,449✔
5706

5707
                int nChunkXOffQueried = nChunkXOff - nKernelRadius * nOvrFactor;
8,449✔
5708
                int nChunkXSizeQueried =
8,449✔
5709
                    nXCount + 2 * nKernelRadius * nOvrFactor;
8,449✔
5710
                if (nChunkXOffQueried < 0)
8,449✔
5711
                {
5712
                    nChunkXSizeQueried += nChunkXOffQueried;
200✔
5713
                    nChunkXOffQueried = 0;
200✔
5714
                }
5715
                if (nChunkXSizeQueried + nChunkXOffQueried > nSrcWidth)
8,449✔
5716
                    nChunkXSizeQueried = nSrcWidth - nChunkXOffQueried;
203✔
5717
                CPLAssert(nChunkXSizeQueried <= nFullResXChunkQueried);
8,449✔
5718
#if DEBUG_VERBOSE
5719
                CPLDebug("GDAL",
5720
                         "Reading (%dx%d -> %dx%d) for output (%dx%d -> %dx%d)",
5721
                         nChunkXOffQueried, nChunkYOffQueried,
5722
                         nChunkXSizeQueried, nChunkYSizeQueried, nDstXOff,
5723
                         nDstYOff, nDstXCount, nDstYCount);
5724
#endif
5725

5726
                // Avoid accumulating too many tasks and exhaust RAM
5727

5728
                // Try to complete already finished jobs
5729
                while (eErr == CE_None && !jobList.empty())
16,511✔
5730
                {
5731
                    auto poOldestJob = jobList.front().get();
8,416✔
5732
                    {
5733
                        std::lock_guard<std::mutex> oGuard(poOldestJob->mutex);
8,416✔
5734
                        if (!poOldestJob->bFinished)
8,416✔
5735
                        {
5736
                            break;
354✔
5737
                        }
5738
                    }
5739
                    eErr = poOldestJob->eErr;
8,062✔
5740
                    if (eErr == CE_None)
8,062✔
5741
                    {
5742
                        eErr = WriteJobData(poOldestJob);
8,062✔
5743
                    }
5744

5745
                    jobList.pop_front();
8,062✔
5746
                }
5747

5748
                // And in case we have saturated the number of threads,
5749
                // wait for completion of tasks to go below the threshold.
5750
                while (eErr == CE_None &&
16,968✔
5751
                       jobList.size() >= static_cast<size_t>(nThreads))
8,484✔
5752
                {
5753
                    eErr = WaitAndFinalizeOldestJob(jobList);
35✔
5754
                }
5755

5756
                // (Re)allocate buffers if needed
5757
                for (int iBand = 0; iBand < nBands; ++iBand)
24,751✔
5758
                {
5759
                    if (apaChunk[iBand] == nullptr)
16,302✔
5760
                    {
5761
                        apaChunk[iBand] = VSI_MALLOC3_VERBOSE(
9,292✔
5762
                            nFullResXChunkQueried, nFullResYChunkQueried,
5763
                            nWrkDataTypeSize);
5764
                        if (apaChunk[iBand] == nullptr)
9,292✔
5765
                        {
5766
                            eErr = CE_Failure;
×
5767
                        }
5768
                    }
5769
                    if (bUseNoDataMask &&
24,739✔
5770
                        apabyChunkNoDataMask[iBand] == nullptr)
8,437✔
5771
                    {
5772
                        apabyChunkNoDataMask[iBand] =
16,756✔
5773
                            static_cast<GByte *>(VSI_MALLOC2_VERBOSE(
8,378✔
5774
                                nFullResXChunkQueried, nFullResYChunkQueried));
5775
                        if (apabyChunkNoDataMask[iBand] == nullptr)
8,378✔
5776
                        {
5777
                            eErr = CE_Failure;
×
5778
                        }
5779
                    }
5780
                }
5781

5782
                // Read the source buffers for all the bands.
5783
                for (int iBand = 0; iBand < nBands && eErr == CE_None; ++iBand)
24,751✔
5784
                {
5785
                    GDALRasterBand *poSrcBand = nullptr;
16,302✔
5786
                    if (iSrcOverview == -1)
16,302✔
5787
                        poSrcBand = papoSrcBands[iBand];
15,390✔
5788
                    else
5789
                        poSrcBand = papapoOverviewBands[iBand][iSrcOverview];
912✔
5790
                    eErr = poSrcBand->RasterIO(
16,302✔
5791
                        GF_Read, nChunkXOffQueried, nChunkYOffQueried,
5792
                        nChunkXSizeQueried, nChunkYSizeQueried, apaChunk[iBand],
16,302✔
5793
                        nChunkXSizeQueried, nChunkYSizeQueried, eWrkDataType, 0,
5794
                        0, nullptr);
5795

5796
                    if (bUseNoDataMask && eErr == CE_None)
16,302✔
5797
                    {
5798
                        auto poMaskBand = poSrcBand->IsMaskBand()
8,437✔
5799
                                              ? poSrcBand
8,437✔
5800
                                              : poSrcBand->GetMaskBand();
6,334✔
5801
                        eErr = poMaskBand->RasterIO(
8,437✔
5802
                            GF_Read, nChunkXOffQueried, nChunkYOffQueried,
5803
                            nChunkXSizeQueried, nChunkYSizeQueried,
5804
                            apabyChunkNoDataMask[iBand], nChunkXSizeQueried,
8,437✔
5805
                            nChunkYSizeQueried, GDT_Byte, 0, 0, nullptr);
5806
                    }
5807
                }
5808

5809
                // Compute the resulting overview block.
5810
                for (int iBand = 0; iBand < nBands && eErr == CE_None; ++iBand)
24,750✔
5811
                {
5812
                    auto poJob = std::make_unique<OvrJob>();
32,602✔
5813
                    poJob->pfnResampleFn = pfnResampleFn;
16,301✔
5814
                    poJob->poDstBand = papapoOverviewBands[iBand][iOverview];
16,301✔
5815
                    poJob->args.eOvrDataType =
32,602✔
5816
                        poJob->poDstBand->GetRasterDataType();
16,301✔
5817
                    poJob->args.nOvrXSize = poJob->poDstBand->GetXSize();
16,301✔
5818
                    poJob->args.nOvrYSize = poJob->poDstBand->GetYSize();
16,301✔
5819
                    const char *pszNBITS = poJob->poDstBand->GetMetadataItem(
16,301✔
5820
                        "NBITS", "IMAGE_STRUCTURE");
16,301✔
5821
                    poJob->args.nOvrNBITS = pszNBITS ? atoi(pszNBITS) : 0;
16,301✔
5822
                    poJob->args.dfXRatioDstToSrc = dfXRatioDstToSrc;
16,301✔
5823
                    poJob->args.dfYRatioDstToSrc = dfYRatioDstToSrc;
16,301✔
5824
                    poJob->args.eWrkDataType = eWrkDataType;
16,301✔
5825
                    poJob->pChunk = apaChunk[iBand];
16,301✔
5826
                    poJob->args.pabyChunkNodataMask =
16,301✔
5827
                        apabyChunkNoDataMask[iBand];
16,301✔
5828
                    poJob->args.nChunkXOff = nChunkXOffQueried;
16,301✔
5829
                    poJob->args.nChunkXSize = nChunkXSizeQueried;
16,301✔
5830
                    poJob->args.nChunkYOff = nChunkYOffQueried;
16,301✔
5831
                    poJob->args.nChunkYSize = nChunkYSizeQueried;
16,301✔
5832
                    poJob->args.nDstXOff = nDstXOff;
16,301✔
5833
                    poJob->args.nDstXOff2 = nDstXOff + nDstXCount;
16,301✔
5834
                    poJob->args.nDstYOff = nDstYOff;
16,301✔
5835
                    poJob->args.nDstYOff2 = nDstYOff + nDstYCount;
16,301✔
5836
                    poJob->args.pszResampling = pszResampling;
16,301✔
5837
                    poJob->args.bHasNoData = pabHasNoData[iBand];
16,301✔
5838
                    poJob->args.dfNoDataValue = padfNoDataValue[iBand];
16,301✔
5839
                    poJob->args.eSrcDataType = eDataType;
16,301✔
5840
                    poJob->args.bPropagateNoData = bPropagateNoData;
16,301✔
5841

5842
                    if (poJobQueue)
16,301✔
5843
                    {
5844
                        poJob->oSrcMaskBufferHolder.reset(
16,224✔
5845
                            new PointerHolder(apabyChunkNoDataMask[iBand]));
8,112✔
5846
                        apabyChunkNoDataMask[iBand] = nullptr;
8,112✔
5847

5848
                        poJob->oSrcBufferHolder.reset(
16,224✔
5849
                            new PointerHolder(apaChunk[iBand]));
8,112✔
5850
                        apaChunk[iBand] = nullptr;
8,112✔
5851

5852
                        poJobQueue->SubmitJob(JobResampleFunc, poJob.get());
8,112✔
5853
                        jobList.emplace_back(std::move(poJob));
8,112✔
5854
                    }
5855
                    else
5856
                    {
5857
                        JobResampleFunc(poJob.get());
8,189✔
5858
                        eErr = poJob->eErr;
8,189✔
5859
                        if (eErr == CE_None)
8,189✔
5860
                        {
5861
                            eErr = WriteJobData(poJob.get());
8,189✔
5862
                        }
5863
                    }
5864
                }
5865
            }
5866
        }
5867

5868
        // Wait for all pending jobs to complete
5869
        while (!jobList.empty())
618✔
5870
        {
5871
            const auto l_eErr = WaitAndFinalizeOldestJob(jobList);
15✔
5872
            if (l_eErr != CE_None && eErr == CE_None)
15✔
5873
                eErr = l_eErr;
×
5874
        }
5875

5876
        // Flush the data to overviews.
5877
        for (int iBand = 0; iBand < nBands; ++iBand)
1,797✔
5878
        {
5879
            CPLFree(apaChunk[iBand]);
1,194✔
5880
            papapoOverviewBands[iBand][iOverview]->FlushCache(false);
1,194✔
5881

5882
            CPLFree(apabyChunkNoDataMask[iBand]);
1,194✔
5883
        }
5884
    }
5885

5886
    CPLFree(pabHasNoData);
372✔
5887
    CPLFree(padfNoDataValue);
372✔
5888

5889
    if (eErr == CE_None)
372✔
5890
        pfnProgress(1.0, nullptr, pProgressData);
370✔
5891

5892
    return eErr;
372✔
5893
}
5894

5895
/************************************************************************/
5896
/*            GDALRegenerateOverviewsMultiBand()                        */
5897
/************************************************************************/
5898

5899
/**
5900
 * \brief Variant of GDALRegenerateOverviews, specially dedicated for generating
5901
 * compressed pixel-interleaved overviews (JPEG-IN-TIFF for example)
5902
 *
5903
 * This function will generate one or more overview images from a base
5904
 * image using the requested downsampling algorithm.  Its primary use
5905
 * is for generating overviews via GDALDataset::BuildOverviews(), but it
5906
 * can also be used to generate downsampled images in one file from another
5907
 * outside the overview architecture.
5908
 *
5909
 * The output bands need to exist in advance and share the same characteristics
5910
 * (type, dimensions)
5911
 *
5912
 * The resampling algorithms supported for the moment are "NEAREST", "AVERAGE",
5913
 * "RMS", "GAUSS", "CUBIC", "CUBICSPLINE", "LANCZOS" and "BILINEAR"
5914
 *
5915
 * It does not support color tables or complex data types.
5916
 *
5917
 * The pseudo-algorithm used by the function is :
5918
 *    for each overview
5919
 *       iterate on lines of the source by a step of deltay
5920
 *           iterate on columns of the source  by a step of deltax
5921
 *               read the source data of size deltax * deltay for all the bands
5922
 *               generate the corresponding overview block for all the bands
5923
 *
5924
 * This function will honour properly NODATA_VALUES tuples (special dataset
5925
 * metadata) so that only a given RGB triplet (in case of a RGB image) will be
5926
 * considered as the nodata value and not each value of the triplet
5927
 * independently per band.
5928
 *
5929
 * The GDAL_NUM_THREADS configuration option can be set
5930
 * to "ALL_CPUS" or an integer value to specify the number of threads to use for
5931
 * overview computation.
5932
 *
5933
 * @param apoSrcBands the list of source bands to downsample
5934
 * @param aapoOverviewBands bidimensional array of bands. First dimension is
5935
 *                          indexed by bands. Second dimension is indexed by
5936
 *                          overview levels. All aapoOverviewBands[i] arrays
5937
 *                          must have the same size (i.e. same number of
5938
 *                          overviews)
5939
 * @param pszResampling Resampling algorithm ("NEAREST", "AVERAGE", "RMS",
5940
 * "GAUSS", "CUBIC", "CUBICSPLINE", "LANCZOS" or "BILINEAR").
5941
 * @param pfnProgress progress report function.
5942
 * @param pProgressData progress function callback data.
5943
 * @param papszOptions NULL terminated list of options as
5944
 *                     key=value pairs, or NULL
5945
 *                     The XOFF, YOFF, XSIZE and YSIZE
5946
 *                     options can be specified to express that overviews should
5947
 *                     be regenerated only in the specified subset of the source
5948
 *                     dataset.
5949
 * @return CE_None on success or CE_Failure on failure.
5950
 * @since 3.10
5951
 */
5952

5953
CPLErr GDALRegenerateOverviewsMultiBand(
    const std::vector<GDALRasterBand *> &apoSrcBands,
    const std::vector<std::vector<GDALRasterBand *>> &aapoOverviewBands,
    const char *pszResampling, GDALProgressFunc pfnProgress,
    void *pProgressData, CSLConstList papszOptions)
{
    // Debug builds keep flagging contract violations loudly.
    CPLAssert(apoSrcBands.size() == aapoOverviewBands.size());
    for (size_t i = 1; i < aapoOverviewBands.size(); ++i)
    {
        CPLAssert(aapoOverviewBands[i].size() == aapoOverviewBands[0].size());
    }

    // Nothing to regenerate.
    if (aapoOverviewBands.empty())
        return CE_None;

    // CPLAssert() is compiled out of release builds: enforce the same
    // preconditions at runtime, since the pointer-based implementation
    // indexes [iBand][iOverview] without any bound checking.
    if (apoSrcBands.size() != aapoOverviewBands.size())
    {
        CPLError(CE_Failure, CPLE_AppDefined,
                 "GDALRegenerateOverviewsMultiBand(): the number of source "
                 "bands and the first dimension of the overview band array "
                 "must be identical");
        return CE_Failure;
    }
    const size_t nOverviews = aapoOverviewBands[0].size();
    for (const auto &apoOverviewBands : aapoOverviewBands)
    {
        if (apoOverviewBands.size() != nOverviews)
        {
            CPLError(CE_Failure, CPLE_AppDefined,
                     "GDALRegenerateOverviewsMultiBand(): all bands must "
                     "have the same number of overviews");
            return CE_Failure;
        }
    }

    // The pointer-based overload is handed non-const rows
    // (GDALRasterBand **), so work on private copies of the rows rather
    // than on the caller's const vectors. Vectors own the storage, so nothing
    // leaks whatever the exit path is.
    std::vector<std::vector<GDALRasterBand *>> aapoOverviewBandsCopy(
        aapoOverviewBands);
    std::vector<GDALRasterBand **> apapoOverviewBands;
    apapoOverviewBands.reserve(aapoOverviewBandsCopy.size());
    for (auto &apoOverviewBands : aapoOverviewBandsCopy)
    {
        apapoOverviewBands.push_back(apoOverviewBands.data());
    }

    return GDALRegenerateOverviewsMultiBand(
        static_cast<int>(apoSrcBands.size()), apoSrcBands.data(),
        static_cast<int>(nOverviews), apapoOverviewBands.data(),
        pszResampling, pfnProgress, pProgressData, papszOptions);
}
5988

5989
/************************************************************************/
5990
/*                        GDALComputeBandStats()                        */
5991
/************************************************************************/
5992

5993
/** Undocumented
5994
 * @param hSrcBand undocumented.
5995
 * @param nSampleStep Step between scanlines used to compute statistics.
5996
 *                    When nSampleStep is equal to 1, all scanlines will
5997
 *                    be processed.
5998
 * @param pdfMean undocumented.
5999
 * @param pdfStdDev undocumented.
6000
 * @param pfnProgress undocumented.
6001
 * @param pProgressData undocumented.
6002
 * @return undocumented
6003
 */
6004
CPLErr CPL_STDCALL GDALComputeBandStats(GDALRasterBandH hSrcBand,
16✔
6005
                                        int nSampleStep, double *pdfMean,
6006
                                        double *pdfStdDev,
6007
                                        GDALProgressFunc pfnProgress,
6008
                                        void *pProgressData)
6009

6010
{
6011
    VALIDATE_POINTER1(hSrcBand, "GDALComputeBandStats", CE_Failure);
16✔
6012

6013
    GDALRasterBand *poSrcBand = GDALRasterBand::FromHandle(hSrcBand);
16✔
6014

6015
    if (pfnProgress == nullptr)
16✔
6016
        pfnProgress = GDALDummyProgress;
16✔
6017

6018
    const int nWidth = poSrcBand->GetXSize();
16✔
6019
    const int nHeight = poSrcBand->GetYSize();
16✔
6020

6021
    if (nSampleStep >= nHeight || nSampleStep < 1)
16✔
6022
        nSampleStep = 1;
3✔
6023

6024
    GDALDataType eWrkType = GDT_Unknown;
16✔
6025
    float *pafData = nullptr;
16✔
6026
    GDALDataType eType = poSrcBand->GetRasterDataType();
16✔
6027
    const bool bComplex = CPL_TO_BOOL(GDALDataTypeIsComplex(eType));
16✔
6028
    if (bComplex)
16✔
6029
    {
6030
        pafData = static_cast<float *>(
6031
            VSI_MALLOC_VERBOSE(nWidth * 2 * sizeof(float)));
×
6032
        eWrkType = GDT_CFloat32;
×
6033
    }
6034
    else
6035
    {
6036
        pafData =
6037
            static_cast<float *>(VSI_MALLOC_VERBOSE(nWidth * sizeof(float)));
16✔
6038
        eWrkType = GDT_Float32;
16✔
6039
    }
6040

6041
    if (nWidth == 0 || pafData == nullptr)
16✔
6042
    {
6043
        VSIFree(pafData);
×
6044
        return CE_Failure;
×
6045
    }
6046

6047
    /* -------------------------------------------------------------------- */
6048
    /*      Loop over all sample lines.                                     */
6049
    /* -------------------------------------------------------------------- */
6050
    double dfSum = 0.0;
16✔
6051
    double dfSum2 = 0.0;
16✔
6052
    int iLine = 0;
16✔
6053
    GIntBig nSamples = 0;
16✔
6054

6055
    do
2,143✔
6056
    {
6057
        if (!pfnProgress(iLine / static_cast<double>(nHeight), nullptr,
2,159✔
6058
                         pProgressData))
6059
        {
6060
            CPLError(CE_Failure, CPLE_UserInterrupt, "User terminated");
×
6061
            CPLFree(pafData);
×
6062
            return CE_Failure;
×
6063
        }
6064

6065
        const CPLErr eErr =
6066
            poSrcBand->RasterIO(GF_Read, 0, iLine, nWidth, 1, pafData, nWidth,
2,159✔
6067
                                1, eWrkType, 0, 0, nullptr);
6068
        if (eErr != CE_None)
2,159✔
6069
        {
6070
            CPLFree(pafData);
1✔
6071
            return eErr;
1✔
6072
        }
6073

6074
        for (int iPixel = 0; iPixel < nWidth; ++iPixel)
725,204✔
6075
        {
6076
            float fValue = 0.0f;
723,046✔
6077

6078
            if (bComplex)
723,046✔
6079
            {
6080
                // Compute the magnitude of the complex value.
6081
                fValue =
6082
                    std::hypot(pafData[iPixel * 2], pafData[iPixel * 2 + 1]);
×
6083
            }
6084
            else
6085
            {
6086
                fValue = pafData[iPixel];
723,046✔
6087
            }
6088

6089
            dfSum += fValue;
723,046✔
6090
            dfSum2 += static_cast<double>(fValue) * fValue;
723,046✔
6091
        }
6092

6093
        nSamples += nWidth;
2,158✔
6094
        iLine += nSampleStep;
2,158✔
6095
    } while (iLine < nHeight);
2,158✔
6096

6097
    if (!pfnProgress(1.0, nullptr, pProgressData))
15✔
6098
    {
6099
        CPLError(CE_Failure, CPLE_UserInterrupt, "User terminated");
×
6100
        CPLFree(pafData);
×
6101
        return CE_Failure;
×
6102
    }
6103

6104
    /* -------------------------------------------------------------------- */
6105
    /*      Produce the result values.                                      */
6106
    /* -------------------------------------------------------------------- */
6107
    if (pdfMean != nullptr)
15✔
6108
        *pdfMean = dfSum / nSamples;
15✔
6109

6110
    if (pdfStdDev != nullptr)
15✔
6111
    {
6112
        const double dfMean = dfSum / nSamples;
15✔
6113

6114
        *pdfStdDev = sqrt((dfSum2 / nSamples) - (dfMean * dfMean));
15✔
6115
    }
6116

6117
    CPLFree(pafData);
15✔
6118

6119
    return CE_None;
15✔
6120
}
6121

6122
/************************************************************************/
6123
/*                  GDALOverviewMagnitudeCorrection()                   */
6124
/*                                                                      */
6125
/*      Correct the mean and standard deviation of the overviews of     */
6126
/*      the given band to match the base layer approximately.           */
6127
/************************************************************************/
6128

6129
/** Undocumented
6130
 * @param hBaseBand undocumented.
6131
 * @param nOverviewCount undocumented.
6132
 * @param pahOverviews undocumented.
6133
 * @param pfnProgress undocumented.
6134
 * @param pProgressData undocumented.
6135
 * @return undocumented
6136
 */
6137
CPLErr GDALOverviewMagnitudeCorrection(GDALRasterBandH hBaseBand,
×
6138
                                       int nOverviewCount,
6139
                                       GDALRasterBandH *pahOverviews,
6140
                                       GDALProgressFunc pfnProgress,
6141
                                       void *pProgressData)
6142

6143
{
6144
    VALIDATE_POINTER1(hBaseBand, "GDALOverviewMagnitudeCorrection", CE_Failure);
×
6145

6146
    /* -------------------------------------------------------------------- */
6147
    /*      Compute mean/stddev for source raster.                          */
6148
    /* -------------------------------------------------------------------- */
6149
    double dfOrigMean = 0.0;
×
6150
    double dfOrigStdDev = 0.0;
×
6151
    {
6152
        const CPLErr eErr =
6153
            GDALComputeBandStats(hBaseBand, 2, &dfOrigMean, &dfOrigStdDev,
×
6154
                                 pfnProgress, pProgressData);
6155

6156
        if (eErr != CE_None)
×
6157
            return eErr;
×
6158
    }
6159

6160
    /* -------------------------------------------------------------------- */
6161
    /*      Loop on overview bands.                                         */
6162
    /* -------------------------------------------------------------------- */
6163
    for (int iOverview = 0; iOverview < nOverviewCount; ++iOverview)
×
6164
    {
6165
        GDALRasterBand *poOverview =
6166
            GDALRasterBand::FromHandle(pahOverviews[iOverview]);
×
6167
        double dfOverviewMean, dfOverviewStdDev;
6168

6169
        const CPLErr eErr =
6170
            GDALComputeBandStats(pahOverviews[iOverview], 1, &dfOverviewMean,
×
6171
                                 &dfOverviewStdDev, pfnProgress, pProgressData);
6172

6173
        if (eErr != CE_None)
×
6174
            return eErr;
×
6175

6176
        double dfGain = 1.0;
×
6177
        if (dfOrigStdDev >= 0.0001)
×
6178
            dfGain = dfOrigStdDev / dfOverviewStdDev;
×
6179

6180
        /* --------------------------------------------------------------------
6181
         */
6182
        /*      Apply gain and offset. */
6183
        /* --------------------------------------------------------------------
6184
         */
6185
        const int nWidth = poOverview->GetXSize();
×
6186
        const int nHeight = poOverview->GetYSize();
×
6187

6188
        GDALDataType eWrkType = GDT_Unknown;
×
6189
        float *pafData = nullptr;
×
6190
        const GDALDataType eType = poOverview->GetRasterDataType();
×
6191
        const bool bComplex = CPL_TO_BOOL(GDALDataTypeIsComplex(eType));
×
6192
        if (bComplex)
×
6193
        {
6194
            pafData = static_cast<float *>(
6195
                VSI_MALLOC2_VERBOSE(nWidth, 2 * sizeof(float)));
×
6196
            eWrkType = GDT_CFloat32;
×
6197
        }
6198
        else
6199
        {
6200
            pafData = static_cast<float *>(
6201
                VSI_MALLOC2_VERBOSE(nWidth, sizeof(float)));
×
6202
            eWrkType = GDT_Float32;
×
6203
        }
6204

6205
        if (pafData == nullptr)
×
6206
        {
6207
            return CE_Failure;
×
6208
        }
6209

6210
        for (int iLine = 0; iLine < nHeight; ++iLine)
×
6211
        {
6212
            if (!pfnProgress(iLine / static_cast<double>(nHeight), nullptr,
×
6213
                             pProgressData))
6214
            {
6215
                CPLError(CE_Failure, CPLE_UserInterrupt, "User terminated");
×
6216
                CPLFree(pafData);
×
6217
                return CE_Failure;
×
6218
            }
6219

6220
            if (poOverview->RasterIO(GF_Read, 0, iLine, nWidth, 1, pafData,
×
6221
                                     nWidth, 1, eWrkType, 0, 0,
6222
                                     nullptr) != CE_None)
×
6223
            {
6224
                CPLFree(pafData);
×
6225
                return CE_Failure;
×
6226
            }
6227

6228
            for (int iPixel = 0; iPixel < nWidth; ++iPixel)
×
6229
            {
6230
                if (bComplex)
×
6231
                {
6232
                    pafData[iPixel * 2] *= static_cast<float>(dfGain);
×
6233
                    pafData[iPixel * 2 + 1] *= static_cast<float>(dfGain);
×
6234
                }
6235
                else
6236
                {
6237
                    pafData[iPixel] = static_cast<float>(
×
6238
                        (pafData[iPixel] - dfOverviewMean) * dfGain +
×
6239
                        dfOrigMean);
6240
                }
6241
            }
6242

6243
            if (poOverview->RasterIO(GF_Write, 0, iLine, nWidth, 1, pafData,
×
6244
                                     nWidth, 1, eWrkType, 0, 0,
6245
                                     nullptr) != CE_None)
×
6246
            {
6247
                CPLFree(pafData);
×
6248
                return CE_Failure;
×
6249
            }
6250
        }
6251

6252
        if (!pfnProgress(1.0, nullptr, pProgressData))
×
6253
        {
6254
            CPLError(CE_Failure, CPLE_UserInterrupt, "User terminated");
×
6255
            CPLFree(pafData);
×
6256
            return CE_Failure;
×
6257
        }
6258

6259
        CPLFree(pafData);
×
6260
    }
6261

6262
    return CE_None;
×
6263
}
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2025 Coveralls, Inc