• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

OSGeo / gdal / 15899162844

26 Jun 2025 10:14AM UTC coverage: 71.088% (+0.004%) from 71.084%
15899162844

Pull #12623

github

web-flow
Merge c704a8392 into f5cb024d4
Pull Request #12623: gdal raster overview add: add a --overview-src option

209 of 244 new or added lines in 5 files covered. (85.66%)

96 existing lines in 44 files now uncovered.

574014 of 807474 relevant lines covered (71.09%)

250815.03 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

91.87
/gcore/rasterio.cpp
1
/******************************************************************************
2
 *
3
 * Project:  GDAL Core
4
 * Purpose:  Contains default implementation of GDALRasterBand::IRasterIO()
5
 *           and supporting functions of broader utility.
6
 * Author:   Frank Warmerdam, warmerdam@pobox.com
7
 *
8
 ******************************************************************************
9
 * Copyright (c) 1998, Frank Warmerdam
10
 * Copyright (c) 2007-2014, Even Rouault <even dot rouault at spatialys.com>
11
 *
12
 * SPDX-License-Identifier: MIT
13
 ****************************************************************************/
14

15
#include "cpl_port.h"
16
#include "gdal.h"
17
#include "gdal_priv.h"
18

19
#include <cassert>
20
#include <climits>
21
#include <cmath>
22
#include <cstddef>
23
#include <cstdio>
24
#include <cstdlib>
25
#include <cstring>
26

27
#include <algorithm>
28
#include <limits>
29
#include <stdexcept>
30
#include <type_traits>
31

32
#include "cpl_conv.h"
33
#include "cpl_cpu_features.h"
34
#include "cpl_error.h"
35
#include "cpl_float.h"
36
#include "cpl_progress.h"
37
#include "cpl_string.h"
38
#include "cpl_vsi.h"
39
#include "gdal_priv_templates.hpp"
40
#include "gdal_vrt.h"
41
#include "gdalwarper.h"
42
#include "memdataset.h"
43
#include "vrtdataset.h"
44

45
#if defined(__x86_64) || defined(_M_X64)
46
#include <emmintrin.h>
47
#define HAVE_SSE2
48
#elif defined(USE_NEON_OPTIMIZATIONS)
49
#include "include_sse2neon.h"
50
#define HAVE_SSE2
51
#endif
52

53
#ifdef HAVE_SSSE3_AT_COMPILE_TIME
54
#include "rasterio_ssse3.h"
55
#ifdef __SSSE3__
56
#include <tmmintrin.h>
57
#endif
58
#endif
59

60
#ifdef __SSE4_1__
61
#include <smmintrin.h>
62
#endif
63

64
#ifdef __GNUC__
65
#define CPL_NOINLINE __attribute__((noinline))
66
#else
67
#define CPL_NOINLINE
68
#endif
69

70
static void GDALFastCopyByte(const GByte *CPL_RESTRICT pSrcData,
71
                             int nSrcPixelStride, GByte *CPL_RESTRICT pDstData,
72
                             int nDstPixelStride, GPtrDiff_t nWordCount);
73

74
/************************************************************************/
75
/*                    DownsamplingIntegerXFactor()                      */
76
/************************************************************************/
77

78
// Copies one row of output pixels from poBand into pabyDstData, taking one
// source pixel every nSrcXInc columns starting at iSrcX (integer downsampling
// factor along X). A new source block is locked each time iSrcX moves past
// the end of the block currently held in poBlock.
//
// Template parameters:
//   bSameDataType   when true, each pixel is copied with memcpy() using
//                   DATA_TYPE_SIZE bytes; when false, pixels go through
//                   GDALCopyWords64() from eDataType to eBufType.
//   DATA_TYPE_SIZE  pixel size in bytes; only used when bSameDataType is true.
//
// Parameters:
//   poBand         band whose blocks are fetched with GetLockedBlockRef().
//   iSrcX          source column of the first pixel to copy.
//   nSrcXInc       step, in source pixels, between two consecutive outputs.
//   iSrcOffsetCst  constant added, in pixels, to the in-block column offset.
//                  NOTE(review): presumably the offset of the requested line
//                  inside the block - confirm against the caller.
//   pabyDstData    address at which the first output pixel is written.
//   nPixelSpace    byte stride between two output pixels.
//   nBufXSize      number of output pixels to produce.
//   eDataType      data type of the source block.
//   eBufType       data type of the destination buffer.
//   nStartBlockX   [in,out] source column at which the current block starts;
//                  updated every time a block is (re)loaded.
//   nBlockXSize    block width in pixels.
//   poBlock        [in,out] currently locked block. Its lock is dropped before
//                  another block is fetched; the lock of the last block
//                  fetched is not dropped by this function.
//   nLBlockY       block row index passed to GetLockedBlockRef().
//
// Returns false when GetLockedBlockRef() yields nullptr (poBlock is then
// nullptr), true otherwise.
template <bool bSameDataType, int DATA_TYPE_SIZE>
79
static bool DownsamplingIntegerXFactor(
695,677✔
80
    GDALRasterBand *poBand, int iSrcX, int nSrcXInc, GPtrDiff_t iSrcOffsetCst,
81
    GByte *CPL_RESTRICT pabyDstData, int nPixelSpace, int nBufXSize,
82
    GDALDataType eDataType, GDALDataType eBufType, int &nStartBlockX,
83
    int nBlockXSize, GDALRasterBlock *&poBlock, int nLBlockY)
84
{
85
    const int nBandDataSize =
695,677✔
86
        bSameDataType ? DATA_TYPE_SIZE : GDALGetDataTypeSizeBytes(eDataType);
87
    // Number of output pixels left to produce after the current one.
    int nOuterLoopIters = nBufXSize - 1;
695,677✔
88
    // Byte distance in the source block between two sampled pixels.
    const int nIncSrcOffset = nSrcXInc * nBandDataSize;
695,677✔
89
    // Left uninitialized on purpose: it is always assigned at
    // no_reload_block before being read.
    const GByte *CPL_RESTRICT pabySrcData;
90
    int nEndBlockX = nBlockXSize + nStartBlockX;
695,677✔
91

92
    // Both labels targeted below live inside the body of the while loop, so
    // the first pixel is processed before the loop condition is evaluated
    // for the first time.
    if (iSrcX < nEndBlockX)
695,677✔
93
    {
94
        // The block passed in by the caller already covers iSrcX.
        CPLAssert(poBlock);
294,999✔
95
        goto no_reload_block;
294,999✔
96
    }
97
    goto reload_block;
400,678✔
98

99
    // Don't do the last iteration in the loop, as iSrcX might go beyond
100
    // nRasterXSize - 1
101
    while (--nOuterLoopIters >= 1)
1,264,772✔
102
    {
103
        iSrcX += nSrcXInc;
201,834✔
104
        pabySrcData += nIncSrcOffset;
201,834✔
105
        pabyDstData += nPixelSpace;
201,834✔
106

107
        /* --------------------------------------------------------------------
108
         */
109
        /*      Ensure we have the appropriate block loaded. */
110
        /* --------------------------------------------------------------------
111
         */
112
        if (iSrcX >= nEndBlockX)
201,834✔
113
        {
114
        // Entry point used when iSrcX is outside the currently held block:
        // recompute the block extent, release the previous block and lock
        // the one containing iSrcX.
        reload_block:
201,834✔
115
        {
116
            const int nLBlockX = iSrcX / nBlockXSize;
615,102✔
117
            nStartBlockX = nLBlockX * nBlockXSize;
615,102✔
118
            nEndBlockX = nStartBlockX + nBlockXSize;
615,102✔
119

120
            if (poBlock != nullptr)
615,102✔
121
                poBlock->DropLock();
341,314✔
122

123
            poBlock = poBand->GetLockedBlockRef(nLBlockX, nLBlockY, FALSE);
615,102✔
124
            if (poBlock == nullptr)
615,102✔
125
            {
126
                return false;
1✔
127
            }
128
        }
129

130
        // Entry point used when poBlock already covers iSrcX: only the
        // source pointer has to be (re)derived from the block data.
        no_reload_block:
615,101✔
131
            const GByte *pabySrcBlock =
132
                static_cast<const GByte *>(poBlock->GetDataRef());
1,264,772✔
133
            GPtrDiff_t iSrcOffset =
1,264,772✔
134
                (iSrcX - nStartBlockX + iSrcOffsetCst) * nBandDataSize;
1,264,772✔
135
            pabySrcData = pabySrcBlock + iSrcOffset;
1,264,772✔
136
        }
137

138
        /* --------------------------------------------------------------------
139
         */
140
        /*      Copy the maximum run of pixels. */
141
        /* --------------------------------------------------------------------
142
         */
143

144
        // Number of samples, nSrcXInc apart, that fit before the end of the
        // current block, capped by the number of iterations still allowed.
        const int nIters = std::min(
1,264,772✔
145
            (nEndBlockX - iSrcX + (nSrcXInc - 1)) / nSrcXInc, nOuterLoopIters);
1,264,772✔
146
        if (bSameDataType)
147
        {
148
            // The current pixel is always copied, even when nIters <= 0.
            memcpy(pabyDstData, pabySrcData, nBandDataSize);
1,264,367✔
149
            if (nIters > 1)
1,264,367✔
150
            {
151
                if (DATA_TYPE_SIZE == 1)
152
                {
153
                    // Copy the nIters - 1 remaining pixels of the run in one
                    // call, then move both pointers so that, in total, they
                    // have advanced by nIters - 1 pixels (one step here plus
                    // nIters - 2 below).
                    pabySrcData += nIncSrcOffset;
326,246✔
154
                    pabyDstData += nPixelSpace;
326,246✔
155
                    GDALFastCopyByte(pabySrcData, nIncSrcOffset, pabyDstData,
326,246✔
156
                                     nPixelSpace, nIters - 1);
326,246✔
157
                    pabySrcData +=
326,246✔
158
                        static_cast<GPtrDiff_t>(nIncSrcOffset) * (nIters - 2);
326,246✔
159
                    pabyDstData +=
326,246✔
160
                        static_cast<GPtrDiff_t>(nPixelSpace) * (nIters - 2);
326,246✔
161
                }
162
                else
163
                {
164
                    for (int i = 0; i < nIters - 1; i++)
4,395,158✔
165
                    {
166
                        pabySrcData += nIncSrcOffset;
4,197,064✔
167
                        pabyDstData += nPixelSpace;
4,197,064✔
168
                        memcpy(pabyDstData, pabySrcData, nBandDataSize);
4,197,064✔
169
                    }
170
                }
171
                // Account for the nIters - 1 extra pixels consumed; the loop
                // header performs the remaining single step.
                iSrcX += nSrcXInc * (nIters - 1);
524,340✔
172
                nOuterLoopIters -= nIters - 1;
524,340✔
173
            }
174
        }
175
        else
176
        {
177
            // Type to type conversion ...
            // std::max(1, nIters) guarantees the current pixel is converted
            // even when nIters <= 0 (nOuterLoopIters can be 0 here).
178
            GDALCopyWords64(pabySrcData, eDataType, nIncSrcOffset, pabyDstData,
405✔
179
                            eBufType, nPixelSpace, std::max(1, nIters));
405✔
180
            if (nIters > 1)
405✔
181
            {
182
                pabySrcData +=
198✔
183
                    static_cast<GPtrDiff_t>(nIncSrcOffset) * (nIters - 1);
198✔
184
                pabyDstData +=
198✔
185
                    static_cast<GPtrDiff_t>(nPixelSpace) * (nIters - 1);
198✔
186
                iSrcX += nSrcXInc * (nIters - 1);
198✔
187
                nOuterLoopIters -= nIters - 1;
198✔
188
            }
189
        }
190
    }
191

192
    // Deal with last iteration to avoid iSrcX to go beyond nRasterXSize - 1
193
    // The loop exits with nOuterLoopIters == 0 when exactly one pixel is
    // left. Its source column is clamped to the last column of the band and
    // the loop body is re-entered through one of the labels; on the next
    // exit nOuterLoopIters is negative, so this branch is not taken again.
    if (nOuterLoopIters == 0)
1,062,938✔
194
    {
195
        const int nRasterXSize = poBand->GetXSize();
367,262✔
196
        iSrcX =
367,262✔
197
            static_cast<int>(std::min(static_cast<GInt64>(iSrcX) + nSrcXInc,
734,524✔
198
                                      static_cast<GInt64>(nRasterXSize - 1)));
367,262✔
199
        pabyDstData += nPixelSpace;
367,262✔
200
        if (iSrcX < nEndBlockX)
367,262✔
201
        {
202
            goto no_reload_block;
354,672✔
203
        }
204
        goto reload_block;
12,590✔
205
    }
206
    return true;
695,676✔
207
}
208

209
template <class A, class B>
210
CPL_NOSANITIZE_UNSIGNED_INT_OVERFLOW inline auto CPLUnsanitizedMul(A a, B b)
2,723,550✔
211
{
212
    return a * b;
2,723,550✔
213
}
214

215
/************************************************************************/
216
/*                             IRasterIO()                              */
217
/*                                                                      */
218
/*      Default internal implementation of RasterIO() ... utilizes      */
219
/*      the Block access methods to satisfy the request.  This would    */
220
/*      normally only be overridden by formats with overviews.          */
221
/************************************************************************/
222

223
CPLErr GDALRasterBand::IRasterIO(GDALRWFlag eRWFlag, int nXOff, int nYOff,
6,118,250✔
224
                                 int nXSize, int nYSize, void *pData,
225
                                 int nBufXSize, int nBufYSize,
226
                                 GDALDataType eBufType, GSpacing nPixelSpace,
227
                                 GSpacing nLineSpace,
228
                                 GDALRasterIOExtraArg *psExtraArg)
229

230
{
231
    if (eRWFlag == GF_Write && eFlushBlockErr != CE_None)
6,118,250✔
232
    {
233
        CPLError(eFlushBlockErr, CPLE_AppDefined,
×
234
                 "An error occurred while writing a dirty block "
235
                 "from GDALRasterBand::IRasterIO");
236
        CPLErr eErr = eFlushBlockErr;
×
237
        eFlushBlockErr = CE_None;
×
238
        return eErr;
×
239
    }
240
    if (nBlockXSize <= 0 || nBlockYSize <= 0)
6,118,250✔
241
    {
242
        CPLError(CE_Failure, CPLE_AppDefined, "Invalid block size");
1,768✔
243
        return CE_Failure;
×
244
    }
245

246
    const int nBandDataSize = GDALGetDataTypeSizeBytes(eDataType);
6,116,490✔
247
    const int nBufDataSize = GDALGetDataTypeSizeBytes(eBufType);
6,116,280✔
248
    GByte dummyBlock[2] = {0, 0};
6,115,900✔
249
    GByte *pabySrcBlock =
6,115,900✔
250
        dummyBlock; /* to avoid Coverity warning about nullptr dereference */
251
    GDALRasterBlock *poBlock = nullptr;
6,115,900✔
252
    const bool bUseIntegerRequestCoords =
6,115,900✔
253
        (!psExtraArg->bFloatingPointWindowValidity ||
6,462,510✔
254
         (nXOff == psExtraArg->dfXOff && nYOff == psExtraArg->dfYOff &&
346,611✔
255
          nXSize == psExtraArg->dfXSize && nYSize == psExtraArg->dfYSize));
323,229✔
256

257
    /* ==================================================================== */
258
    /*      A common case is the data requested with the destination        */
259
    /*      is packed, and the block width is the raster width.             */
260
    /* ==================================================================== */
261
    if (nPixelSpace == nBufDataSize && nLineSpace == nPixelSpace * nXSize &&
6,038,210✔
262
        nBlockXSize == GetXSize() && nBufXSize == nXSize &&
3,193,220✔
263
        nBufYSize == nYSize && bUseIntegerRequestCoords)
12,154,800✔
264
    {
265
        CPLErr eErr = CE_None;
3,083,000✔
266
        int nLBlockY = -1;
3,083,000✔
267

268
        for (int iBufYOff = 0; iBufYOff < nBufYSize; iBufYOff++)
9,166,340✔
269
        {
270
            const int iSrcY = iBufYOff + nYOff;
6,081,540✔
271

272
            if (iSrcY < nLBlockY * nBlockYSize ||
6,081,540✔
273
                iSrcY - nBlockYSize >= nLBlockY * nBlockYSize)
6,079,950✔
274
            {
275
                nLBlockY = iSrcY / nBlockYSize;
3,345,960✔
276
                bool bJustInitialize =
3,345,960✔
277
                    eRWFlag == GF_Write && nXOff == 0 &&
295,350✔
278
                    nXSize == nBlockXSize && nYOff <= nLBlockY * nBlockYSize &&
3,698,360✔
279
                    nYOff + nYSize - nBlockYSize >= nLBlockY * nBlockYSize;
57,055✔
280

281
                // Is this a partial tile at right and/or bottom edges of
282
                // the raster, and that is going to be completely written?
283
                // If so, do not load it from storage, but zero it so that
284
                // the content outside of the validity area is initialized.
285
                bool bMemZeroBuffer = false;
3,345,960✔
286
                if (eRWFlag == GF_Write && !bJustInitialize && nXOff == 0 &&
295,350✔
287
                    nXSize == nBlockXSize && nYOff <= nLBlockY * nBlockYSize &&
23,813✔
288
                    nYOff + nYSize == GetYSize() &&
3,641,390✔
289
                    nLBlockY * nBlockYSize > GetYSize() - nBlockYSize)
89✔
290
                {
291
                    bJustInitialize = true;
89✔
292
                    bMemZeroBuffer = true;
89✔
293
                }
294

295
                if (poBlock)
3,345,960✔
296
                    poBlock->DropLock();
261,334✔
297

298
                const GUInt32 nErrorCounter = CPLGetErrorCounter();
3,345,960✔
299
                poBlock = GetLockedBlockRef(0, nLBlockY, bJustInitialize);
3,343,820✔
300
                if (poBlock == nullptr)
3,347,270✔
301
                {
302
                    if (strstr(CPLGetLastErrorMsg(), "IReadBlock failed") ==
1,070✔
303
                        nullptr)
304
                    {
305
                        CPLError(CE_Failure, CPLE_AppDefined,
×
306
                                 "GetBlockRef failed at X block offset %d, "
307
                                 "Y block offset %d%s",
308
                                 0, nLBlockY,
309
                                 (nErrorCounter != CPLGetErrorCounter())
×
310
                                     ? CPLSPrintf(": %s", CPLGetLastErrorMsg())
×
311
                                     : "");
312
                    }
313
                    eErr = CE_Failure;
1,070✔
314
                    break;
1,070✔
315
                }
316

317
                if (eRWFlag == GF_Write)
3,346,200✔
318
                    poBlock->MarkDirty();
295,350✔
319

320
                pabySrcBlock = static_cast<GByte *>(poBlock->GetDataRef());
3,346,200✔
321
                if (bMemZeroBuffer)
3,346,190✔
322
                {
323
                    memset(pabySrcBlock, 0,
89✔
324
                           static_cast<GPtrDiff_t>(nBandDataSize) *
89✔
325
                               nBlockXSize * nBlockYSize);
89✔
326
                }
327
            }
328

329
            const auto nSrcByteOffset =
6,081,780✔
330
                (static_cast<GPtrDiff_t>(iSrcY - nLBlockY * nBlockYSize) *
6,081,780✔
331
                     nBlockXSize +
6,081,780✔
332
                 nXOff) *
6,081,780✔
333
                nBandDataSize;
6,081,780✔
334

335
            if (eDataType == eBufType)
6,081,780✔
336
            {
337
                if (eRWFlag == GF_Read)
2,449,090✔
338
                    memcpy(static_cast<GByte *>(pData) +
1,978,370✔
339
                               static_cast<GPtrDiff_t>(iBufYOff) * nLineSpace,
1,978,370✔
340
                           pabySrcBlock + nSrcByteOffset,
1,978,370✔
341
                           static_cast<size_t>(nLineSpace));
342
                else
343
                    memcpy(pabySrcBlock + nSrcByteOffset,
470,722✔
344
                           static_cast<GByte *>(pData) +
470,722✔
345
                               static_cast<GPtrDiff_t>(iBufYOff) * nLineSpace,
470,722✔
346
                           static_cast<size_t>(nLineSpace));
347
            }
348
            else
349
            {
350
                // Type to type conversion.
351
                if (eRWFlag == GF_Read)
3,632,690✔
352
                    GDALCopyWords64(
3,612,510✔
353
                        pabySrcBlock + nSrcByteOffset, eDataType, nBandDataSize,
3,612,510✔
354
                        static_cast<GByte *>(pData) +
355
                            static_cast<GPtrDiff_t>(iBufYOff) * nLineSpace,
3,612,510✔
356
                        eBufType, static_cast<int>(nPixelSpace), nBufXSize);
357
                else
358
                    GDALCopyWords64(static_cast<GByte *>(pData) +
20,176✔
359
                                        static_cast<GPtrDiff_t>(iBufYOff) *
20,176✔
360
                                            nLineSpace,
361
                                    eBufType, static_cast<int>(nPixelSpace),
362
                                    pabySrcBlock + nSrcByteOffset, eDataType,
20,176✔
363
                                    nBandDataSize, nBufXSize);
364
            }
365

366
            if (psExtraArg->pfnProgress != nullptr &&
6,156,300✔
367
                !psExtraArg->pfnProgress(1.0 * (iBufYOff + 1) / nBufYSize, "",
72,959✔
368
                                         psExtraArg->pProgressData))
369
            {
370
                eErr = CE_Failure;
5✔
371
                break;
5✔
372
            }
373
        }
374

375
        if (poBlock)
3,085,870✔
376
            poBlock->DropLock();
3,084,690✔
377

378
        return eErr;
3,085,780✔
379
    }
380

381
    /* ==================================================================== */
382
    /*      Do we have overviews that would be appropriate to satisfy       */
383
    /*      this request?                                                   */
384
    /* ==================================================================== */
385
    if ((nBufXSize < nXSize || nBufYSize < nYSize) && GetOverviewCount() > 0 &&
3,033,600✔
386
        eRWFlag == GF_Read)
387
    {
388
        GDALRasterIOExtraArg sExtraArg;
389
        GDALCopyRasterIOExtraArg(&sExtraArg, psExtraArg);
2,902✔
390

391
        const int nOverview =
392
            GDALBandGetBestOverviewLevel2(this, nXOff, nYOff, nXSize, nYSize,
2,902✔
393
                                          nBufXSize, nBufYSize, &sExtraArg);
394
        if (nOverview >= 0)
2,902✔
395
        {
396
            GDALRasterBand *poOverviewBand = GetOverview(nOverview);
2,827✔
397
            if (poOverviewBand == nullptr)
2,827✔
398
                return CE_Failure;
2,827✔
399

400
            return poOverviewBand->RasterIO(
2,827✔
401
                eRWFlag, nXOff, nYOff, nXSize, nYSize, pData, nBufXSize,
402
                nBufYSize, eBufType, nPixelSpace, nLineSpace, &sExtraArg);
2,827✔
403
        }
404
    }
405

406
    if (eRWFlag == GF_Read && nBufXSize < nXSize / 100 &&
842,725✔
407
        nBufYSize < nYSize / 100 && nPixelSpace == nBufDataSize &&
6✔
408
        nLineSpace == nPixelSpace * nBufXSize &&
3,874,190✔
409
        CPLTestBool(CPLGetConfigOption("GDAL_NO_COSTLY_OVERVIEW", "NO")))
6✔
410
    {
411
        memset(pData, 0, static_cast<size_t>(nLineSpace * nBufYSize));
×
412
        return CE_None;
×
413
    }
414

415
    /* ==================================================================== */
416
    /*      The second case when we don't need subsample data but likely    */
417
    /*      need data type conversion.                                      */
418
    /* ==================================================================== */
419
    if (  // nPixelSpace == nBufDataSize &&
3,031,460✔
420
        nXSize == nBufXSize && nYSize == nBufYSize && bUseIntegerRequestCoords)
3,031,460✔
421
    {
422
#if DEBUG_VERBOSE
423
        printf("IRasterIO(%d,%d,%d,%d) rw=%d case 2\n", /*ok*/
424
               nXOff, nYOff, nXSize, nYSize, static_cast<int>(eRWFlag));
425
#endif
426

427
        /* --------------------------------------------------------------------
428
         */
429
        /*      Loop over buffer computing source locations. */
430
        /* --------------------------------------------------------------------
431
         */
432
        // Calculate starting values out of loop
433
        const int nLBlockXStart = nXOff / nBlockXSize;
2,466,280✔
434
        const int nXSpanEnd = nBufXSize + nXOff;
2,466,280✔
435

436
        int nYInc = 0;
2,466,280✔
437
        for (int iBufYOff = 0, iSrcY = nYOff; iBufYOff < nBufYSize;
4,971,250✔
438
             iBufYOff += nYInc, iSrcY += nYInc)
2,504,980✔
439
        {
440
            GPtrDiff_t iBufOffset = static_cast<GPtrDiff_t>(iBufYOff) *
2,505,040✔
441
                                    static_cast<GPtrDiff_t>(nLineSpace);
442
            int nLBlockY = iSrcY / nBlockYSize;
2,505,040✔
443
            int nLBlockX = nLBlockXStart;
2,505,040✔
444
            int iSrcX = nXOff;
2,505,040✔
445
            while (iSrcX < nXSpanEnd)
5,228,500✔
446
            {
447
                int nXSpan = nLBlockX * nBlockXSize;
2,723,520✔
448
                if (nXSpan < INT_MAX - nBlockXSize)
2,723,520✔
449
                    nXSpan += nBlockXSize;
2,723,520✔
450
                else
UNCOV
451
                    nXSpan = INT_MAX;
×
452
                const int nXRight = nXSpan;
2,723,520✔
453
                nXSpan = (nXSpan < nXSpanEnd ? nXSpan : nXSpanEnd) - iSrcX;
2,723,520✔
454

455
                const size_t nXSpanSize =
456
                    CPLUnsanitizedMul(nXSpan, static_cast<size_t>(nPixelSpace));
2,723,520✔
457

458
                bool bJustInitialize =
2,723,540✔
459
                    eRWFlag == GF_Write && nYOff <= nLBlockY * nBlockYSize &&
2,042,140✔
460
                    nYOff + nYSize - nBlockYSize >= nLBlockY * nBlockYSize &&
37,219✔
461
                    nXOff <= nLBlockX * nBlockXSize &&
4,791,230✔
462
                    nXOff + nXSize >= nXRight;
25,553✔
463

464
                // Is this a partial tile at right and/or bottom edges of
465
                // the raster, and that is going to be completely written?
466
                // If so, do not load it from storage, but zero it so that
467
                // the content outside of the validity area is initialized.
468
                bool bMemZeroBuffer = false;
2,723,540✔
469
                if (eRWFlag == GF_Write && !bJustInitialize &&
2,042,130✔
470
                    nXOff <= nLBlockX * nBlockXSize &&
2,017,820✔
471
                    nYOff <= nLBlockY * nBlockYSize &&
2,016,190✔
472
                    (nXOff + nXSize >= nXRight ||
12,145✔
473
                     // cppcheck-suppress knownConditionTrueFalse
474
                     (nXOff + nXSize == GetXSize() && nXRight > GetXSize())) &&
4,768,370✔
475
                    (nYOff + nYSize - nBlockYSize >= nLBlockY * nBlockYSize ||
11,965✔
476
                     (nYOff + nYSize == GetYSize() &&
10,743✔
477
                      nLBlockY * nBlockYSize > GetYSize() - nBlockYSize)))
1,955✔
478
                {
479
                    bJustInitialize = true;
3,177✔
480
                    bMemZeroBuffer = true;
3,177✔
481
                }
482

483
                /* --------------------------------------------------------------------
484
                 */
485
                /*      Ensure we have the appropriate block loaded. */
486
                /* --------------------------------------------------------------------
487
                 */
488
                const GUInt32 nErrorCounter = CPLGetErrorCounter();
2,723,540✔
489
                poBlock =
2,723,560✔
490
                    GetLockedBlockRef(nLBlockX, nLBlockY, bJustInitialize);
2,723,510✔
491
                if (!poBlock)
2,723,560✔
492
                {
493
                    if (strstr(CPLGetLastErrorMsg(), "IReadBlock failed") ==
71✔
494
                        nullptr)
495
                    {
496
                        CPLError(CE_Failure, CPLE_AppDefined,
×
497
                                 "GetBlockRef failed at X block offset %d, "
498
                                 "Y block offset %d%s",
499
                                 nLBlockX, nLBlockY,
500
                                 (nErrorCounter != CPLGetErrorCounter())
×
501
                                     ? CPLSPrintf(": %s", CPLGetLastErrorMsg())
×
502
                                     : "");
503
                    }
504
                    return (CE_Failure);
71✔
505
                }
506

507
                if (eRWFlag == GF_Write)
2,723,490✔
508
                    poBlock->MarkDirty();
2,042,150✔
509

510
                pabySrcBlock = static_cast<GByte *>(poBlock->GetDataRef());
2,723,480✔
511
                if (bMemZeroBuffer)
2,723,480✔
512
                {
513
                    memset(pabySrcBlock, 0,
3,177✔
514
                           static_cast<GPtrDiff_t>(nBandDataSize) *
3,177✔
515
                               nBlockXSize * nBlockYSize);
3,177✔
516
                }
517
                /* --------------------------------------------------------------------
518
                 */
519
                /*      Copy over this chunk of data. */
520
                /* --------------------------------------------------------------------
521
                 */
522
                GPtrDiff_t iSrcOffset =
2,723,480✔
523
                    (static_cast<GPtrDiff_t>(iSrcX) -
2,723,480✔
524
                     static_cast<GPtrDiff_t>(nLBlockX * nBlockXSize) +
2,723,480✔
525
                     (static_cast<GPtrDiff_t>(iSrcY) -
2,723,480✔
526
                      static_cast<GPtrDiff_t>(nLBlockY) * nBlockYSize) *
2,723,480✔
527
                         nBlockXSize) *
2,723,480✔
528
                    nBandDataSize;
2,723,480✔
529
                // Fill up as many rows as possible for the loaded block.
530
                const int kmax = std::min(nBlockYSize - (iSrcY % nBlockYSize),
5,446,960✔
531
                                          nBufYSize - iBufYOff);
2,723,480✔
532
                for (int k = 0; k < kmax; k++)
58,624,800✔
533
                {
534
                    if (eDataType == eBufType && nPixelSpace == nBufDataSize)
55,901,400✔
535
                    {
536
                        if (eRWFlag == GF_Read)
51,941,500✔
537
                            memcpy(static_cast<GByte *>(pData) + iBufOffset +
47,507,500✔
538
                                       static_cast<GPtrDiff_t>(k) * nLineSpace,
47,507,500✔
539
                                   pabySrcBlock + iSrcOffset, nXSpanSize);
47,507,500✔
540
                        else
541
                            memcpy(pabySrcBlock + iSrcOffset,
4,433,990✔
542
                                   static_cast<GByte *>(pData) + iBufOffset +
4,433,990✔
543
                                       static_cast<GPtrDiff_t>(k) * nLineSpace,
4,433,990✔
544
                                   nXSpanSize);
545
                    }
546
                    else
547
                    {
548
                        /* type to type conversion */
549
                        if (eRWFlag == GF_Read)
3,959,850✔
550
                            GDALCopyWords64(
3,897,670✔
551
                                pabySrcBlock + iSrcOffset, eDataType,
3,897,670✔
552
                                nBandDataSize,
553
                                static_cast<GByte *>(pData) + iBufOffset +
3,897,670✔
554
                                    static_cast<GPtrDiff_t>(k) * nLineSpace,
3,897,670✔
555
                                eBufType, static_cast<int>(nPixelSpace),
556
                                nXSpan);
557
                        else
558
                            GDALCopyWords64(
62,182✔
559
                                static_cast<GByte *>(pData) + iBufOffset +
62,182✔
560
                                    static_cast<GPtrDiff_t>(k) * nLineSpace,
62,182✔
561
                                eBufType, static_cast<int>(nPixelSpace),
562
                                pabySrcBlock + iSrcOffset, eDataType,
62,182✔
563
                                nBandDataSize, nXSpan);
564
                    }
565

566
                    iSrcOffset +=
55,901,300✔
567
                        static_cast<GPtrDiff_t>(nBlockXSize) * nBandDataSize;
55,901,300✔
568
                }
569

570
                iBufOffset =
571
                    CPLUnsanitizedAdd<GPtrDiff_t>(iBufOffset, nXSpanSize);
2,723,450✔
572
                nLBlockX++;
2,723,460✔
573
                iSrcX += nXSpan;
2,723,460✔
574

575
                poBlock->DropLock();
2,723,460✔
576
                poBlock = nullptr;
2,723,470✔
577
            }
578

579
            /* Compute the increment to go on a block boundary */
580
            nYInc = nBlockYSize - (iSrcY % nBlockYSize);
2,504,990✔
581

582
            if (psExtraArg->pfnProgress != nullptr &&
2,506,830✔
583
                !psExtraArg->pfnProgress(
1,847✔
584
                    1.0 * std::min(nBufYSize, iBufYOff + nYInc) / nBufYSize, "",
2,506,830✔
585
                    psExtraArg->pProgressData))
586
            {
587
                return CE_Failure;
5✔
588
            }
589
        }
590

591
        return CE_None;
2,466,210✔
592
    }
593

594
    /* ==================================================================== */
595
    /*      Loop reading required source blocks to satisfy output           */
596
    /*      request.  This is the most general implementation.              */
597
    /* ==================================================================== */
598

599
    double dfXOff = nXOff;
565,187✔
600
    double dfYOff = nYOff;
565,187✔
601
    double dfXSize = nXSize;
565,187✔
602
    double dfYSize = nYSize;
565,187✔
603
    if (psExtraArg->bFloatingPointWindowValidity)
565,187✔
604
    {
605
        dfXOff = psExtraArg->dfXOff;
230,468✔
606
        dfYOff = psExtraArg->dfYOff;
230,468✔
607
        dfXSize = psExtraArg->dfXSize;
230,468✔
608
        dfYSize = psExtraArg->dfYSize;
230,468✔
609
    }
610

611
    /* -------------------------------------------------------------------- */
612
    /*      Compute stepping increment.                                     */
613
    /* -------------------------------------------------------------------- */
614
    const double dfSrcXInc = dfXSize / static_cast<double>(nBufXSize);
565,187✔
615
    const double dfSrcYInc = dfYSize / static_cast<double>(nBufYSize);
565,187✔
616
    CPLErr eErr = CE_None;
565,187✔
617

618
    if (eRWFlag == GF_Write)
565,187✔
619
    {
620
        /* --------------------------------------------------------------------
621
         */
622
        /*    Write case */
623
        /*    Loop over raster window computing source locations in the buffer.
624
         */
625
        /* --------------------------------------------------------------------
626
         */
627
        GByte *pabyDstBlock = nullptr;
166,651✔
628
        int nLBlockX = -1;
166,651✔
629
        int nLBlockY = -1;
166,651✔
630

631
        for (int iDstY = nYOff; iDstY < nYOff + nYSize; iDstY++)
1,259,990✔
632
        {
633
            const int iBufYOff = static_cast<int>((iDstY - nYOff) / dfSrcYInc);
1,093,340✔
634

635
            for (int iDstX = nXOff; iDstX < nXOff + nXSize; iDstX++)
12,384,000✔
636
            {
637
                const int iBufXOff =
11,290,600✔
638
                    static_cast<int>((iDstX - nXOff) / dfSrcXInc);
11,290,600✔
639
                GPtrDiff_t iBufOffset =
11,290,600✔
640
                    static_cast<GPtrDiff_t>(iBufYOff) *
11,290,600✔
641
                        static_cast<GPtrDiff_t>(nLineSpace) +
642
                    iBufXOff * static_cast<GPtrDiff_t>(nPixelSpace);
11,290,600✔
643

644
                // FIXME: this code likely doesn't work if the dirty block gets
645
                // flushed to disk before being completely written.
646
                // In the meantime, bJustInitialize should probably be set to
647
                // FALSE even if it is not ideal performance wise, and for
648
                // lossy compression.
649

650
                /* --------------------------------------------------------------------
651
                 */
652
                /*      Ensure we have the appropriate block loaded. */
653
                /* --------------------------------------------------------------------
654
                 */
655
                if (iDstX < nLBlockX * nBlockXSize ||
11,290,600✔
656
                    iDstX - nBlockXSize >= nLBlockX * nBlockXSize ||
11,041,300✔
657
                    iDstY < nLBlockY * nBlockYSize ||
10,584,600✔
658
                    iDstY - nBlockYSize >= nLBlockY * nBlockYSize)
10,584,600✔
659
                {
660
                    nLBlockX = iDstX / nBlockXSize;
738,682✔
661
                    nLBlockY = iDstY / nBlockYSize;
738,682✔
662

663
                    const bool bJustInitialize =
738,682✔
664
                        nYOff <= nLBlockY * nBlockYSize &&
1,065,950✔
665
                        nYOff + nYSize - nBlockYSize >=
327,271✔
666
                            nLBlockY * nBlockYSize &&
327,271✔
667
                        nXOff <= nLBlockX * nBlockXSize &&
1,116,260✔
668
                        nXOff + nXSize - nBlockXSize >= nLBlockX * nBlockXSize;
50,305✔
669
                    /*bool bMemZeroBuffer = FALSE;
670
                    if( !bJustInitialize &&
671
                        nXOff <= nLBlockX * nBlockXSize &&
672
                        nYOff <= nLBlockY * nBlockYSize &&
673
                        (nXOff + nXSize >= (nLBlockX+1) * nBlockXSize ||
674
                         (nXOff + nXSize == GetXSize() &&
675
                         (nLBlockX+1) * nBlockXSize > GetXSize())) &&
676
                        (nYOff + nYSize >= (nLBlockY+1) * nBlockYSize ||
677
                         (nYOff + nYSize == GetYSize() &&
678
                         (nLBlockY+1) * nBlockYSize > GetYSize())) )
679
                    {
680
                        bJustInitialize = TRUE;
681
                        bMemZeroBuffer = TRUE;
682
                    }*/
683
                    if (poBlock != nullptr)
738,682✔
684
                        poBlock->DropLock();
572,031✔
685

686
                    poBlock =
738,682✔
687
                        GetLockedBlockRef(nLBlockX, nLBlockY, bJustInitialize);
738,682✔
688
                    if (poBlock == nullptr)
738,682✔
689
                    {
690
                        return (CE_Failure);
×
691
                    }
692

693
                    poBlock->MarkDirty();
738,682✔
694

695
                    pabyDstBlock = static_cast<GByte *>(poBlock->GetDataRef());
738,682✔
696
                    /*if( bMemZeroBuffer )
697
                    {
698
                        memset(pabyDstBlock, 0,
699
                            static_cast<GPtrDiff_t>(nBandDataSize) * nBlockXSize
700
                    * nBlockYSize);
701
                    }*/
702
                }
703

704
                // To make Coverity happy. Should not happen by design.
705
                if (pabyDstBlock == nullptr)
11,290,600✔
706
                {
707
                    CPLAssert(false);
×
708
                    eErr = CE_Failure;
709
                    break;
710
                }
711

712
                /* --------------------------------------------------------------------
713
                 */
714
                /*      Copy over this pixel of data. */
715
                /* --------------------------------------------------------------------
716
                 */
717
                GPtrDiff_t iDstOffset =
11,290,600✔
718
                    (static_cast<GPtrDiff_t>(iDstX) -
11,290,600✔
719
                     static_cast<GPtrDiff_t>(nLBlockX) * nBlockXSize +
11,290,600✔
720
                     (static_cast<GPtrDiff_t>(iDstY) -
11,290,600✔
721
                      static_cast<GPtrDiff_t>(nLBlockY) * nBlockYSize) *
11,290,600✔
722
                         nBlockXSize) *
11,290,600✔
723
                    nBandDataSize;
11,290,600✔
724

725
                if (eDataType == eBufType)
11,290,600✔
726
                {
727
                    memcpy(pabyDstBlock + iDstOffset,
11,287,500✔
728
                           static_cast<GByte *>(pData) + iBufOffset,
11,287,500✔
729
                           nBandDataSize);
730
                }
731
                else
732
                {
733
                    /* type to type conversion ... ouch, this is expensive way
734
                    of handling single words */
735
                    GDALCopyWords64(static_cast<GByte *>(pData) + iBufOffset,
3,096✔
736
                                    eBufType, 0, pabyDstBlock + iDstOffset,
3,096✔
737
                                    eDataType, 0, 1);
738
                }
739
            }
740

741
            if (psExtraArg->pfnProgress != nullptr &&
1,093,340✔
742
                !psExtraArg->pfnProgress(1.0 * (iDstY - nYOff + 1) / nYSize, "",
×
743
                                         psExtraArg->pProgressData))
744
            {
745
                eErr = CE_Failure;
×
746
                break;
×
747
            }
748
        }
749
    }
750
    else
751
    {
752
        if (psExtraArg->eResampleAlg != GRIORA_NearestNeighbour)
398,536✔
753
        {
754
            if ((psExtraArg->eResampleAlg == GRIORA_Cubic ||
8,603✔
755
                 psExtraArg->eResampleAlg == GRIORA_CubicSpline ||
2,513✔
756
                 psExtraArg->eResampleAlg == GRIORA_Bilinear ||
2,511✔
757
                 psExtraArg->eResampleAlg == GRIORA_Lanczos) &&
6,095✔
758
                GetColorTable() != nullptr)
2,926✔
759
            {
760
                CPLError(CE_Warning, CPLE_NotSupported,
×
761
                         "Resampling method not supported on paletted band. "
762
                         "Falling back to nearest neighbour");
763
            }
764
            else if (psExtraArg->eResampleAlg == GRIORA_Gauss &&
3,048✔
765
                     GDALDataTypeIsComplex(eDataType))
3✔
766
            {
767
                CPLError(CE_Warning, CPLE_NotSupported,
×
768
                         "Resampling method not supported on complex data type "
769
                         "band. Falling back to nearest neighbour");
770
            }
771
            else
772
            {
773
                return RasterIOResampled(eRWFlag, nXOff, nYOff, nXSize, nYSize,
3,045✔
774
                                         pData, nBufXSize, nBufYSize, eBufType,
775
                                         nPixelSpace, nLineSpace, psExtraArg);
3,045✔
776
            }
777
        }
778

779
        int nLimitBlockY = 0;
395,483✔
780
        const bool bByteCopy = eDataType == eBufType && nBandDataSize == 1;
395,483✔
781
        int nStartBlockX = -nBlockXSize;
395,483✔
782
        const double EPS = 1e-10;
395,483✔
783
        int nLBlockY = -1;
395,483✔
784
        const double dfSrcXStart = 0.5 * dfSrcXInc + dfXOff + EPS;
395,483✔
785
        const bool bIntegerXFactor =
395,483✔
786
            bUseIntegerRequestCoords &&
372,806✔
787
            static_cast<int>(dfSrcXInc) == dfSrcXInc &&
669,271✔
788
            static_cast<int>(dfSrcXInc) < INT_MAX / nBandDataSize;
273,788✔
789

790
        /* --------------------------------------------------------------------
791
         */
792
        /*      Read case */
793
        /*      Loop over buffer computing source locations. */
794
        /* --------------------------------------------------------------------
795
         */
796
        for (int iBufYOff = 0; iBufYOff < nBufYSize; iBufYOff++)
2,451,410✔
797
        {
798
            // Add small epsilon to avoid some numeric precision issues.
799
            const double dfSrcY = (iBufYOff + 0.5) * dfSrcYInc + dfYOff + EPS;
2,055,940✔
800
            const int iSrcY = static_cast<int>(std::min(
2,055,940✔
801
                std::max(0.0, dfSrcY), static_cast<double>(nRasterYSize - 1)));
2,055,940✔
802

803
            GPtrDiff_t iBufOffset = static_cast<GPtrDiff_t>(iBufYOff) *
2,055,940✔
804
                                    static_cast<GPtrDiff_t>(nLineSpace);
805

806
            if (iSrcY >= nLimitBlockY)
2,055,940✔
807
            {
808
                nLBlockY = iSrcY / nBlockYSize;
433,624✔
809
                nLimitBlockY = nLBlockY * nBlockYSize;
433,624✔
810
                if (nLimitBlockY < INT_MAX - nBlockYSize)
433,624✔
811
                    nLimitBlockY += nBlockYSize;
433,624✔
812
                else
813
                    nLimitBlockY = INT_MAX;
×
814
                // Make sure a new block is loaded.
815
                nStartBlockX = -nBlockXSize;
433,624✔
816
            }
817
            else if (static_cast<int>(dfSrcXStart) < nStartBlockX)
1,622,320✔
818
            {
819
                // Make sure a new block is loaded.
820
                nStartBlockX = -nBlockXSize;
441,987✔
821
            }
822

823
            GPtrDiff_t iSrcOffsetCst = (iSrcY - nLBlockY * nBlockYSize) *
2,055,940✔
824
                                       static_cast<GPtrDiff_t>(nBlockXSize);
2,055,940✔
825

826
            if (bIntegerXFactor)
2,055,940✔
827
            {
828
                int iSrcX = static_cast<int>(dfSrcXStart);
695,677✔
829
                const int nSrcXInc = static_cast<int>(dfSrcXInc);
695,677✔
830
                GByte *pabyDstData = static_cast<GByte *>(pData) + iBufOffset;
695,677✔
831
                bool bRet = false;
695,677✔
832
                if (bByteCopy)
695,677✔
833
                {
834
                    bRet = DownsamplingIntegerXFactor<true, 1>(
585,768✔
835
                        this, iSrcX, nSrcXInc, iSrcOffsetCst, pabyDstData,
836
                        static_cast<int>(nPixelSpace), nBufXSize, GDT_Byte,
837
                        GDT_Byte, nStartBlockX, nBlockXSize, poBlock, nLBlockY);
838
                }
839
                else if (eDataType == eBufType)
109,909✔
840
                {
841
                    switch (nBandDataSize)
109,704✔
842
                    {
843
                        case 2:
109,624✔
844
                            bRet = DownsamplingIntegerXFactor<true, 2>(
109,624✔
845
                                this, iSrcX, nSrcXInc, iSrcOffsetCst,
846
                                pabyDstData, static_cast<int>(nPixelSpace),
847
                                nBufXSize, eDataType, eDataType, nStartBlockX,
848
                                nBlockXSize, poBlock, nLBlockY);
849
                            break;
109,624✔
850
                        case 4:
22✔
851
                            bRet = DownsamplingIntegerXFactor<true, 4>(
22✔
852
                                this, iSrcX, nSrcXInc, iSrcOffsetCst,
853
                                pabyDstData, static_cast<int>(nPixelSpace),
854
                                nBufXSize, eDataType, eDataType, nStartBlockX,
855
                                nBlockXSize, poBlock, nLBlockY);
856
                            break;
22✔
857
                        case 8:
56✔
858
                            bRet = DownsamplingIntegerXFactor<true, 8>(
56✔
859
                                this, iSrcX, nSrcXInc, iSrcOffsetCst,
860
                                pabyDstData, static_cast<int>(nPixelSpace),
861
                                nBufXSize, eDataType, eDataType, nStartBlockX,
862
                                nBlockXSize, poBlock, nLBlockY);
863
                            break;
56✔
864
                        case 16:
2✔
865
                            bRet = DownsamplingIntegerXFactor<true, 16>(
2✔
866
                                this, iSrcX, nSrcXInc, iSrcOffsetCst,
867
                                pabyDstData, static_cast<int>(nPixelSpace),
868
                                nBufXSize, eDataType, eDataType, nStartBlockX,
869
                                nBlockXSize, poBlock, nLBlockY);
870
                            break;
2✔
871
                        default:
×
872
                            CPLAssert(false);
×
873
                            break;
874
                    }
875
                }
876
                else
877
                {
878
                    bRet = DownsamplingIntegerXFactor<false, 0>(
205✔
879
                        this, iSrcX, nSrcXInc, iSrcOffsetCst, pabyDstData,
880
                        static_cast<int>(nPixelSpace), nBufXSize, eDataType,
881
                        eBufType, nStartBlockX, nBlockXSize, poBlock, nLBlockY);
882
                }
883
                if (!bRet)
695,677✔
884
                    eErr = CE_Failure;
1✔
885
            }
886
            else
887
            {
888
                double dfSrcX = dfSrcXStart;
1,360,260✔
889
                for (int iBufXOff = 0; iBufXOff < nBufXSize;
582,293,000✔
890
                     iBufXOff++, dfSrcX += dfSrcXInc)
580,933,000✔
891
                {
892
                    // TODO?: try to avoid the clamping for most iterations
893
                    const int iSrcX = static_cast<int>(
894
                        std::min(std::max(0.0, dfSrcX),
1,161,870,000✔
895
                                 static_cast<double>(nRasterXSize - 1)));
580,933,000✔
896

897
                    /* --------------------------------------------------------------------
898
                     */
899
                    /*      Ensure we have the appropriate block loaded. */
900
                    /* --------------------------------------------------------------------
901
                     */
902
                    if (iSrcX >= nBlockXSize + nStartBlockX)
580,933,000✔
903
                    {
904
                        const int nLBlockX = iSrcX / nBlockXSize;
1,702,800✔
905
                        nStartBlockX = nLBlockX * nBlockXSize;
1,702,800✔
906

907
                        if (poBlock != nullptr)
1,702,800✔
908
                            poBlock->DropLock();
1,581,100✔
909

910
                        poBlock = GetLockedBlockRef(nLBlockX, nLBlockY, FALSE);
1,702,800✔
911
                        if (poBlock == nullptr)
1,702,800✔
912
                        {
913
                            eErr = CE_Failure;
9✔
914
                            break;
9✔
915
                        }
916

917
                        pabySrcBlock =
918
                            static_cast<GByte *>(poBlock->GetDataRef());
1,702,790✔
919
                    }
920
                    const GPtrDiff_t nDiffX =
580,933,000✔
921
                        static_cast<GPtrDiff_t>(iSrcX - nStartBlockX);
580,933,000✔
922

923
                    /* --------------------------------------------------------------------
924
                     */
925
                    /*      Copy over this pixel of data. */
926
                    /* --------------------------------------------------------------------
927
                     */
928

929
                    if (bByteCopy)
580,933,000✔
930
                    {
931
                        GPtrDiff_t iSrcOffset = nDiffX + iSrcOffsetCst;
527,231,000✔
932
                        static_cast<GByte *>(pData)[iBufOffset] =
527,231,000✔
933
                            pabySrcBlock[iSrcOffset];
527,231,000✔
934
                    }
935
                    else if (eDataType == eBufType)
53,701,600✔
936
                    {
937
                        GPtrDiff_t iSrcOffset =
48,225,600✔
938
                            (nDiffX + iSrcOffsetCst) * nBandDataSize;
48,225,600✔
939
                        memcpy(static_cast<GByte *>(pData) + iBufOffset,
48,225,600✔
940
                               pabySrcBlock + iSrcOffset, nBandDataSize);
48,225,600✔
941
                    }
942
                    else
943
                    {
944
                        // Type to type conversion ...
945
                        GPtrDiff_t iSrcOffset =
5,476,050✔
946
                            (nDiffX + iSrcOffsetCst) * nBandDataSize;
5,476,050✔
947
                        GDALCopyWords64(pabySrcBlock + iSrcOffset, eDataType, 0,
5,476,050✔
948
                                        static_cast<GByte *>(pData) +
949
                                            iBufOffset,
5,476,050✔
950
                                        eBufType, 0, 1);
951
                    }
952

953
                    iBufOffset += static_cast<int>(nPixelSpace);
580,933,000✔
954
                }
955
            }
956
            if (eErr == CE_Failure)
2,055,940✔
957
                break;
11✔
958

959
            if (psExtraArg->pfnProgress != nullptr &&
2,287,020✔
960
                !psExtraArg->pfnProgress(1.0 * (iBufYOff + 1) / nBufYSize, "",
231,086✔
961
                                         psExtraArg->pProgressData))
962
            {
963
                eErr = CE_Failure;
1✔
964
                break;
1✔
965
            }
966
        }
967
    }
968

969
    if (poBlock != nullptr)
562,134✔
970
        poBlock->DropLock();
562,124✔
971

972
    return eErr;
562,134✔
973
}
974

975
/************************************************************************/
976
/*                         GDALRasterIOTransformer()                    */
977
/************************************************************************/
978

979
/** Parameters of the affine (scale + offset, no rotation) mapping applied by
 * GDALRasterIOTransformer() between destination and source pixel
 * coordinates:
 *
 *   src = dst * ratio + offset      (destination -> source)
 *   dst = (src - offset) / ratio    (source -> destination)
 *
 * A default-constructed instance describes the identity transform, so that
 * a field that is accidentally left unset never holds an indeterminate
 * value.
 */
struct GDALRasterIOTransformerStruct
{
    /** Offset added to the scaled X coordinate (destination -> source). */
    double dfXOff = 0.0;
    /** Offset added to the scaled Y coordinate (destination -> source). */
    double dfYOff = 0.0;
    /** Multiplier turning a destination X coordinate into a source one. */
    double dfXRatioDstToSrc = 1.0;
    /** Multiplier turning a destination Y coordinate into a source one. */
    double dfYRatioDstToSrc = 1.0;
};
986

987
static int GDALRasterIOTransformer(void *pTransformerArg, int bDstToSrc,
6,748✔
988
                                   int nPointCount, double *x, double *y,
989
                                   double * /* z */, int *panSuccess)
990
{
991
    GDALRasterIOTransformerStruct *psParams =
6,748✔
992
        static_cast<GDALRasterIOTransformerStruct *>(pTransformerArg);
993
    if (bDstToSrc)
6,748✔
994
    {
995
        for (int i = 0; i < nPointCount; i++)
252,996✔
996
        {
997
            x[i] = x[i] * psParams->dfXRatioDstToSrc + psParams->dfXOff;
246,836✔
998
            y[i] = y[i] * psParams->dfYRatioDstToSrc + psParams->dfYOff;
246,836✔
999
            panSuccess[i] = TRUE;
246,836✔
1000
        }
1001
    }
1002
    else
1003
    {
1004
        for (int i = 0; i < nPointCount; i++)
1,176✔
1005
        {
1006
            x[i] = (x[i] - psParams->dfXOff) / psParams->dfXRatioDstToSrc;
588✔
1007
            y[i] = (y[i] - psParams->dfYOff) / psParams->dfYRatioDstToSrc;
588✔
1008
            panSuccess[i] = TRUE;
588✔
1009
        }
1010
    }
1011
    return TRUE;
6,748✔
1012
}
1013

1014
/************************************************************************/
1015
/*                          RasterIOResampled()                         */
1016
/************************************************************************/
1017

1018
//! @cond Doxygen_Suppress
1019
CPLErr GDALRasterBand::RasterIOResampled(
3,045✔
1020
    GDALRWFlag /* eRWFlag */, int nXOff, int nYOff, int nXSize, int nYSize,
1021
    void *pData, int nBufXSize, int nBufYSize, GDALDataType eBufType,
1022
    GSpacing nPixelSpace, GSpacing nLineSpace, GDALRasterIOExtraArg *psExtraArg)
1023
{
1024
    // Determine if we use warping resampling or overview resampling
1025
    const bool bUseWarp =
1026
        (GDALDataTypeIsComplex(eDataType) &&
3,045✔
1027
         psExtraArg->eResampleAlg != GRIORA_NearestNeighbour &&
3,202✔
1028
         psExtraArg->eResampleAlg != GRIORA_Mode);
157✔
1029

1030
    double dfXOff = nXOff;
3,045✔
1031
    double dfYOff = nYOff;
3,045✔
1032
    double dfXSize = nXSize;
3,045✔
1033
    double dfYSize = nYSize;
3,045✔
1034
    if (psExtraArg->bFloatingPointWindowValidity)
3,045✔
1035
    {
1036
        dfXOff = psExtraArg->dfXOff;
2,586✔
1037
        dfYOff = psExtraArg->dfYOff;
2,586✔
1038
        dfXSize = psExtraArg->dfXSize;
2,586✔
1039
        dfYSize = psExtraArg->dfYSize;
2,586✔
1040
    }
1041

1042
    const double dfXRatioDstToSrc = dfXSize / nBufXSize;
3,045✔
1043
    const double dfYRatioDstToSrc = dfYSize / nBufYSize;
3,045✔
1044

1045
    // Determine the coordinates in the "virtual" output raster to see
1046
    // if there are not integers, in which case we will use them as a shift
1047
    // so that subwindow extracts give the exact same results as entire raster
1048
    // scaling.
1049
    double dfDestXOff = dfXOff / dfXRatioDstToSrc;
3,045✔
1050
    bool bHasXOffVirtual = false;
3,045✔
1051
    int nDestXOffVirtual = 0;
3,045✔
1052
    if (fabs(dfDestXOff - static_cast<int>(dfDestXOff + 0.5)) < 1e-8)
3,045✔
1053
    {
1054
        bHasXOffVirtual = true;
2,717✔
1055
        dfXOff = nXOff;
2,717✔
1056
        nDestXOffVirtual = static_cast<int>(dfDestXOff + 0.5);
2,717✔
1057
    }
1058

1059
    double dfDestYOff = dfYOff / dfYRatioDstToSrc;
3,045✔
1060
    bool bHasYOffVirtual = false;
3,045✔
1061
    int nDestYOffVirtual = 0;
3,045✔
1062
    if (fabs(dfDestYOff - static_cast<int>(dfDestYOff + 0.5)) < 1e-8)
3,045✔
1063
    {
1064
        bHasYOffVirtual = true;
2,713✔
1065
        dfYOff = nYOff;
2,713✔
1066
        nDestYOffVirtual = static_cast<int>(dfDestYOff + 0.5);
2,713✔
1067
    }
1068

1069
    // Create a MEM dataset that wraps the output buffer.
1070
    GDALDataset *poMEMDS;
1071
    void *pTempBuffer = nullptr;
3,045✔
1072
    GSpacing nPSMem = nPixelSpace;
3,045✔
1073
    GSpacing nLSMem = nLineSpace;
3,045✔
1074
    void *pDataMem = pData;
3,045✔
1075
    GDALDataType eDTMem = eBufType;
3,045✔
1076
    if (eBufType != eDataType)
3,045✔
1077
    {
1078
        nPSMem = GDALGetDataTypeSizeBytes(eDataType);
40✔
1079
        nLSMem = nPSMem * nBufXSize;
40✔
1080
        pTempBuffer =
1081
            VSI_MALLOC2_VERBOSE(nBufYSize, static_cast<size_t>(nLSMem));
40✔
1082
        if (pTempBuffer == nullptr)
40✔
1083
            return CE_Failure;
×
1084
        pDataMem = pTempBuffer;
40✔
1085
        eDTMem = eDataType;
40✔
1086
    }
1087

1088
    poMEMDS =
1089
        MEMDataset::Create("", nDestXOffVirtual + nBufXSize,
3,045✔
1090
                           nDestYOffVirtual + nBufYSize, 0, eDTMem, nullptr);
1091
    GByte *pabyData = static_cast<GByte *>(pDataMem) -
3,045✔
1092
                      nPSMem * nDestXOffVirtual - nLSMem * nDestYOffVirtual;
3,045✔
1093
    GDALRasterBandH hMEMBand = MEMCreateRasterBandEx(
3,045✔
1094
        poMEMDS, 1, pabyData, eDTMem, nPSMem, nLSMem, false);
1095
    poMEMDS->SetBand(1, GDALRasterBand::FromHandle(hMEMBand));
3,045✔
1096

1097
    const char *pszNBITS = GetMetadataItem("NBITS", "IMAGE_STRUCTURE");
3,045✔
1098
    const int nNBITS = pszNBITS ? atoi(pszNBITS) : 0;
3,045✔
1099
    if (pszNBITS)
3,045✔
1100
        GDALRasterBand::FromHandle(hMEMBand)->SetMetadataItem(
6✔
1101
            "NBITS", pszNBITS, "IMAGE_STRUCTURE");
6✔
1102

1103
    CPLErr eErr = CE_None;
3,045✔
1104

1105
    // Do the resampling.
1106
    if (bUseWarp)
3,045✔
1107
    {
1108
        int bHasNoData = FALSE;
149✔
1109
        double dfNoDataValue = GetNoDataValue(&bHasNoData);
149✔
1110

1111
        VRTDatasetH hVRTDS = nullptr;
149✔
1112
        GDALRasterBandH hVRTBand = nullptr;
149✔
1113
        if (GetDataset() == nullptr)
149✔
1114
        {
1115
            /* Create VRT dataset that wraps the whole dataset */
1116
            hVRTDS = VRTCreate(nRasterXSize, nRasterYSize);
×
1117
            VRTAddBand(hVRTDS, eDataType, nullptr);
×
1118
            hVRTBand = GDALGetRasterBand(hVRTDS, 1);
×
1119
            VRTAddSimpleSource(hVRTBand, this, 0, 0, nRasterXSize, nRasterYSize,
×
1120
                               0, 0, nRasterXSize, nRasterYSize, nullptr,
1121
                               VRT_NODATA_UNSET);
1122

1123
            /* Add a mask band if needed */
1124
            if (GetMaskFlags() != GMF_ALL_VALID)
×
1125
            {
1126
                GDALDataset::FromHandle(hVRTDS)->CreateMaskBand(0);
×
1127
                VRTSourcedRasterBand *poVRTMaskBand =
1128
                    reinterpret_cast<VRTSourcedRasterBand *>(
1129
                        reinterpret_cast<GDALRasterBand *>(hVRTBand)
1130
                            ->GetMaskBand());
×
1131
                poVRTMaskBand->AddMaskBandSource(this, 0, 0, nRasterXSize,
×
1132
                                                 nRasterYSize, 0, 0,
×
1133
                                                 nRasterXSize, nRasterYSize);
×
1134
            }
1135
        }
1136

1137
        GDALWarpOptions *psWarpOptions = GDALCreateWarpOptions();
149✔
1138
        switch (psExtraArg->eResampleAlg)
149✔
1139
        {
1140
            case GRIORA_NearestNeighbour:
×
1141
                psWarpOptions->eResampleAlg = GRA_NearestNeighbour;
×
1142
                break;
×
1143
            case GRIORA_Bilinear:
147✔
1144
                psWarpOptions->eResampleAlg = GRA_Bilinear;
147✔
1145
                break;
147✔
1146
            case GRIORA_Cubic:
×
1147
                psWarpOptions->eResampleAlg = GRA_Cubic;
×
1148
                break;
×
1149
            case GRIORA_CubicSpline:
×
1150
                psWarpOptions->eResampleAlg = GRA_CubicSpline;
×
1151
                break;
×
1152
            case GRIORA_Lanczos:
×
1153
                psWarpOptions->eResampleAlg = GRA_Lanczos;
×
1154
                break;
×
1155
            case GRIORA_Average:
×
1156
                psWarpOptions->eResampleAlg = GRA_Average;
×
1157
                break;
×
1158
            case GRIORA_RMS:
2✔
1159
                psWarpOptions->eResampleAlg = GRA_RMS;
2✔
1160
                break;
2✔
1161
            case GRIORA_Mode:
×
1162
                psWarpOptions->eResampleAlg = GRA_Mode;
×
1163
                break;
×
1164
            default:
×
1165
                CPLAssert(false);
×
1166
                psWarpOptions->eResampleAlg = GRA_NearestNeighbour;
1167
                break;
1168
        }
1169
        psWarpOptions->hSrcDS = hVRTDS ? hVRTDS : GetDataset();
149✔
1170
        psWarpOptions->hDstDS = poMEMDS;
149✔
1171
        psWarpOptions->nBandCount = 1;
149✔
1172
        int nSrcBandNumber = hVRTDS ? 1 : nBand;
149✔
1173
        int nDstBandNumber = 1;
149✔
1174
        psWarpOptions->panSrcBands = &nSrcBandNumber;
149✔
1175
        psWarpOptions->panDstBands = &nDstBandNumber;
149✔
1176
        psWarpOptions->pfnProgress = psExtraArg->pfnProgress
298✔
1177
                                         ? psExtraArg->pfnProgress
149✔
1178
                                         : GDALDummyProgress;
1179
        psWarpOptions->pProgressArg = psExtraArg->pProgressData;
149✔
1180
        psWarpOptions->pfnTransformer = GDALRasterIOTransformer;
149✔
1181
        if (bHasNoData)
149✔
1182
        {
1183
            psWarpOptions->papszWarpOptions = CSLSetNameValue(
×
1184
                psWarpOptions->papszWarpOptions, "INIT_DEST", "NO_DATA");
1185
            if (psWarpOptions->padfSrcNoDataReal == nullptr)
×
1186
            {
1187
                psWarpOptions->padfSrcNoDataReal =
×
1188
                    static_cast<double *>(CPLMalloc(sizeof(double)));
×
1189
                psWarpOptions->padfSrcNoDataReal[0] = dfNoDataValue;
×
1190
            }
1191

1192
            if (psWarpOptions->padfDstNoDataReal == nullptr)
×
1193
            {
1194
                psWarpOptions->padfDstNoDataReal =
×
1195
                    static_cast<double *>(CPLMalloc(sizeof(double)));
×
1196
                psWarpOptions->padfDstNoDataReal[0] = dfNoDataValue;
×
1197
            }
1198
        }
1199

1200
        GDALRasterIOTransformerStruct sTransformer;
1201
        sTransformer.dfXOff = bHasXOffVirtual ? 0 : dfXOff;
149✔
1202
        sTransformer.dfYOff = bHasYOffVirtual ? 0 : dfYOff;
149✔
1203
        sTransformer.dfXRatioDstToSrc = dfXRatioDstToSrc;
149✔
1204
        sTransformer.dfYRatioDstToSrc = dfYRatioDstToSrc;
149✔
1205
        psWarpOptions->pTransformerArg = &sTransformer;
149✔
1206

1207
        GDALWarpOperationH hWarpOperation =
1208
            GDALCreateWarpOperation(psWarpOptions);
149✔
1209
        eErr = GDALChunkAndWarpImage(hWarpOperation, nDestXOffVirtual,
149✔
1210
                                     nDestYOffVirtual, nBufXSize, nBufYSize);
1211
        GDALDestroyWarpOperation(hWarpOperation);
149✔
1212

1213
        psWarpOptions->panSrcBands = nullptr;
149✔
1214
        psWarpOptions->panDstBands = nullptr;
149✔
1215
        GDALDestroyWarpOptions(psWarpOptions);
149✔
1216

1217
        if (hVRTDS)
149✔
1218
            GDALClose(hVRTDS);
×
1219
    }
1220
    else
1221
    {
1222
        const char *pszResampling =
2,896✔
1223
            (psExtraArg->eResampleAlg == GRIORA_Bilinear)      ? "BILINEAR"
3,552✔
1224
            : (psExtraArg->eResampleAlg == GRIORA_Cubic)       ? "CUBIC"
780✔
1225
            : (psExtraArg->eResampleAlg == GRIORA_CubicSpline) ? "CUBICSPLINE"
246✔
1226
            : (psExtraArg->eResampleAlg == GRIORA_Lanczos)     ? "LANCZOS"
239✔
1227
            : (psExtraArg->eResampleAlg == GRIORA_Average)     ? "AVERAGE"
172✔
1228
            : (psExtraArg->eResampleAlg == GRIORA_RMS)         ? "RMS"
95✔
1229
            : (psExtraArg->eResampleAlg == GRIORA_Mode)        ? "MODE"
43✔
1230
            : (psExtraArg->eResampleAlg == GRIORA_Gauss)       ? "GAUSS"
3✔
1231
                                                               : "UNKNOWN";
1232

1233
        int nKernelRadius = 0;
2,896✔
1234
        GDALResampleFunction pfnResampleFunc =
1235
            GDALGetResampleFunction(pszResampling, &nKernelRadius);
2,896✔
1236
        CPLAssert(pfnResampleFunc);
2,896✔
1237
        GDALDataType eWrkDataType =
1238
            GDALGetOvrWorkDataType(pszResampling, eDataType);
2,896✔
1239
        int nHasNoData = 0;
2,896✔
1240
        double dfNoDataValue = GetNoDataValue(&nHasNoData);
2,896✔
1241
        const bool bHasNoData = CPL_TO_BOOL(nHasNoData);
2,896✔
1242
        if (!bHasNoData)
2,896✔
1243
            dfNoDataValue = 0.0;
2,806✔
1244

1245
        int nDstBlockXSize = nBufXSize;
2,896✔
1246
        int nDstBlockYSize = nBufYSize;
2,896✔
1247
        int nFullResXChunk = 0;
2,896✔
1248
        int nFullResYChunk = 0;
2,896✔
1249
        while (true)
1250
        {
1251
            nFullResXChunk =
2,907✔
1252
                3 + static_cast<int>(nDstBlockXSize * dfXRatioDstToSrc);
2,907✔
1253
            nFullResYChunk =
2,907✔
1254
                3 + static_cast<int>(nDstBlockYSize * dfYRatioDstToSrc);
2,907✔
1255
            if (nFullResXChunk > nRasterXSize)
2,907✔
1256
                nFullResXChunk = nRasterXSize;
2,664✔
1257
            if (nFullResYChunk > nRasterYSize)
2,907✔
1258
                nFullResYChunk = nRasterYSize;
265✔
1259
            if ((nDstBlockXSize == 1 && nDstBlockYSize == 1) ||
2,907✔
1260
                (static_cast<GIntBig>(nFullResXChunk) * nFullResYChunk <=
2,853✔
1261
                 1024 * 1024))
1262
                break;
1263
            // When operating on the full width of a raster whose block width is
1264
            // the raster width, prefer doing chunks in height.
1265
            if (nFullResXChunk >= nXSize && nXSize == nBlockXSize &&
11✔
1266
                nDstBlockYSize > 1)
1267
                nDstBlockYSize /= 2;
×
1268
            /* Otherwise cut the maximal dimension */
1269
            else if (nDstBlockXSize > 1 &&
11✔
1270
                     (nFullResXChunk > nFullResYChunk || nDstBlockYSize == 1))
×
1271
                nDstBlockXSize /= 2;
11✔
1272
            else
1273
                nDstBlockYSize /= 2;
×
1274
        }
1275

1276
        int nOvrXFactor = static_cast<int>(0.5 + dfXRatioDstToSrc);
2,896✔
1277
        int nOvrYFactor = static_cast<int>(0.5 + dfYRatioDstToSrc);
2,896✔
1278
        if (nOvrXFactor == 0)
2,896✔
1279
            nOvrXFactor = 1;
2,029✔
1280
        if (nOvrYFactor == 0)
2,896✔
1281
            nOvrYFactor = 1;
2,028✔
1282
        int nFullResXSizeQueried =
2,896✔
1283
            nFullResXChunk + 2 * nKernelRadius * nOvrXFactor;
2,896✔
1284
        int nFullResYSizeQueried =
2,896✔
1285
            nFullResYChunk + 2 * nKernelRadius * nOvrYFactor;
2,896✔
1286

1287
        if (nFullResXSizeQueried > nRasterXSize)
2,896✔
1288
            nFullResXSizeQueried = nRasterXSize;
2,556✔
1289
        if (nFullResYSizeQueried > nRasterYSize)
2,896✔
1290
            nFullResYSizeQueried = nRasterYSize;
154✔
1291

1292
        void *pChunk =
1293
            VSI_MALLOC3_VERBOSE(GDALGetDataTypeSizeBytes(eWrkDataType),
2,896✔
1294
                                nFullResXSizeQueried, nFullResYSizeQueried);
1295
        GByte *pabyChunkNoDataMask = nullptr;
2,896✔
1296

1297
        GDALRasterBand *poMaskBand = GetMaskBand();
2,896✔
1298
        int l_nMaskFlags = GetMaskFlags();
2,896✔
1299

1300
        bool bUseNoDataMask = ((l_nMaskFlags & GMF_ALL_VALID) == 0);
2,896✔
1301
        if (bUseNoDataMask)
2,896✔
1302
        {
1303
            pabyChunkNoDataMask = static_cast<GByte *>(VSI_MALLOC2_VERBOSE(
158✔
1304
                nFullResXSizeQueried, nFullResYSizeQueried));
1305
        }
1306
        if (pChunk == nullptr ||
2,896✔
1307
            (bUseNoDataMask && pabyChunkNoDataMask == nullptr))
158✔
1308
        {
1309
            GDALClose(poMEMDS);
×
1310
            CPLFree(pChunk);
×
1311
            CPLFree(pabyChunkNoDataMask);
×
1312
            VSIFree(pTempBuffer);
×
1313
            return CE_Failure;
×
1314
        }
1315

1316
        const int nTotalBlocks = DIV_ROUND_UP(nBufXSize, nDstBlockXSize) *
2,896✔
1317
                                 DIV_ROUND_UP(nBufYSize, nDstBlockYSize);
2,896✔
1318
        int nBlocksDone = 0;
2,896✔
1319

1320
        int nDstYOff;
1321
        for (nDstYOff = 0; nDstYOff < nBufYSize && eErr == CE_None;
5,792✔
1322
             nDstYOff += nDstBlockYSize)
2,896✔
1323
        {
1324
            int nDstYCount;
1325
            if (nDstYOff + nDstBlockYSize <= nBufYSize)
2,896✔
1326
                nDstYCount = nDstBlockYSize;
2,896✔
1327
            else
1328
                nDstYCount = nBufYSize - nDstYOff;
×
1329

1330
            int nChunkYOff =
2,896✔
1331
                nYOff + static_cast<int>(nDstYOff * dfYRatioDstToSrc);
2,896✔
1332
            int nChunkYOff2 = nYOff + 1 +
2,896✔
1333
                              static_cast<int>(ceil((nDstYOff + nDstYCount) *
2,896✔
1334
                                                    dfYRatioDstToSrc));
1335
            if (nChunkYOff2 > nRasterYSize)
2,896✔
1336
                nChunkYOff2 = nRasterYSize;
377✔
1337
            int nYCount = nChunkYOff2 - nChunkYOff;
2,896✔
1338
            CPLAssert(nYCount <= nFullResYChunk);
2,896✔
1339

1340
            int nChunkYOffQueried = nChunkYOff - nKernelRadius * nOvrYFactor;
2,896✔
1341
            int nChunkYSizeQueried = nYCount + 2 * nKernelRadius * nOvrYFactor;
2,896✔
1342
            if (nChunkYOffQueried < 0)
2,896✔
1343
            {
1344
                nChunkYSizeQueried += nChunkYOffQueried;
277✔
1345
                nChunkYOffQueried = 0;
277✔
1346
            }
1347
            if (nChunkYSizeQueried + nChunkYOffQueried > nRasterYSize)
2,896✔
1348
                nChunkYSizeQueried = nRasterYSize - nChunkYOffQueried;
380✔
1349
            CPLAssert(nChunkYSizeQueried <= nFullResYSizeQueried);
2,896✔
1350

1351
            int nDstXOff = 0;
2,896✔
1352
            for (nDstXOff = 0; nDstXOff < nBufXSize && eErr == CE_None;
5,792✔
1353
                 nDstXOff += nDstBlockXSize)
2,896✔
1354
            {
1355
                int nDstXCount = 0;
2,896✔
1356
                if (nDstXOff + nDstBlockXSize <= nBufXSize)
2,896✔
1357
                    nDstXCount = nDstBlockXSize;
2,896✔
1358
                else
1359
                    nDstXCount = nBufXSize - nDstXOff;
×
1360

1361
                int nChunkXOff =
2,896✔
1362
                    nXOff + static_cast<int>(nDstXOff * dfXRatioDstToSrc);
2,896✔
1363
                int nChunkXOff2 =
2,896✔
1364
                    nXOff + 1 +
2,896✔
1365
                    static_cast<int>(
2,896✔
1366
                        ceil((nDstXOff + nDstXCount) * dfXRatioDstToSrc));
2,896✔
1367
                if (nChunkXOff2 > nRasterXSize)
2,896✔
1368
                    nChunkXOff2 = nRasterXSize;
2,677✔
1369
                int nXCount = nChunkXOff2 - nChunkXOff;
2,896✔
1370
                CPLAssert(nXCount <= nFullResXChunk);
2,896✔
1371

1372
                int nChunkXOffQueried =
2,896✔
1373
                    nChunkXOff - nKernelRadius * nOvrXFactor;
2,896✔
1374
                int nChunkXSizeQueried =
2,896✔
1375
                    nXCount + 2 * nKernelRadius * nOvrXFactor;
2,896✔
1376
                if (nChunkXOffQueried < 0)
2,896✔
1377
                {
1378
                    nChunkXSizeQueried += nChunkXOffQueried;
2,581✔
1379
                    nChunkXOffQueried = 0;
2,581✔
1380
                }
1381
                if (nChunkXSizeQueried + nChunkXOffQueried > nRasterXSize)
2,896✔
1382
                    nChunkXSizeQueried = nRasterXSize - nChunkXOffQueried;
2,567✔
1383
                CPLAssert(nChunkXSizeQueried <= nFullResXSizeQueried);
2,896✔
1384

1385
                // Read the source buffers.
1386
                eErr = RasterIO(GF_Read, nChunkXOffQueried, nChunkYOffQueried,
2,896✔
1387
                                nChunkXSizeQueried, nChunkYSizeQueried, pChunk,
1388
                                nChunkXSizeQueried, nChunkYSizeQueried,
1389
                                eWrkDataType, 0, 0, nullptr);
1390

1391
                bool bSkipResample = false;
2,896✔
1392
                bool bNoDataMaskFullyOpaque = false;
2,896✔
1393
                if (eErr == CE_None && bUseNoDataMask)
2,896✔
1394
                {
1395
                    eErr = poMaskBand->RasterIO(
158✔
1396
                        GF_Read, nChunkXOffQueried, nChunkYOffQueried,
1397
                        nChunkXSizeQueried, nChunkYSizeQueried,
1398
                        pabyChunkNoDataMask, nChunkXSizeQueried,
1399
                        nChunkYSizeQueried, GDT_Byte, 0, 0, nullptr);
1400

1401
                    /* Optimizations if mask is fully opaque or transparent */
1402
                    int nPixels = nChunkXSizeQueried * nChunkYSizeQueried;
158✔
1403
                    GByte bVal = pabyChunkNoDataMask[0];
158✔
1404
                    int i = 1;
158✔
1405
                    for (; i < nPixels; i++)
3,751,650✔
1406
                    {
1407
                        if (pabyChunkNoDataMask[i] != bVal)
3,751,590✔
1408
                            break;
104✔
1409
                    }
1410
                    if (i == nPixels)
158✔
1411
                    {
1412
                        if (bVal == 0)
54✔
1413
                        {
1414
                            for (int j = 0; j < nDstYCount; j++)
712✔
1415
                            {
1416
                                GDALCopyWords64(&dfNoDataValue, GDT_Float64, 0,
686✔
1417
                                                static_cast<GByte *>(pDataMem) +
1418
                                                    nLSMem * (j + nDstYOff) +
686✔
1419
                                                    nDstXOff * nPSMem,
686✔
1420
                                                eDTMem,
1421
                                                static_cast<int>(nPSMem),
1422
                                                nDstXCount);
1423
                            }
1424
                            bSkipResample = true;
26✔
1425
                        }
1426
                        else
1427
                        {
1428
                            bNoDataMaskFullyOpaque = true;
28✔
1429
                        }
1430
                    }
1431
                }
1432

1433
                if (!bSkipResample && eErr == CE_None)
2,896✔
1434
                {
1435
                    const bool bPropagateNoData = false;
2,867✔
1436
                    void *pDstBuffer = nullptr;
2,867✔
1437
                    GDALDataType eDstBufferDataType = GDT_Unknown;
2,867✔
1438
                    GDALRasterBand *poMEMBand =
1439
                        GDALRasterBand::FromHandle(hMEMBand);
2,867✔
1440
                    GDALOverviewResampleArgs args;
2,867✔
1441
                    args.eSrcDataType = eDataType;
2,867✔
1442
                    args.eOvrDataType = poMEMBand->GetRasterDataType();
2,867✔
1443
                    args.nOvrXSize = poMEMBand->GetXSize();
2,867✔
1444
                    args.nOvrYSize = poMEMBand->GetYSize();
2,867✔
1445
                    args.nOvrNBITS = nNBITS;
2,867✔
1446
                    args.dfXRatioDstToSrc = dfXRatioDstToSrc;
2,867✔
1447
                    args.dfYRatioDstToSrc = dfYRatioDstToSrc;
2,867✔
1448
                    args.dfSrcXDelta =
2,867✔
1449
                        dfXOff - nXOff; /* == 0 if bHasXOffVirtual */
2,867✔
1450
                    args.dfSrcYDelta =
2,867✔
1451
                        dfYOff - nYOff; /* == 0 if bHasYOffVirtual */
2,867✔
1452
                    args.eWrkDataType = eWrkDataType;
2,867✔
1453
                    args.pabyChunkNodataMask =
2,867✔
1454
                        bNoDataMaskFullyOpaque ? nullptr : pabyChunkNoDataMask;
2,867✔
1455
                    args.nChunkXOff =
2,867✔
1456
                        nChunkXOffQueried - (bHasXOffVirtual ? 0 : nXOff);
2,867✔
1457
                    args.nChunkXSize = nChunkXSizeQueried;
2,867✔
1458
                    args.nChunkYOff =
2,867✔
1459
                        nChunkYOffQueried - (bHasYOffVirtual ? 0 : nYOff);
2,867✔
1460
                    args.nChunkYSize = nChunkYSizeQueried;
2,867✔
1461
                    args.nDstXOff = nDstXOff + nDestXOffVirtual;
2,867✔
1462
                    args.nDstXOff2 = nDstXOff + nDestXOffVirtual + nDstXCount;
2,867✔
1463
                    args.nDstYOff = nDstYOff + nDestYOffVirtual;
2,867✔
1464
                    args.nDstYOff2 = nDstYOff + nDestYOffVirtual + nDstYCount;
2,867✔
1465
                    args.pszResampling = pszResampling;
2,867✔
1466
                    args.bHasNoData = bHasNoData;
2,867✔
1467
                    args.dfNoDataValue = dfNoDataValue;
2,867✔
1468
                    args.poColorTable = GetColorTable();
2,867✔
1469
                    args.bPropagateNoData = bPropagateNoData;
2,867✔
1470
                    eErr = pfnResampleFunc(args, pChunk, &pDstBuffer,
2,867✔
1471
                                           &eDstBufferDataType);
1472
                    if (eErr == CE_None)
2,867✔
1473
                    {
1474
                        eErr = poMEMBand->RasterIO(
2,867✔
1475
                            GF_Write, nDstXOff + nDestXOffVirtual,
1476
                            nDstYOff + nDestYOffVirtual, nDstXCount, nDstYCount,
1477
                            pDstBuffer, nDstXCount, nDstYCount,
1478
                            eDstBufferDataType, 0, 0, nullptr);
1479
                    }
1480
                    CPLFree(pDstBuffer);
2,867✔
1481
                }
1482

1483
                nBlocksDone++;
2,896✔
1484
                if (eErr == CE_None && psExtraArg->pfnProgress != nullptr &&
3,321✔
1485
                    !psExtraArg->pfnProgress(1.0 * nBlocksDone / nTotalBlocks,
425✔
1486
                                             "", psExtraArg->pProgressData))
1487
                {
1488
                    eErr = CE_Failure;
1✔
1489
                }
1490
            }
1491
        }
1492

1493
        CPLFree(pChunk);
2,896✔
1494
        CPLFree(pabyChunkNoDataMask);
2,896✔
1495
    }
1496

1497
    if (eBufType != eDataType)
3,045✔
1498
    {
1499
        CPL_IGNORE_RET_VAL(poMEMDS->GetRasterBand(1)->RasterIO(
40✔
1500
            GF_Read, nDestXOffVirtual, nDestYOffVirtual, nBufXSize, nBufYSize,
1501
            pData, nBufXSize, nBufYSize, eBufType, nPixelSpace, nLineSpace,
1502
            nullptr));
1503
    }
1504
    GDALClose(poMEMDS);
3,045✔
1505
    VSIFree(pTempBuffer);
3,045✔
1506

1507
    return eErr;
3,045✔
1508
}
1509

1510
/************************************************************************/
1511
/*                          RasterIOResampled()                         */
1512
/************************************************************************/
1513

1514
CPLErr GDALDataset::RasterIOResampled(
757✔
1515
    GDALRWFlag /* eRWFlag */, int nXOff, int nYOff, int nXSize, int nYSize,
1516
    void *pData, int nBufXSize, int nBufYSize, GDALDataType eBufType,
1517
    int nBandCount, const int *panBandMap, GSpacing nPixelSpace,
1518
    GSpacing nLineSpace, GSpacing nBandSpace, GDALRasterIOExtraArg *psExtraArg)
1519

1520
{
1521
#if 0
1522
    // Determine if we use warping resampling or overview resampling
1523
    bool bUseWarp = false;
1524
    if( GDALDataTypeIsComplex( eDataType ) )
1525
        bUseWarp = true;
1526
#endif
1527

1528
    double dfXOff = nXOff;
757✔
1529
    double dfYOff = nYOff;
757✔
1530
    double dfXSize = nXSize;
757✔
1531
    double dfYSize = nYSize;
757✔
1532
    if (psExtraArg->bFloatingPointWindowValidity)
757✔
1533
    {
1534
        dfXOff = psExtraArg->dfXOff;
637✔
1535
        dfYOff = psExtraArg->dfYOff;
637✔
1536
        dfXSize = psExtraArg->dfXSize;
637✔
1537
        dfYSize = psExtraArg->dfYSize;
637✔
1538
    }
1539

1540
    const double dfXRatioDstToSrc = dfXSize / nBufXSize;
757✔
1541
    const double dfYRatioDstToSrc = dfYSize / nBufYSize;
757✔
1542

1543
    // Determine the coordinates in the "virtual" output raster to see
1544
    // if they are integers, in which case we will use them as a shift
1545
    // so that subwindow extracts give the exact same results as entire raster
1546
    // scaling.
1547
    double dfDestXOff = dfXOff / dfXRatioDstToSrc;
757✔
1548
    bool bHasXOffVirtual = false;
757✔
1549
    int nDestXOffVirtual = 0;
757✔
1550
    if (fabs(dfDestXOff - static_cast<int>(dfDestXOff + 0.5)) < 1e-8)
757✔
1551
    {
1552
        bHasXOffVirtual = true;
628✔
1553
        dfXOff = nXOff;
628✔
1554
        nDestXOffVirtual = static_cast<int>(dfDestXOff + 0.5);
628✔
1555
    }
1556

1557
    double dfDestYOff = dfYOff / dfYRatioDstToSrc;
757✔
1558
    bool bHasYOffVirtual = false;
757✔
1559
    int nDestYOffVirtual = 0;
757✔
1560
    if (fabs(dfDestYOff - static_cast<int>(dfDestYOff + 0.5)) < 1e-8)
757✔
1561
    {
1562
        bHasYOffVirtual = true;
587✔
1563
        dfYOff = nYOff;
587✔
1564
        nDestYOffVirtual = static_cast<int>(dfDestYOff + 0.5);
587✔
1565
    }
1566

1567
    // Create a MEM dataset that wraps the output buffer.
1568
    GDALDataset *poMEMDS =
1569
        MEMDataset::Create("", nDestXOffVirtual + nBufXSize,
757✔
1570
                           nDestYOffVirtual + nBufYSize, 0, eBufType, nullptr);
1571
    GDALRasterBand **papoDstBands = static_cast<GDALRasterBand **>(
1572
        CPLMalloc(nBandCount * sizeof(GDALRasterBand *)));
751✔
1573
    int nNBITS = 0;
751✔
1574
    for (int i = 0; i < nBandCount; i++)
2,336✔
1575
    {
1576
        char szBuffer[32] = {'\0'};
1,585✔
1577
        int nRet = CPLPrintPointer(
3,173✔
1578
            szBuffer,
1579
            static_cast<GByte *>(pData) - nPixelSpace * nDestXOffVirtual -
1,585✔
1580
                nLineSpace * nDestYOffVirtual + nBandSpace * i,
1,585✔
1581
            sizeof(szBuffer));
1582
        szBuffer[nRet] = 0;
1,588✔
1583

1584
        char szBuffer0[64] = {'\0'};
1,588✔
1585
        snprintf(szBuffer0, sizeof(szBuffer0), "DATAPOINTER=%s", szBuffer);
1,588✔
1586

1587
        char szBuffer1[64] = {'\0'};
1,588✔
1588
        snprintf(szBuffer1, sizeof(szBuffer1), "PIXELOFFSET=" CPL_FRMT_GIB,
1,588✔
1589
                 static_cast<GIntBig>(nPixelSpace));
1590

1591
        char szBuffer2[64] = {'\0'};
1,588✔
1592
        snprintf(szBuffer2, sizeof(szBuffer2), "LINEOFFSET=" CPL_FRMT_GIB,
1,588✔
1593
                 static_cast<GIntBig>(nLineSpace));
1594

1595
        char *apszOptions[4] = {szBuffer0, szBuffer1, szBuffer2, nullptr};
1,588✔
1596

1597
        poMEMDS->AddBand(eBufType, apszOptions);
1,588✔
1598

1599
        GDALRasterBand *poSrcBand = GetRasterBand(panBandMap[i]);
1,595✔
1600
        papoDstBands[i] = poMEMDS->GetRasterBand(i + 1);
1,581✔
1601
        const char *pszNBITS =
1602
            poSrcBand->GetMetadataItem("NBITS", "IMAGE_STRUCTURE");
1,578✔
1603
        if (pszNBITS)
1,581✔
1604
        {
1605
            nNBITS = atoi(pszNBITS);
×
1606
            poMEMDS->GetRasterBand(i + 1)->SetMetadataItem("NBITS", pszNBITS,
×
1607
                                                           "IMAGE_STRUCTURE");
×
1608
        }
1609
    }
1610

1611
    CPLErr eErr = CE_None;
751✔
1612

1613
    // TODO(schwehr): Why disabled?  Why not just delete?
1614
    // Looks like this code was initially added as disabled by copying
1615
    // from RasterIO here:
1616
    // https://trac.osgeo.org/gdal/changeset/29572
1617
#if 0
1618
    // Do the resampling.
1619
    if( bUseWarp )
1620
    {
1621
        VRTDatasetH hVRTDS = nullptr;
1622
        GDALRasterBandH hVRTBand = nullptr;
1623
        if( GetDataset() == nullptr )
1624
        {
1625
            /* Create VRT dataset that wraps the whole dataset */
1626
            hVRTDS = VRTCreate(nRasterXSize, nRasterYSize);
1627
            VRTAddBand( hVRTDS, eDataType, nullptr );
1628
            hVRTBand = GDALGetRasterBand(hVRTDS, 1);
1629
            VRTAddSimpleSource( (VRTSourcedRasterBandH)hVRTBand,
1630
                                (GDALRasterBandH)this,
1631
                                0, 0,
1632
                                nRasterXSize, nRasterYSize,
1633
                                0, 0,
1634
                                nRasterXSize, nRasterYSize,
1635
                                nullptr, VRT_NODATA_UNSET );
1636

1637
            /* Add a mask band if needed */
1638
            if( GetMaskFlags() != GMF_ALL_VALID )
1639
            {
1640
                ((GDALDataset*)hVRTDS)->CreateMaskBand(0);
1641
                VRTSourcedRasterBand* poVRTMaskBand =
1642
                    (VRTSourcedRasterBand*)(((GDALRasterBand*)hVRTBand)->GetMaskBand());
1643
                poVRTMaskBand->
1644
                    AddMaskBandSource( this,
1645
                                    0, 0,
1646
                                    nRasterXSize, nRasterYSize,
1647
                                    0, 0,
1648
                                    nRasterXSize, nRasterYSize);
1649
            }
1650
        }
1651

1652
        GDALWarpOptions* psWarpOptions = GDALCreateWarpOptions();
1653
        psWarpOptions->eResampleAlg = (GDALResampleAlg)psExtraArg->eResampleAlg;
1654
        psWarpOptions->hSrcDS = (GDALDatasetH) (hVRTDS ? hVRTDS : GetDataset());
1655
        psWarpOptions->hDstDS = (GDALDatasetH) poMEMDS;
1656
        psWarpOptions->nBandCount = 1;
1657
        int nSrcBandNumber = (hVRTDS ? 1 : nBand);
1658
        int nDstBandNumber = 1;
1659
        psWarpOptions->panSrcBands = &nSrcBandNumber;
1660
        psWarpOptions->panDstBands = &nDstBandNumber;
1661
        psWarpOptions->pfnProgress = psExtraArg->pfnProgress ?
1662
                    psExtraArg->pfnProgress : GDALDummyProgress;
1663
        psWarpOptions->pProgressArg = psExtraArg->pProgressData;
1664
        psWarpOptions->pfnTransformer = GDALRasterIOTransformer;
1665
        GDALRasterIOTransformerStruct sTransformer;
1666
        sTransformer.dfXOff = bHasXOffVirtual ? 0 : dfXOff;
1667
        sTransformer.dfYOff = bHasYOffVirtual ? 0 : dfYOff;
1668
        sTransformer.dfXRatioDstToSrc = dfXRatioDstToSrc;
1669
        sTransformer.dfYRatioDstToSrc = dfYRatioDstToSrc;
1670
        psWarpOptions->pTransformerArg = &sTransformer;
1671

1672
        GDALWarpOperationH hWarpOperation = GDALCreateWarpOperation(psWarpOptions);
1673
        eErr = GDALChunkAndWarpImage( hWarpOperation,
1674
                                      nDestXOffVirtual, nDestYOffVirtual,
1675
                                      nBufXSize, nBufYSize );
1676
        GDALDestroyWarpOperation( hWarpOperation );
1677

1678
        psWarpOptions->panSrcBands = nullptr;
1679
        psWarpOptions->panDstBands = nullptr;
1680
        GDALDestroyWarpOptions( psWarpOptions );
1681

1682
        if( hVRTDS )
1683
            GDALClose(hVRTDS);
1684
    }
1685
    else
1686
#endif
1687
    {
1688
        const char *pszResampling =
751✔
1689
            (psExtraArg->eResampleAlg == GRIORA_Bilinear)      ? "BILINEAR"
1,384✔
1690
            : (psExtraArg->eResampleAlg == GRIORA_Cubic)       ? "CUBIC"
633✔
1691
            : (psExtraArg->eResampleAlg == GRIORA_CubicSpline) ? "CUBICSPLINE"
×
1692
            : (psExtraArg->eResampleAlg == GRIORA_Lanczos)     ? "LANCZOS"
×
1693
            : (psExtraArg->eResampleAlg == GRIORA_Average)     ? "AVERAGE"
×
1694
            : (psExtraArg->eResampleAlg == GRIORA_RMS)         ? "RMS"
×
1695
            : (psExtraArg->eResampleAlg == GRIORA_Mode)        ? "MODE"
×
1696
            : (psExtraArg->eResampleAlg == GRIORA_Gauss)       ? "GAUSS"
×
1697
                                                               : "UNKNOWN";
1698

1699
        GDALRasterBand *poFirstSrcBand = GetRasterBand(panBandMap[0]);
751✔
1700
        GDALDataType eDataType = poFirstSrcBand->GetRasterDataType();
748✔
1701
        int nBlockXSize, nBlockYSize;
1702
        poFirstSrcBand->GetBlockSize(&nBlockXSize, &nBlockYSize);
749✔
1703

1704
        int nKernelRadius;
1705
        GDALResampleFunction pfnResampleFunc =
1706
            GDALGetResampleFunction(pszResampling, &nKernelRadius);
748✔
1707
        CPLAssert(pfnResampleFunc);
747✔
1708
#ifdef GDAL_ENABLE_RESAMPLING_MULTIBAND
1709
        GDALResampleFunctionMultiBands pfnResampleFuncMultiBands =
1710
            GDALGetResampleFunctionMultiBands(pszResampling, &nKernelRadius);
1711
#endif
1712
        GDALDataType eWrkDataType =
1713
            GDALGetOvrWorkDataType(pszResampling, eDataType);
747✔
1714

1715
        int nDstBlockXSize = nBufXSize;
740✔
1716
        int nDstBlockYSize = nBufYSize;
740✔
1717
        int nFullResXChunk, nFullResYChunk;
1718
        while (true)
1719
        {
1720
            nFullResXChunk =
740✔
1721
                3 + static_cast<int>(nDstBlockXSize * dfXRatioDstToSrc);
740✔
1722
            nFullResYChunk =
740✔
1723
                3 + static_cast<int>(nDstBlockYSize * dfYRatioDstToSrc);
740✔
1724
            if (nFullResXChunk > nRasterXSize)
740✔
1725
                nFullResXChunk = nRasterXSize;
562✔
1726
            if (nFullResYChunk > nRasterYSize)
740✔
1727
                nFullResYChunk = nRasterYSize;
43✔
1728
            if ((nDstBlockXSize == 1 && nDstBlockYSize == 1) ||
740✔
1729
                (static_cast<GIntBig>(nFullResXChunk) * nFullResYChunk <=
738✔
1730
                 1024 * 1024))
1731
                break;
1732
            // When operating on the full width of a raster whose block width is
1733
            // the raster width, prefer doing chunks in height.
1734
            if (nFullResXChunk >= nXSize && nXSize == nBlockXSize &&
×
1735
                nDstBlockYSize > 1)
1736
                nDstBlockYSize /= 2;
×
1737
            /* Otherwise cut the maximal dimension */
1738
            else if (nDstBlockXSize > 1 &&
×
1739
                     (nFullResXChunk > nFullResYChunk || nDstBlockYSize == 1))
×
1740
                nDstBlockXSize /= 2;
×
1741
            else
1742
                nDstBlockYSize /= 2;
×
1743
        }
1744

1745
        int nOvrFactor = std::max(static_cast<int>(0.5 + dfXRatioDstToSrc),
1,488✔
1746
                                  static_cast<int>(0.5 + dfYRatioDstToSrc));
740✔
1747
        if (nOvrFactor == 0)
748✔
1748
            nOvrFactor = 1;
98✔
1749
        int nFullResXSizeQueried =
748✔
1750
            nFullResXChunk + 2 * nKernelRadius * nOvrFactor;
748✔
1751
        int nFullResYSizeQueried =
748✔
1752
            nFullResYChunk + 2 * nKernelRadius * nOvrFactor;
748✔
1753

1754
        if (nFullResXSizeQueried > nRasterXSize)
748✔
1755
            nFullResXSizeQueried = nRasterXSize;
593✔
1756
        if (nFullResYSizeQueried > nRasterYSize)
748✔
1757
            nFullResYSizeQueried = nRasterYSize;
46✔
1758

1759
        void *pChunk = VSI_MALLOC3_VERBOSE(
748✔
1760
            cpl::fits_on<int>(GDALGetDataTypeSizeBytes(eWrkDataType) *
1761
                              nBandCount),
1762
            nFullResXSizeQueried, nFullResYSizeQueried);
1763
        GByte *pabyChunkNoDataMask = nullptr;
757✔
1764

1765
        GDALRasterBand *poMaskBand = poFirstSrcBand->GetMaskBand();
757✔
1766
        int nMaskFlags = poFirstSrcBand->GetMaskFlags();
755✔
1767

1768
        bool bUseNoDataMask = ((nMaskFlags & GMF_ALL_VALID) == 0);
752✔
1769
        if (bUseNoDataMask)
752✔
1770
        {
1771
            pabyChunkNoDataMask = static_cast<GByte *>(VSI_MALLOC2_VERBOSE(
489✔
1772
                nFullResXSizeQueried, nFullResYSizeQueried));
1773
        }
1774
        if (pChunk == nullptr ||
752✔
1775
            (bUseNoDataMask && pabyChunkNoDataMask == nullptr))
489✔
1776
        {
1777
            GDALClose(poMEMDS);
12✔
1778
            CPLFree(pChunk);
×
1779
            CPLFree(pabyChunkNoDataMask);
×
1780
            CPLFree(papoDstBands);
×
1781
            return CE_Failure;
×
1782
        }
1783

1784
        const int nTotalBlocks = DIV_ROUND_UP(nBufXSize, nDstBlockXSize) *
740✔
1785
                                 DIV_ROUND_UP(nBufYSize, nDstBlockYSize);
740✔
1786
        int nBlocksDone = 0;
740✔
1787

1788
        int nDstYOff;
1789
        for (nDstYOff = 0; nDstYOff < nBufYSize && eErr == CE_None;
1,501✔
1790
             nDstYOff += nDstBlockYSize)
761✔
1791
        {
1792
            int nDstYCount;
1793
            if (nDstYOff + nDstBlockYSize <= nBufYSize)
739✔
1794
                nDstYCount = nDstBlockYSize;
744✔
1795
            else
UNCOV
1796
                nDstYCount = nBufYSize - nDstYOff;
×
1797

1798
            int nChunkYOff =
739✔
1799
                nYOff + static_cast<int>(nDstYOff * dfYRatioDstToSrc);
739✔
1800
            int nChunkYOff2 = nYOff + 1 +
739✔
1801
                              static_cast<int>(ceil((nDstYOff + nDstYCount) *
739✔
1802
                                                    dfYRatioDstToSrc));
1803
            if (nChunkYOff2 > nRasterYSize)
739✔
1804
                nChunkYOff2 = nRasterYSize;
89✔
1805
            int nYCount = nChunkYOff2 - nChunkYOff;
739✔
1806
            CPLAssert(nYCount <= nFullResYChunk);
739✔
1807

1808
            int nChunkYOffQueried = nChunkYOff - nKernelRadius * nOvrFactor;
739✔
1809
            int nChunkYSizeQueried = nYCount + 2 * nKernelRadius * nOvrFactor;
739✔
1810
            if (nChunkYOffQueried < 0)
739✔
1811
            {
1812
                nChunkYSizeQueried += nChunkYOffQueried;
92✔
1813
                nChunkYOffQueried = 0;
92✔
1814
            }
1815
            if (nChunkYSizeQueried + nChunkYOffQueried > nRasterYSize)
739✔
1816
                nChunkYSizeQueried = nRasterYSize - nChunkYOffQueried;
107✔
1817
            CPLAssert(nChunkYSizeQueried <= nFullResYSizeQueried);
739✔
1818

1819
            int nDstXOff;
1820
            for (nDstXOff = 0; nDstXOff < nBufXSize && eErr == CE_None;
1,496✔
1821
                 nDstXOff += nDstBlockXSize)
757✔
1822
            {
1823
                int nDstXCount;
1824
                if (nDstXOff + nDstBlockXSize <= nBufXSize)
735✔
1825
                    nDstXCount = nDstBlockXSize;
735✔
1826
                else
UNCOV
1827
                    nDstXCount = nBufXSize - nDstXOff;
×
1828

1829
                int nChunkXOff =
735✔
1830
                    nXOff + static_cast<int>(nDstXOff * dfXRatioDstToSrc);
735✔
1831
                int nChunkXOff2 =
735✔
1832
                    nXOff + 1 +
735✔
1833
                    static_cast<int>(
735✔
1834
                        ceil((nDstXOff + nDstXCount) * dfXRatioDstToSrc));
735✔
1835
                if (nChunkXOff2 > nRasterXSize)
735✔
1836
                    nChunkXOff2 = nRasterXSize;
586✔
1837
                int nXCount = nChunkXOff2 - nChunkXOff;
735✔
1838
                CPLAssert(nXCount <= nFullResXChunk);
735✔
1839

1840
                int nChunkXOffQueried = nChunkXOff - nKernelRadius * nOvrFactor;
735✔
1841
                int nChunkXSizeQueried =
735✔
1842
                    nXCount + 2 * nKernelRadius * nOvrFactor;
735✔
1843
                if (nChunkXOffQueried < 0)
735✔
1844
                {
1845
                    nChunkXSizeQueried += nChunkXOffQueried;
577✔
1846
                    nChunkXOffQueried = 0;
577✔
1847
                }
1848
                if (nChunkXSizeQueried + nChunkXOffQueried > nRasterXSize)
735✔
1849
                    nChunkXSizeQueried = nRasterXSize - nChunkXOffQueried;
591✔
1850
                CPLAssert(nChunkXSizeQueried <= nFullResXSizeQueried);
735✔
1851

1852
                bool bSkipResample = false;
735✔
1853
                bool bNoDataMaskFullyOpaque = false;
735✔
1854
                if (eErr == CE_None && bUseNoDataMask)
735✔
1855
                {
1856
                    eErr = poMaskBand->RasterIO(
489✔
1857
                        GF_Read, nChunkXOffQueried, nChunkYOffQueried,
1858
                        nChunkXSizeQueried, nChunkYSizeQueried,
1859
                        pabyChunkNoDataMask, nChunkXSizeQueried,
1860
                        nChunkYSizeQueried, GDT_Byte, 0, 0, nullptr);
1861

1862
                    /* Optimizations if mask is fully opaque or transparent */
1863
                    const int nPixels = nChunkXSizeQueried * nChunkYSizeQueried;
489✔
1864
                    const GByte bVal = pabyChunkNoDataMask[0];
489✔
1865
                    int i = 1;  // Used after for.
489✔
1866
                    for (; i < nPixels; i++)
12,503,700✔
1867
                    {
1868
                        if (pabyChunkNoDataMask[i] != bVal)
12,503,300✔
1869
                            break;
72✔
1870
                    }
1871
                    if (i == nPixels)
489✔
1872
                    {
1873
                        if (bVal == 0)
417✔
1874
                        {
1875
                            GByte abyZero[16] = {0};
373✔
1876
                            for (int iBand = 0; iBand < nBandCount; iBand++)
780✔
1877
                            {
1878
                                for (int j = 0; j < nDstYCount; j++)
3,499✔
1879
                                {
1880
                                    GDALCopyWords64(
3,092✔
1881
                                        abyZero, GDT_Byte, 0,
1882
                                        static_cast<GByte *>(pData) +
1883
                                            iBand * nBandSpace +
3,092✔
1884
                                            nLineSpace * (j + nDstYOff) +
3,092✔
1885
                                            nDstXOff * nPixelSpace,
3,092✔
1886
                                        eBufType, static_cast<int>(nPixelSpace),
1887
                                        nDstXCount);
1888
                                }
1889
                            }
1890
                            bSkipResample = true;
373✔
1891
                        }
1892
                        else
1893
                        {
1894
                            bNoDataMaskFullyOpaque = true;
44✔
1895
                        }
1896
                    }
1897
                }
1898

1899
                if (!bSkipResample && eErr == CE_None)
735✔
1900
                {
1901
                    /* Read the source buffers */
1902
                    eErr = RasterIO(
371✔
1903
                        GF_Read, nChunkXOffQueried, nChunkYOffQueried,
1904
                        nChunkXSizeQueried, nChunkYSizeQueried, pChunk,
1905
                        nChunkXSizeQueried, nChunkYSizeQueried, eWrkDataType,
1906
                        nBandCount, panBandMap, 0, 0, 0, nullptr);
1907
                }
1908

1909
#ifdef GDAL_ENABLE_RESAMPLING_MULTIBAND
1910
                if (pfnResampleFuncMultiBands && !bSkipResample &&
1911
                    eErr == CE_None)
1912
                {
1913
                    eErr = pfnResampleFuncMultiBands(
1914
                        dfXRatioDstToSrc, dfYRatioDstToSrc,
1915
                        dfXOff - nXOff, /* == 0 if bHasXOffVirtual */
1916
                        dfYOff - nYOff, /* == 0 if bHasYOffVirtual */
1917
                        eWrkDataType, (GByte *)pChunk, nBandCount,
1918
                        bNoDataMaskFullyOpaque ? nullptr : pabyChunkNoDataMask,
1919
                        nChunkXOffQueried - (bHasXOffVirtual ? 0 : nXOff),
1920
                        nChunkXSizeQueried,
1921
                        nChunkYOffQueried - (bHasYOffVirtual ? 0 : nYOff),
1922
                        nChunkYSizeQueried, nDstXOff + nDestXOffVirtual,
1923
                        nDstXOff + nDestXOffVirtual + nDstXCount,
1924
                        nDstYOff + nDestYOffVirtual,
1925
                        nDstYOff + nDestYOffVirtual + nDstYCount, papoDstBands,
1926
                        pszResampling, FALSE /*bHasNoData*/,
1927
                        0.0 /* dfNoDataValue */, nullptr /* color table*/,
1928
                        eDataType);
1929
                }
1930
                else
1931
#endif
1932
                {
1933
                    size_t nChunkBandOffset =
1934
                        static_cast<size_t>(nChunkXSizeQueried) *
746✔
1935
                        nChunkYSizeQueried *
746✔
1936
                        GDALGetDataTypeSizeBytes(eWrkDataType);
746✔
1937
                    for (int i = 0;
1,949✔
1938
                         i < nBandCount && !bSkipResample && eErr == CE_None;
1,949✔
1939
                         i++)
1940
                    {
1941
                        const bool bPropagateNoData = false;
1,192✔
1942
                        void *pDstBuffer = nullptr;
1,192✔
1943
                        GDALDataType eDstBufferDataType = GDT_Unknown;
1,192✔
1944
                        GDALRasterBand *poMEMBand =
1945
                            poMEMDS->GetRasterBand(i + 1);
1,192✔
1946
                        GDALOverviewResampleArgs args;
1,192✔
1947
                        args.eSrcDataType = eDataType;
1,192✔
1948
                        args.eOvrDataType = poMEMBand->GetRasterDataType();
1,192✔
1949
                        args.nOvrXSize = poMEMBand->GetXSize();
1,192✔
1950
                        args.nOvrYSize = poMEMBand->GetYSize();
1,192✔
1951
                        args.nOvrNBITS = nNBITS;
1,191✔
1952
                        args.dfXRatioDstToSrc = dfXRatioDstToSrc;
1,191✔
1953
                        args.dfYRatioDstToSrc = dfYRatioDstToSrc;
1,191✔
1954
                        args.dfSrcXDelta =
1,191✔
1955
                            dfXOff - nXOff; /* == 0 if bHasXOffVirtual */
1,191✔
1956
                        args.dfSrcYDelta =
1,191✔
1957
                            dfYOff - nYOff; /* == 0 if bHasYOffVirtual */
1,191✔
1958
                        args.eWrkDataType = eWrkDataType;
1,191✔
1959
                        args.pabyChunkNodataMask = bNoDataMaskFullyOpaque
1,191✔
1960
                                                       ? nullptr
1,191✔
1961
                                                       : pabyChunkNoDataMask;
1962
                        args.nChunkXOff =
1,191✔
1963
                            nChunkXOffQueried - (bHasXOffVirtual ? 0 : nXOff);
1,191✔
1964
                        args.nChunkXSize = nChunkXSizeQueried;
1,191✔
1965
                        args.nChunkYOff =
1,191✔
1966
                            nChunkYOffQueried - (bHasYOffVirtual ? 0 : nYOff);
1,191✔
1967
                        args.nChunkYSize = nChunkYSizeQueried;
1,191✔
1968
                        args.nDstXOff = nDstXOff + nDestXOffVirtual;
1,191✔
1969
                        args.nDstXOff2 =
1,191✔
1970
                            nDstXOff + nDestXOffVirtual + nDstXCount;
1,191✔
1971
                        args.nDstYOff = nDstYOff + nDestYOffVirtual;
1,191✔
1972
                        args.nDstYOff2 =
1,191✔
1973
                            nDstYOff + nDestYOffVirtual + nDstYCount;
1,191✔
1974
                        args.pszResampling = pszResampling;
1,191✔
1975
                        args.bHasNoData = false;
1,191✔
1976
                        args.dfNoDataValue = 0.0;
1,191✔
1977
                        args.poColorTable = nullptr;
1,191✔
1978
                        args.bPropagateNoData = bPropagateNoData;
1,191✔
1979

1980
                        eErr =
1981
                            pfnResampleFunc(args,
2,383✔
1982
                                            reinterpret_cast<GByte *>(pChunk) +
1,191✔
1983
                                                i * nChunkBandOffset,
1,191✔
1984
                                            &pDstBuffer, &eDstBufferDataType);
1985
                        if (eErr == CE_None)
1,192✔
1986
                        {
1987
                            eErr = poMEMBand->RasterIO(
1,192✔
1988
                                GF_Write, nDstXOff + nDestXOffVirtual,
1989
                                nDstYOff + nDestYOffVirtual, nDstXCount,
1990
                                nDstYCount, pDstBuffer, nDstXCount, nDstYCount,
1991
                                eDstBufferDataType, 0, 0, nullptr);
1992
                        }
1993
                        CPLFree(pDstBuffer);
1,192✔
1994
                    }
1995
                }
1996

1997
                nBlocksDone++;
757✔
1998
                if (eErr == CE_None && psExtraArg->pfnProgress != nullptr &&
1,146✔
1999
                    !psExtraArg->pfnProgress(1.0 * nBlocksDone / nTotalBlocks,
389✔
2000
                                             "", psExtraArg->pProgressData))
2001
                {
2002
                    eErr = CE_Failure;
×
2003
                }
2004
            }
2005
        }
2006

2007
        CPLFree(pChunk);
762✔
2008
        CPLFree(pabyChunkNoDataMask);
758✔
2009
    }
2010

2011
    CPLFree(papoDstBands);
758✔
2012
    GDALClose(poMEMDS);
758✔
2013

2014
    return eErr;
758✔
2015
}
2016

2017
//! @endcond
2018

2019
/************************************************************************/
2020
/*                           GDALSwapWords()                            */
2021
/************************************************************************/
2022

2023
/**
2024
 * Byte swap words in-place.
2025
 *
2026
 * This function will byte swap a set of 2, 4 or 8 byte words "in place" in
2027
 * a memory array.  No assumption is made that the words being swapped are
2028
 * word aligned in memory.  Use the CPL_LSB and CPL_MSB macros from cpl_port.h
2029
 * to determine if the current platform is big endian or little endian.  Use
2030
 * The macros like CPL_SWAP32() to byte swap single values without the overhead
2031
 * of a function call.
2032
 *
2033
 * @param pData pointer to start of data buffer.
2034
 * @param nWordSize size of words being swapped in bytes. Normally 2, 4 or 8.
2035
 * @param nWordCount the number of words to be swapped in this call.
2036
 * @param nWordSkip the byte offset from the start of one word to the start of
2037
 * the next. For packed buffers this is the same as nWordSize.
2038
 */
2039

2040
void CPL_STDCALL GDALSwapWords(void *pData, int nWordSize, int nWordCount,
                               int nWordSkip)
{
    // A null buffer is only reported when there is at least one word to swap.
    if (nWordCount > 0)
    {
        VALIDATE_POINTER0(pData, "GDALSwapWords");
    }

    // Cursor on the word currently being swapped; it advances by nWordSkip
    // bytes after each word.
    GByte *pabyWord = static_cast<GByte *>(pData);

    // Single-byte words have no byte order: nothing to do.
    if (nWordSize == 1)
        return;

    if (nWordSize == 2)
    {
        CPLAssert(nWordSkip >= 2 || nWordCount == 1);
        for (int iWord = 0; iWord < nWordCount; ++iWord, pabyWord += nWordSkip)
        {
            CPL_SWAP16PTR(pabyWord);
        }
        return;
    }

    if (nWordSize == 4)
    {
        CPLAssert(nWordSkip >= 4 || nWordCount == 1);
        // When both the start address and the stride are multiples of 4,
        // every word can be accessed as a whole GUInt32.
        const bool bAligned32 =
            CPL_IS_ALIGNED(pabyWord, 4) && (nWordSkip % 4) == 0;
        if (bAligned32)
        {
            for (int iWord = 0; iWord < nWordCount;
                 ++iWord, pabyWord += nWordSkip)
            {
                GUInt32 *const pnWord = reinterpret_cast<GUInt32 *>(pabyWord);
                *pnWord = CPL_SWAP32(*pnWord);
            }
        }
        else
        {
            // Otherwise fall back to the pointer-based swap macro.
            for (int iWord = 0; iWord < nWordCount;
                 ++iWord, pabyWord += nWordSkip)
            {
                CPL_SWAP32PTR(pabyWord);
            }
        }
        return;
    }

    if (nWordSize == 8)
    {
        CPLAssert(nWordSkip >= 8 || nWordCount == 1);
        // Same strategy as for 4-byte words, with 8-byte granularity.
        const bool bAligned64 =
            CPL_IS_ALIGNED(pabyWord, 8) && (nWordSkip % 8) == 0;
        if (bAligned64)
        {
            for (int iWord = 0; iWord < nWordCount;
                 ++iWord, pabyWord += nWordSkip)
            {
                GUInt64 *const pnWord = reinterpret_cast<GUInt64 *>(pabyWord);
                *pnWord = CPL_SWAP64(*pnWord);
            }
        }
        else
        {
            for (int iWord = 0; iWord < nWordCount;
                 ++iWord, pabyWord += nWordSkip)
            {
                CPL_SWAP64PTR(pabyWord);
            }
        }
        return;
    }

    // Any other word size is unsupported.
    CPLAssert(false);
}
2109

2110
/************************************************************************/
2111
/*                           GDALSwapWordsEx()                          */
2112
/************************************************************************/
2113

2114
/**
2115
 * Byte swap words in-place.
2116
 *
2117
 * This function will byte swap a set of 2, 4 or 8 byte words "in place" in
2118
 * a memory array.  No assumption is made that the words being swapped are
2119
 * word aligned in memory.  Use the CPL_LSB and CPL_MSB macros from cpl_port.h
2120
 * to determine if the current platform is big endian or little endian.  Use
2121
 * The macros like CPL_SWAP32() to byte swap single values without the overhead
2122
 * of a function call.
2123
 *
2124
 * @param pData pointer to start of data buffer.
2125
 * @param nWordSize size of words being swapped in bytes. Normally 2, 4 or 8.
2126
 * @param nWordCount the number of words to be swapped in this call.
2127
 * @param nWordSkip the byte offset from the start of one word to the start of
2128
 * the next. For packed buffers this is the same as nWordSize.
2129
 * @since GDAL 2.1
2130
 */
2131
void CPL_STDCALL GDALSwapWordsEx(void *pData, int nWordSize, size_t nWordCount,
                                 int nWordSkip)
{
    // GDALSwapWords() takes an int word count, so the request is processed
    // in chunks of at most 2^30 words (a multiple of 8).
    constexpr int nMaxChunkWords = 1 << 30;

    GByte *pabyCursor = static_cast<GByte *>(pData);
    for (size_t nRemaining = nWordCount; nRemaining != 0;)
    {
        int nChunkWords = nMaxChunkWords;
        if (nRemaining <= static_cast<size_t>(nMaxChunkWords))
            nChunkWords = static_cast<int>(nRemaining);

        GDALSwapWords(pabyCursor, nWordSize, nChunkWords, nWordSkip);

        // Advance past the words just swapped.
        pabyCursor += static_cast<size_t>(nWordSkip) * nChunkWords;
        nRemaining -= nChunkWords;
    }
}
6,118✔
2145

2146
// Place the new GDALCopyWords helpers in an anonymous namespace
2147
namespace
2148
{
2149

2150
/************************************************************************/
2151
/*                           GDALCopyWordsT()                           */
2152
/************************************************************************/
2153
/**
2154
 * Template function, used to copy data from pSrcData into buffer
2155
 * pDstData, with stride nSrcPixelStride in the source data and
2156
 * stride nDstPixelStride in the destination data. This template can
2157
 * deal with the case where the input data type is real or complex and
2158
 * the output is real.
2159
 *
2160
 * @param pSrcData the source data buffer
2161
 * @param nSrcPixelStride the stride, in the buffer pSrcData for pixels
2162
 *                      of interest.
2163
 * @param pDstData the destination buffer.
2164
 * @param nDstPixelStride the stride in the buffer pDstData for pixels of
2165
 *                      interest.
2166
 * @param nWordCount the total number of pixel words to copy
2167
 *
2168
 * @code
2169
 * // Assume an input buffer of type GUInt16 named pBufferIn
2170
 * GByte *pBufferOut = new GByte[numBytesOut];
2171
 * GDALCopyWordsT<GUInt16, GByte>(pBufferIn, 2, pBufferOut, 1, numBytesOut);
2172
 * @endcode
2173
 * @note
2174
 * This is a private function, and should not be exposed outside of
2175
 * rasterio.cpp. External users should call the GDALCopyWords driver function.
2176
 */
2177

2178
template <class Tin, class Tout>
2179
static void inline GDALCopyWordsGenericT(const Tin *const CPL_RESTRICT pSrcData,
42,343,404✔
2180
                                         int nSrcPixelStride,
2181
                                         Tout *const CPL_RESTRICT pDstData,
2182
                                         int nDstPixelStride,
2183
                                         GPtrDiff_t nWordCount)
2184
{
2185
    decltype(nWordCount) nDstOffset = 0;
42,343,404✔
2186

2187
    const char *const pSrcDataPtr = reinterpret_cast<const char *>(pSrcData);
42,343,404✔
2188
    char *const pDstDataPtr = reinterpret_cast<char *>(pDstData);
42,343,404✔
2189
    for (decltype(nWordCount) n = 0; n < nWordCount; n++)
512,666,478✔
2190
    {
2191
        const Tin tValue =
470,319,894✔
2192
            *reinterpret_cast<const Tin *>(pSrcDataPtr + (n * nSrcPixelStride));
470,319,894✔
2193
        Tout *const pOutPixel =
470,319,894✔
2194
            reinterpret_cast<Tout *>(pDstDataPtr + nDstOffset);
470,319,894✔
2195

2196
        GDALCopyWord(tValue, *pOutPixel);
470,319,894✔
2197

2198
        nDstOffset += nDstPixelStride;
470,322,894✔
2199
    }
2200
}
42,346,492✔
2201

2202
template <class Tin, class Tout>
2203
static void CPL_NOINLINE GDALCopyWordsT(const Tin *const CPL_RESTRICT pSrcData,
29,693,774✔
2204
                                        int nSrcPixelStride,
2205
                                        Tout *const CPL_RESTRICT pDstData,
2206
                                        int nDstPixelStride,
2207
                                        GPtrDiff_t nWordCount)
2208
{
2209
    GDALCopyWordsGenericT(pSrcData, nSrcPixelStride, pDstData, nDstPixelStride,
29,693,774✔
2210
                          nWordCount);
2211
}
29,693,773✔
2212

2213
template <class Tin, class Tout>
2214
static void inline GDALCopyWordsT_8atatime(
4,868,005✔
2215
    const Tin *const CPL_RESTRICT pSrcData, int nSrcPixelStride,
2216
    Tout *const CPL_RESTRICT pDstData, int nDstPixelStride,
2217
    GPtrDiff_t nWordCount)
2218
{
2219
    decltype(nWordCount) nDstOffset = 0;
4,868,005✔
2220

2221
    const char *const pSrcDataPtr = reinterpret_cast<const char *>(pSrcData);
4,868,005✔
2222
    char *const pDstDataPtr = reinterpret_cast<char *>(pDstData);
4,868,005✔
2223
    decltype(nWordCount) n = 0;
4,868,005✔
2224
    if (nSrcPixelStride == static_cast<int>(sizeof(Tin)) &&
4,868,005✔
2225
        nDstPixelStride == static_cast<int>(sizeof(Tout)))
2226
    {
2227
        for (; n < nWordCount - 7; n += 8)
33,467,974✔
2228
        {
2229
            const Tin *pInValues = reinterpret_cast<const Tin *>(
33,101,960✔
2230
                pSrcDataPtr + (n * nSrcPixelStride));
33,101,960✔
2231
            Tout *const pOutPixels =
33,101,960✔
2232
                reinterpret_cast<Tout *>(pDstDataPtr + nDstOffset);
33,101,960✔
2233

2234
            GDALCopy8Words(pInValues, pOutPixels);
33,101,960✔
2235

2236
            nDstOffset += 8 * nDstPixelStride;
33,097,350✔
2237
        }
2238
    }
2239
    for (; n < nWordCount; n++)
10,166,354✔
2240
    {
2241
        const Tin tValue =
5,298,288✔
2242
            *reinterpret_cast<const Tin *>(pSrcDataPtr + (n * nSrcPixelStride));
5,298,288✔
2243
        Tout *const pOutPixel =
5,298,288✔
2244
            reinterpret_cast<Tout *>(pDstDataPtr + nDstOffset);
5,298,288✔
2245

2246
        GDALCopyWord(tValue, *pOutPixel);
5,298,288✔
2247

2248
        nDstOffset += nDstPixelStride;
5,303,031✔
2249
    }
2250
}
4,868,076✔
2251

2252
#ifdef HAVE_SSE2
2253

2254
template <class Tout>
2255
void GDALCopyWordsByteTo16Bit(const GByte *const CPL_RESTRICT pSrcData,
39,675✔
2256
                              int nSrcPixelStride,
2257
                              Tout *const CPL_RESTRICT pDstData,
2258
                              int nDstPixelStride, GPtrDiff_t nWordCount)
2259
{
2260
    static_assert(std::is_integral<Tout>::value &&
2261
                      sizeof(Tout) == sizeof(uint16_t),
2262
                  "Bad Tout");
2263
    if (nSrcPixelStride == static_cast<int>(sizeof(*pSrcData)) &&
39,675✔
2264
        nDstPixelStride == static_cast<int>(sizeof(*pDstData)))
2265
    {
2266
        decltype(nWordCount) n = 0;
33,324✔
2267
        const __m128i xmm_zero = _mm_setzero_si128();
33,324✔
2268
        GByte *CPL_RESTRICT pabyDstDataPtr =
33,324✔
2269
            reinterpret_cast<GByte *>(pDstData);
2270
        for (; n < nWordCount - 15; n += 16)
1,415,668✔
2271
        {
2272
            __m128i xmm = _mm_loadu_si128(
1,382,344✔
2273
                reinterpret_cast<const __m128i *>(pSrcData + n));
1,382,344✔
2274
            __m128i xmm0 = _mm_unpacklo_epi8(xmm, xmm_zero);
1,382,344✔
2275
            __m128i xmm1 = _mm_unpackhi_epi8(xmm, xmm_zero);
1,382,344✔
2276
            _mm_storeu_si128(
2277
                reinterpret_cast<__m128i *>(pabyDstDataPtr + n * 2), xmm0);
1,382,344✔
2278
            _mm_storeu_si128(
2279
                reinterpret_cast<__m128i *>(pabyDstDataPtr + n * 2 + 16), xmm1);
1,382,344✔
2280
        }
2281
        for (; n < nWordCount; n++)
109,209✔
2282
        {
2283
            pDstData[n] = pSrcData[n];
75,885✔
2284
        }
33,324✔
2285
    }
2286
    else
2287
    {
2288
        GDALCopyWordsGenericT(pSrcData, nSrcPixelStride, pDstData,
6,351✔
2289
                              nDstPixelStride, nWordCount);
2290
    }
2291
}
39,675✔
2292

2293
template <>
2294
CPL_NOINLINE void GDALCopyWordsT(const GByte *const CPL_RESTRICT pSrcData,
26,942✔
2295
                                 int nSrcPixelStride,
2296
                                 GUInt16 *const CPL_RESTRICT pDstData,
2297
                                 int nDstPixelStride, GPtrDiff_t nWordCount)
2298
{
2299
    GDALCopyWordsByteTo16Bit(pSrcData, nSrcPixelStride, pDstData,
26,942✔
2300
                             nDstPixelStride, nWordCount);
2301
}
26,942✔
2302

2303
template <>
2304
CPL_NOINLINE void GDALCopyWordsT(const GByte *const CPL_RESTRICT pSrcData,
12,733✔
2305
                                 int nSrcPixelStride,
2306
                                 GInt16 *const CPL_RESTRICT pDstData,
2307
                                 int nDstPixelStride, GPtrDiff_t nWordCount)
2308
{
2309
    GDALCopyWordsByteTo16Bit(pSrcData, nSrcPixelStride, pDstData,
12,733✔
2310
                             nDstPixelStride, nWordCount);
2311
}
12,733✔
2312

2313
template <class Tout>
2314
void GDALCopyWordsByteTo32Bit(const GByte *const CPL_RESTRICT pSrcData,
12,799,565✔
2315
                              int nSrcPixelStride,
2316
                              Tout *const CPL_RESTRICT pDstData,
2317
                              int nDstPixelStride, GPtrDiff_t nWordCount)
2318
{
2319
    static_assert(std::is_integral<Tout>::value &&
2320
                      sizeof(Tout) == sizeof(uint32_t),
2321
                  "Bad Tout");
2322
    if (nSrcPixelStride == static_cast<int>(sizeof(*pSrcData)) &&
12,799,565✔
2323
        nDstPixelStride == static_cast<int>(sizeof(*pDstData)))
2324
    {
2325
        decltype(nWordCount) n = 0;
6,181,455✔
2326
        const __m128i xmm_zero = _mm_setzero_si128();
6,181,455✔
2327
        GByte *CPL_RESTRICT pabyDstDataPtr =
6,181,455✔
2328
            reinterpret_cast<GByte *>(pDstData);
2329
        for (; n < nWordCount - 15; n += 16)
70,110,600✔
2330
        {
2331
            __m128i xmm = _mm_loadu_si128(
64,042,045✔
2332
                reinterpret_cast<const __m128i *>(pSrcData + n));
64,042,045✔
2333
            __m128i xmm_low = _mm_unpacklo_epi8(xmm, xmm_zero);
63,926,645✔
2334
            __m128i xmm_high = _mm_unpackhi_epi8(xmm, xmm_zero);
64,034,345✔
2335
            __m128i xmm0 = _mm_unpacklo_epi16(xmm_low, xmm_zero);
63,900,445✔
2336
            __m128i xmm1 = _mm_unpackhi_epi16(xmm_low, xmm_zero);
63,875,145✔
2337
            __m128i xmm2 = _mm_unpacklo_epi16(xmm_high, xmm_zero);
63,786,645✔
2338
            __m128i xmm3 = _mm_unpackhi_epi16(xmm_high, xmm_zero);
63,929,145✔
2339
            _mm_storeu_si128(
2340
                reinterpret_cast<__m128i *>(pabyDstDataPtr + n * 4), xmm0);
63,929,145✔
2341
            _mm_storeu_si128(
2342
                reinterpret_cast<__m128i *>(pabyDstDataPtr + n * 4 + 16), xmm1);
63,929,145✔
2343
            _mm_storeu_si128(
2344
                reinterpret_cast<__m128i *>(pabyDstDataPtr + n * 4 + 32), xmm2);
63,929,145✔
2345
            _mm_storeu_si128(
2346
                reinterpret_cast<__m128i *>(pabyDstDataPtr + n * 4 + 48), xmm3);
63,929,145✔
2347
        }
2348
        for (; n < nWordCount; n++)
14,240,279✔
2349
        {
2350
            pDstData[n] = pSrcData[n];
8,171,744✔
2351
        }
6,068,565✔
2352
    }
2353
    else
2354
    {
2355
        GDALCopyWordsGenericT(pSrcData, nSrcPixelStride, pDstData,
6,618,150✔
2356
                              nDstPixelStride, nWordCount);
2357
    }
2358
}
12,615,865✔
2359

2360
template <>
2361
CPL_NOINLINE void GDALCopyWordsT(const GByte *const CPL_RESTRICT pSrcData,
465✔
2362
                                 int nSrcPixelStride,
2363
                                 GUInt32 *const CPL_RESTRICT pDstData,
2364
                                 int nDstPixelStride, GPtrDiff_t nWordCount)
2365
{
2366
    GDALCopyWordsByteTo32Bit(pSrcData, nSrcPixelStride, pDstData,
465✔
2367
                             nDstPixelStride, nWordCount);
2368
}
465✔
2369

2370
template <>
2371
CPL_NOINLINE void GDALCopyWordsT(const GByte *const CPL_RESTRICT pSrcData,
12,797,300✔
2372
                                 int nSrcPixelStride,
2373
                                 GInt32 *const CPL_RESTRICT pDstData,
2374
                                 int nDstPixelStride, GPtrDiff_t nWordCount)
2375
{
2376
    GDALCopyWordsByteTo32Bit(pSrcData, nSrcPixelStride, pDstData,
12,797,300✔
2377
                             nDstPixelStride, nWordCount);
2378
}
12,834,900✔
2379

2380
template <>
2381
CPL_NOINLINE void GDALCopyWordsT(const GByte *const CPL_RESTRICT pSrcData,
2,471,810✔
2382
                                 int nSrcPixelStride,
2383
                                 float *const CPL_RESTRICT pDstData,
2384
                                 int nDstPixelStride, GPtrDiff_t nWordCount)
2385
{
2386
    if (nSrcPixelStride == static_cast<int>(sizeof(*pSrcData)) &&
2,471,810✔
2387
        nDstPixelStride == static_cast<int>(sizeof(*pDstData)))
2388
    {
2389
        decltype(nWordCount) n = 0;
112,368✔
2390
        const __m128i xmm_zero = _mm_setzero_si128();
112,368✔
2391
        GByte *CPL_RESTRICT pabyDstDataPtr =
112,368✔
2392
            reinterpret_cast<GByte *>(pDstData);
2393
        for (; n < nWordCount - 15; n += 16)
3,261,800✔
2394
        {
2395
            __m128i xmm = _mm_loadu_si128(
3,149,440✔
2396
                reinterpret_cast<const __m128i *>(pSrcData + n));
3,149,440✔
2397
            __m128i xmm_low = _mm_unpacklo_epi8(xmm, xmm_zero);
3,149,440✔
2398
            __m128i xmm_high = _mm_unpackhi_epi8(xmm, xmm_zero);
3,149,440✔
2399
            __m128i xmm0 = _mm_unpacklo_epi16(xmm_low, xmm_zero);
3,149,440✔
2400
            __m128i xmm1 = _mm_unpackhi_epi16(xmm_low, xmm_zero);
3,149,440✔
2401
            __m128i xmm2 = _mm_unpacklo_epi16(xmm_high, xmm_zero);
3,149,440✔
2402
            __m128i xmm3 = _mm_unpackhi_epi16(xmm_high, xmm_zero);
3,149,440✔
2403
            __m128 xmm0_f = _mm_cvtepi32_ps(xmm0);
3,149,440✔
2404
            __m128 xmm1_f = _mm_cvtepi32_ps(xmm1);
3,149,440✔
2405
            __m128 xmm2_f = _mm_cvtepi32_ps(xmm2);
3,149,440✔
2406
            __m128 xmm3_f = _mm_cvtepi32_ps(xmm3);
3,149,440✔
2407
            _mm_storeu_ps(reinterpret_cast<float *>(pabyDstDataPtr + n * 4),
3,149,440✔
2408
                          xmm0_f);
2409
            _mm_storeu_ps(
2410
                reinterpret_cast<float *>(pabyDstDataPtr + n * 4 + 16), xmm1_f);
3,149,440✔
2411
            _mm_storeu_ps(
2412
                reinterpret_cast<float *>(pabyDstDataPtr + n * 4 + 32), xmm2_f);
3,149,440✔
2413
            _mm_storeu_ps(
2414
                reinterpret_cast<float *>(pabyDstDataPtr + n * 4 + 48), xmm3_f);
3,149,440✔
2415
        }
2416
        for (; n < nWordCount; n++)
480,044✔
2417
        {
2418
            pDstData[n] = pSrcData[n];
367,676✔
2419
        }
112,368✔
2420
    }
2421
    else
2422
    {
2423
        GDALCopyWordsGenericT(pSrcData, nSrcPixelStride, pDstData,
2,359,440✔
2424
                              nDstPixelStride, nWordCount);
2425
    }
2426
}
2,471,810✔
2427

2428
template <>
2429
CPL_NOINLINE void GDALCopyWordsT(const GByte *const CPL_RESTRICT pSrcData,
147,111✔
2430
                                 int nSrcPixelStride,
2431
                                 double *const CPL_RESTRICT pDstData,
2432
                                 int nDstPixelStride, GPtrDiff_t nWordCount)
2433
{
2434
    if (nSrcPixelStride == static_cast<int>(sizeof(*pSrcData)) &&
147,111✔
2435
        nDstPixelStride == static_cast<int>(sizeof(*pDstData)))
2436
    {
2437
        decltype(nWordCount) n = 0;
124,055✔
2438
        const __m128i xmm_zero = _mm_setzero_si128();
124,055✔
2439
        GByte *CPL_RESTRICT pabyDstDataPtr =
124,055✔
2440
            reinterpret_cast<GByte *>(pDstData);
2441
        for (; n < nWordCount - 15; n += 16)
1,423,640✔
2442
        {
2443
            __m128i xmm = _mm_loadu_si128(
1,299,580✔
2444
                reinterpret_cast<const __m128i *>(pSrcData + n));
1,299,580✔
2445
            __m128i xmm_low = _mm_unpacklo_epi8(xmm, xmm_zero);
1,299,580✔
2446
            __m128i xmm_high = _mm_unpackhi_epi8(xmm, xmm_zero);
1,299,580✔
2447
            __m128i xmm0 = _mm_unpacklo_epi16(xmm_low, xmm_zero);
1,299,580✔
2448
            __m128i xmm1 = _mm_unpackhi_epi16(xmm_low, xmm_zero);
1,299,580✔
2449
            __m128i xmm2 = _mm_unpacklo_epi16(xmm_high, xmm_zero);
1,299,580✔
2450
            __m128i xmm3 = _mm_unpackhi_epi16(xmm_high, xmm_zero);
1,299,580✔
2451

2452
#if defined(__AVX2__) && defined(slightly_slower_than_SSE2)
2453
            _mm256_storeu_pd(reinterpret_cast<double *>(pabyDstDataPtr + n * 8),
2454
                             _mm256_cvtepi32_pd(xmm0));
2455
            _mm256_storeu_pd(
2456
                reinterpret_cast<double *>(pabyDstDataPtr + n * 8 + 32),
2457
                _mm256_cvtepi32_pd(xmm1));
2458
            _mm256_storeu_pd(
2459
                reinterpret_cast<double *>(pabyDstDataPtr + n * 8 + 64),
2460
                _mm256_cvtepi32_pd(xmm2));
2461
            _mm256_storeu_pd(
2462
                reinterpret_cast<double *>(pabyDstDataPtr + n * 8 + 96),
2463
                _mm256_cvtepi32_pd(xmm3));
2464
#else
2465
            __m128d xmm0_low_d = _mm_cvtepi32_pd(xmm0);
1,299,580✔
2466
            __m128d xmm1_low_d = _mm_cvtepi32_pd(xmm1);
1,299,580✔
2467
            __m128d xmm2_low_d = _mm_cvtepi32_pd(xmm2);
1,299,580✔
2468
            __m128d xmm3_low_d = _mm_cvtepi32_pd(xmm3);
1,299,580✔
2469
            xmm0 = _mm_srli_si128(xmm0, 8);
1,299,580✔
2470
            xmm1 = _mm_srli_si128(xmm1, 8);
1,299,580✔
2471
            xmm2 = _mm_srli_si128(xmm2, 8);
1,299,580✔
2472
            xmm3 = _mm_srli_si128(xmm3, 8);
1,299,580✔
2473
            __m128d xmm0_high_d = _mm_cvtepi32_pd(xmm0);
1,299,580✔
2474
            __m128d xmm1_high_d = _mm_cvtepi32_pd(xmm1);
1,299,580✔
2475
            __m128d xmm2_high_d = _mm_cvtepi32_pd(xmm2);
1,299,580✔
2476
            __m128d xmm3_high_d = _mm_cvtepi32_pd(xmm3);
1,299,580✔
2477

2478
            _mm_storeu_pd(reinterpret_cast<double *>(pabyDstDataPtr + n * 8),
1,299,580✔
2479
                          xmm0_low_d);
2480
            _mm_storeu_pd(
2481
                reinterpret_cast<double *>(pabyDstDataPtr + n * 8 + 16),
1,299,580✔
2482
                xmm0_high_d);
2483
            _mm_storeu_pd(
2484
                reinterpret_cast<double *>(pabyDstDataPtr + n * 8 + 32),
1,299,580✔
2485
                xmm1_low_d);
2486
            _mm_storeu_pd(
2487
                reinterpret_cast<double *>(pabyDstDataPtr + n * 8 + 48),
1,299,580✔
2488
                xmm1_high_d);
2489
            _mm_storeu_pd(
2490
                reinterpret_cast<double *>(pabyDstDataPtr + n * 8 + 64),
1,299,580✔
2491
                xmm2_low_d);
2492
            _mm_storeu_pd(
2493
                reinterpret_cast<double *>(pabyDstDataPtr + n * 8 + 80),
1,299,580✔
2494
                xmm2_high_d);
2495
            _mm_storeu_pd(
2496
                reinterpret_cast<double *>(pabyDstDataPtr + n * 8 + 96),
1,299,580✔
2497
                xmm3_low_d);
2498
            _mm_storeu_pd(
2499
                reinterpret_cast<double *>(pabyDstDataPtr + n * 8 + 112),
1,299,580✔
2500
                xmm3_high_d);
2501
#endif
2502
        }
2503
        for (; n < nWordCount; n++)
234,453✔
2504
        {
2505
            pDstData[n] = pSrcData[n];
110,398✔
2506
        }
124,055✔
2507
    }
2508
    else
2509
    {
2510
        GDALCopyWordsGenericT(pSrcData, nSrcPixelStride, pDstData,
23,056✔
2511
                              nDstPixelStride, nWordCount);
2512
    }
2513
}
147,111✔
2514

2515
template <>
2516
CPL_NOINLINE void GDALCopyWordsT(const uint8_t *const CPL_RESTRICT pSrcData,
148✔
2517
                                 int nSrcPixelStride,
2518
                                 int8_t *const CPL_RESTRICT pDstData,
2519
                                 int nDstPixelStride, GPtrDiff_t nWordCount)
2520
{
2521
    if (nSrcPixelStride == static_cast<int>(sizeof(*pSrcData)) &&
148✔
2522
        nDstPixelStride == static_cast<int>(sizeof(*pDstData)))
2523
    {
2524
        decltype(nWordCount) n = 0;
142✔
2525
        const __m128i xmm_127 = _mm_set1_epi8(127);
142✔
2526
        for (; n < nWordCount - 31; n += 32)
146✔
2527
        {
2528
            __m128i xmm0 = _mm_loadu_si128(
8✔
2529
                reinterpret_cast<const __m128i *>(pSrcData + n));
4✔
2530
            __m128i xmm1 = _mm_loadu_si128(
4✔
2531
                reinterpret_cast<const __m128i *>(pSrcData + n + 16));
4✔
2532
            xmm0 = _mm_min_epu8(xmm0, xmm_127);
4✔
2533
            xmm1 = _mm_min_epu8(xmm1, xmm_127);
4✔
2534
            _mm_storeu_si128(reinterpret_cast<__m128i *>(pDstData + n), xmm0);
4✔
2535
            _mm_storeu_si128(reinterpret_cast<__m128i *>(pDstData + n + 16),
4✔
2536
                             xmm1);
2537
        }
2538
        for (; n < nWordCount; n++)
2,422✔
2539
        {
2540
            pDstData[n] =
2,280✔
2541
                pSrcData[n] >= 127 ? 127 : static_cast<int8_t>(pSrcData[n]);
2,280✔
2542
        }
142✔
2543
    }
2544
    else
2545
    {
2546
        GDALCopyWordsGenericT(pSrcData, nSrcPixelStride, pDstData,
6✔
2547
                              nDstPixelStride, nWordCount);
2548
    }
2549
}
148✔
2550

2551
template <>
2552
CPL_NOINLINE void GDALCopyWordsT(const int8_t *const CPL_RESTRICT pSrcData,
82✔
2553
                                 int nSrcPixelStride,
2554
                                 uint8_t *const CPL_RESTRICT pDstData,
2555
                                 int nDstPixelStride, GPtrDiff_t nWordCount)
2556
{
2557
    if (nSrcPixelStride == static_cast<int>(sizeof(*pSrcData)) &&
82✔
2558
        nDstPixelStride == static_cast<int>(sizeof(*pDstData)))
2559
    {
2560
        decltype(nWordCount) n = 0;
56✔
2561
#if !(defined(__SSE4_1__) || defined(USE_NEON_OPTIMIZATIONS))
2562
        const __m128i xmm_INT8_to_UINT8 = _mm_set1_epi8(-128);
56✔
2563
#endif
2564
        for (; n < nWordCount - 31; n += 32)
117✔
2565
        {
2566
            __m128i xmm0 = _mm_loadu_si128(
122✔
2567
                reinterpret_cast<const __m128i *>(pSrcData + n));
61✔
2568
            __m128i xmm1 = _mm_loadu_si128(
61✔
2569
                reinterpret_cast<const __m128i *>(pSrcData + n + 16));
61✔
2570
#if defined(__SSE4_1__) || defined(USE_NEON_OPTIMIZATIONS)
2571
            xmm0 = _mm_max_epi8(xmm0, _mm_setzero_si128());
2572
            xmm1 = _mm_max_epi8(xmm1, _mm_setzero_si128());
2573
#else
2574
            xmm0 = _mm_add_epi8(xmm0, xmm_INT8_to_UINT8);
61✔
2575
            xmm1 = _mm_add_epi8(xmm1, xmm_INT8_to_UINT8);
61✔
2576
            xmm0 = _mm_max_epu8(xmm0, xmm_INT8_to_UINT8);
61✔
2577
            xmm1 = _mm_max_epu8(xmm1, xmm_INT8_to_UINT8);
61✔
2578
            xmm0 = _mm_sub_epi8(xmm0, xmm_INT8_to_UINT8);
61✔
2579
            xmm1 = _mm_sub_epi8(xmm1, xmm_INT8_to_UINT8);
61✔
2580
#endif
2581
            _mm_storeu_si128(reinterpret_cast<__m128i *>(pDstData + n), xmm0);
61✔
2582
            _mm_storeu_si128(reinterpret_cast<__m128i *>(pDstData + n + 16),
61✔
2583
                             xmm1);
2584
        }
2585
        for (; n < nWordCount; n++)
352✔
2586
        {
2587
            pDstData[n] =
296✔
2588
                pSrcData[n] < 0 ? 0 : static_cast<uint8_t>(pSrcData[n]);
296✔
2589
        }
56✔
2590
    }
2591
    else
2592
    {
2593
        GDALCopyWordsGenericT(pSrcData, nSrcPixelStride, pDstData,
26✔
2594
                              nDstPixelStride, nWordCount);
2595
    }
2596
}
82✔
2597

2598
template <>
2599
CPL_NOINLINE void GDALCopyWordsT(const uint16_t *const CPL_RESTRICT pSrcData,
6,037✔
2600
                                 int nSrcPixelStride,
2601
                                 uint8_t *const CPL_RESTRICT pDstData,
2602
                                 int nDstPixelStride, GPtrDiff_t nWordCount)
2603
{
2604
    if (nSrcPixelStride == static_cast<int>(sizeof(*pSrcData)) &&
6,037✔
2605
        nDstPixelStride == static_cast<int>(sizeof(*pDstData)))
2606
    {
2607
        decltype(nWordCount) n = 0;
5,062✔
2608
#if defined(__SSE4_1__) || defined(USE_NEON_OPTIMIZATIONS)
2609
        const auto xmm_MAX_INT16 = _mm_set1_epi16(32767);
2610
#else
2611
        // In SSE2, min_epu16 does not exist, so shift from
2612
        // UInt16 to SInt16 to be able to use min_epi16
2613
        const __m128i xmm_UINT16_to_INT16 = _mm_set1_epi16(-32768);
5,062✔
2614
        const __m128i xmm_m255_shifted = _mm_set1_epi16(255 - 32768);
5,062✔
2615
#endif
2616
        for (; n < nWordCount - 15; n += 16)
71,888✔
2617
        {
2618
            __m128i xmm0 = _mm_loadu_si128(
133,652✔
2619
                reinterpret_cast<const __m128i *>(pSrcData + n));
66,826✔
2620
            __m128i xmm1 = _mm_loadu_si128(
66,826✔
2621
                reinterpret_cast<const __m128i *>(pSrcData + n + 8));
66,826✔
2622
#if defined(__SSE4_1__) || defined(USE_NEON_OPTIMIZATIONS)
2623
            xmm0 = _mm_min_epu16(xmm0, xmm_MAX_INT16);
2624
            xmm1 = _mm_min_epu16(xmm1, xmm_MAX_INT16);
2625
#else
2626
            xmm0 = _mm_add_epi16(xmm0, xmm_UINT16_to_INT16);
66,826✔
2627
            xmm1 = _mm_add_epi16(xmm1, xmm_UINT16_to_INT16);
66,826✔
2628
            xmm0 = _mm_min_epi16(xmm0, xmm_m255_shifted);
66,826✔
2629
            xmm1 = _mm_min_epi16(xmm1, xmm_m255_shifted);
66,826✔
2630
            xmm0 = _mm_sub_epi16(xmm0, xmm_UINT16_to_INT16);
66,826✔
2631
            xmm1 = _mm_sub_epi16(xmm1, xmm_UINT16_to_INT16);
66,826✔
2632
#endif
2633
            xmm0 = _mm_packus_epi16(xmm0, xmm1);
66,826✔
2634
            _mm_storeu_si128(reinterpret_cast<__m128i *>(pDstData + n), xmm0);
66,826✔
2635
        }
2636
        for (; n < nWordCount; n++)
16,403✔
2637
        {
2638
            pDstData[n] =
11,341✔
2639
                pSrcData[n] >= 255 ? 255 : static_cast<uint8_t>(pSrcData[n]);
11,341✔
2640
        }
5,062✔
2641
    }
2642
    else
2643
    {
2644
        GDALCopyWordsGenericT(pSrcData, nSrcPixelStride, pDstData,
975✔
2645
                              nDstPixelStride, nWordCount);
2646
    }
2647
}
6,037✔
2648

2649
template <>
2650
CPL_NOINLINE void GDALCopyWordsT(const uint16_t *const CPL_RESTRICT pSrcData,
46✔
2651
                                 int nSrcPixelStride,
2652
                                 int16_t *const CPL_RESTRICT pDstData,
2653
                                 int nDstPixelStride, GPtrDiff_t nWordCount)
2654
{
2655
    if (nSrcPixelStride == static_cast<int>(sizeof(*pSrcData)) &&
46✔
2656
        nDstPixelStride == static_cast<int>(sizeof(*pDstData)))
2657
    {
2658
        decltype(nWordCount) n = 0;
40✔
2659
#if defined(__SSE4_1__) || defined(USE_NEON_OPTIMIZATIONS)
2660
        const __m128i xmm_MAX_INT16 = _mm_set1_epi16(32767);
2661
#else
2662
        // In SSE2, min_epu16 does not exist, so shift from
2663
        // UInt16 to SInt16 to be able to use min_epi16
2664
        const __m128i xmm_UINT16_to_INT16 = _mm_set1_epi16(-32768);
40✔
2665
        const __m128i xmm_32767_shifted = _mm_set1_epi16(32767 - 32768);
40✔
2666
#endif
2667
        for (; n < nWordCount - 15; n += 16)
169✔
2668
        {
2669
            __m128i xmm0 = _mm_loadu_si128(
258✔
2670
                reinterpret_cast<const __m128i *>(pSrcData + n));
129✔
2671
            __m128i xmm1 = _mm_loadu_si128(
129✔
2672
                reinterpret_cast<const __m128i *>(pSrcData + n + 8));
129✔
2673
#if defined(__SSE4_1__) || defined(USE_NEON_OPTIMIZATIONS)
2674
            xmm0 = _mm_min_epu16(xmm0, xmm_MAX_INT16);
2675
            xmm1 = _mm_min_epu16(xmm1, xmm_MAX_INT16);
2676
#else
2677
            xmm0 = _mm_add_epi16(xmm0, xmm_UINT16_to_INT16);
129✔
2678
            xmm1 = _mm_add_epi16(xmm1, xmm_UINT16_to_INT16);
129✔
2679
            xmm0 = _mm_min_epi16(xmm0, xmm_32767_shifted);
129✔
2680
            xmm1 = _mm_min_epi16(xmm1, xmm_32767_shifted);
129✔
2681
            xmm0 = _mm_sub_epi16(xmm0, xmm_UINT16_to_INT16);
129✔
2682
            xmm1 = _mm_sub_epi16(xmm1, xmm_UINT16_to_INT16);
129✔
2683
#endif
2684
            _mm_storeu_si128(reinterpret_cast<__m128i *>(pDstData + n), xmm0);
129✔
2685
            _mm_storeu_si128(reinterpret_cast<__m128i *>(pDstData + n + 8),
129✔
2686
                             xmm1);
2687
        }
2688
        for (; n < nWordCount; n++)
191✔
2689
        {
2690
            pDstData[n] = pSrcData[n] >= 32767
282✔
2691
                              ? 32767
2692
                              : static_cast<int16_t>(pSrcData[n]);
131✔
2693
        }
40✔
2694
    }
2695
    else
2696
    {
2697
        GDALCopyWordsGenericT(pSrcData, nSrcPixelStride, pDstData,
6✔
2698
                              nDstPixelStride, nWordCount);
2699
    }
2700
}
46✔
2701

2702
template <>
2703
CPL_NOINLINE void GDALCopyWordsT(const int16_t *const CPL_RESTRICT pSrcData,
135✔
2704
                                 int nSrcPixelStride,
2705
                                 uint16_t *const CPL_RESTRICT pDstData,
2706
                                 int nDstPixelStride, GPtrDiff_t nWordCount)
2707
{
2708
    if (nSrcPixelStride == static_cast<int>(sizeof(*pSrcData)) &&
135✔
2709
        nDstPixelStride == static_cast<int>(sizeof(*pDstData)))
2710
    {
2711
        decltype(nWordCount) n = 0;
92✔
2712
        const __m128i xmm_zero = _mm_setzero_si128();
92✔
2713
        for (; n < nWordCount - 15; n += 16)
277✔
2714
        {
2715
            __m128i xmm0 = _mm_loadu_si128(
370✔
2716
                reinterpret_cast<const __m128i *>(pSrcData + n));
185✔
2717
            __m128i xmm1 = _mm_loadu_si128(
185✔
2718
                reinterpret_cast<const __m128i *>(pSrcData + n + 8));
185✔
2719
            xmm0 = _mm_max_epi16(xmm0, xmm_zero);
185✔
2720
            xmm1 = _mm_max_epi16(xmm1, xmm_zero);
185✔
2721
            _mm_storeu_si128(reinterpret_cast<__m128i *>(pDstData + n), xmm0);
185✔
2722
            _mm_storeu_si128(reinterpret_cast<__m128i *>(pDstData + n + 8),
185✔
2723
                             xmm1);
2724
        }
2725
        for (; n < nWordCount; n++)
468✔
2726
        {
2727
            pDstData[n] =
376✔
2728
                pSrcData[n] < 0 ? 0 : static_cast<uint16_t>(pSrcData[n]);
376✔
2729
        }
92✔
2730
    }
2731
    else
2732
    {
2733
        GDALCopyWordsGenericT(pSrcData, nSrcPixelStride, pDstData,
43✔
2734
                              nDstPixelStride, nWordCount);
2735
    }
2736
}
135✔
2737

2738
#if defined(__SSE4_1__) || defined(USE_NEON_OPTIMIZATIONS)
2739

2740
template <>
2741
CPL_NOINLINE void GDALCopyWordsT(const uint32_t *const CPL_RESTRICT pSrcData,
2742
                                 int nSrcPixelStride,
2743
                                 int32_t *const CPL_RESTRICT pDstData,
2744
                                 int nDstPixelStride, GPtrDiff_t nWordCount)
2745
{
2746
    if (nSrcPixelStride == static_cast<int>(sizeof(*pSrcData)) &&
2747
        nDstPixelStride == static_cast<int>(sizeof(*pDstData)))
2748
    {
2749
        decltype(nWordCount) n = 0;
2750
        const __m128i xmm_MAX_INT = _mm_set1_epi32(INT_MAX);
2751
        for (; n < nWordCount - 8; n += 7)
2752
        {
2753
            __m128i xmm0 = _mm_loadu_si128(
2754
                reinterpret_cast<const __m128i *>(pSrcData + n));
2755
            __m128i xmm1 = _mm_loadu_si128(
2756
                reinterpret_cast<const __m128i *>(pSrcData + n + 4));
2757
            xmm0 = _mm_min_epu32(xmm0, xmm_MAX_INT);
2758
            xmm1 = _mm_min_epu32(xmm1, xmm_MAX_INT);
2759
            _mm_storeu_si128(reinterpret_cast<__m128i *>(pDstData + n), xmm0);
2760
            _mm_storeu_si128(reinterpret_cast<__m128i *>(pDstData + n + 4),
2761
                             xmm1);
2762
        }
2763
        for (; n < nWordCount; n++)
2764
        {
2765
            pDstData[n] = pSrcData[n] >= INT_MAX
2766
                              ? INT_MAX
2767
                              : static_cast<int32_t>(pSrcData[n]);
2768
        }
2769
    }
2770
    else
2771
    {
2772
        GDALCopyWordsGenericT(pSrcData, nSrcPixelStride, pDstData,
2773
                              nDstPixelStride, nWordCount);
2774
    }
2775
}
2776

2777
template <>
2778
CPL_NOINLINE void GDALCopyWordsT(const int32_t *const CPL_RESTRICT pSrcData,
2779
                                 int nSrcPixelStride,
2780
                                 uint32_t *const CPL_RESTRICT pDstData,
2781
                                 int nDstPixelStride, GPtrDiff_t nWordCount)
2782
{
2783
    if (nSrcPixelStride == static_cast<int>(sizeof(*pSrcData)) &&
2784
        nDstPixelStride == static_cast<int>(sizeof(*pDstData)))
2785
    {
2786
        decltype(nWordCount) n = 0;
2787
        const __m128i xmm_zero = _mm_setzero_si128();
2788
        for (; n < nWordCount - 7; n += 8)
2789
        {
2790
            __m128i xmm0 = _mm_loadu_si128(
2791
                reinterpret_cast<const __m128i *>(pSrcData + n));
2792
            __m128i xmm1 = _mm_loadu_si128(
2793
                reinterpret_cast<const __m128i *>(pSrcData + n + 4));
2794
            xmm0 = _mm_max_epi32(xmm0, xmm_zero);
2795
            xmm1 = _mm_max_epi32(xmm1, xmm_zero);
2796
            _mm_storeu_si128(reinterpret_cast<__m128i *>(pDstData + n), xmm0);
2797
            _mm_storeu_si128(reinterpret_cast<__m128i *>(pDstData + n + 4),
2798
                             xmm1);
2799
        }
2800
        for (; n < nWordCount; n++)
2801
        {
2802
            pDstData[n] =
2803
                pSrcData[n] < 0 ? 0 : static_cast<uint32_t>(pSrcData[n]);
2804
        }
2805
    }
2806
    else
2807
    {
2808
        GDALCopyWordsGenericT(pSrcData, nSrcPixelStride, pDstData,
2809
                              nDstPixelStride, nWordCount);
2810
    }
2811
}
2812

2813
#endif  // defined(__SSE4_1__) || defined(USE_NEON_OPTIMIZATIONS)
2814

2815
template <>
2816
CPL_NOINLINE void GDALCopyWordsT(const uint16_t *const CPL_RESTRICT pSrcData,
339✔
2817
                                 int nSrcPixelStride,
2818
                                 float *const CPL_RESTRICT pDstData,
2819
                                 int nDstPixelStride, GPtrDiff_t nWordCount)
2820
{
2821
    if (nSrcPixelStride == static_cast<int>(sizeof(*pSrcData)) &&
339✔
2822
        nDstPixelStride == static_cast<int>(sizeof(*pDstData)))
2823
    {
2824
        decltype(nWordCount) n = 0;
333✔
2825
        const __m128i xmm_zero = _mm_setzero_si128();
333✔
2826
        GByte *CPL_RESTRICT pabyDstDataPtr =
333✔
2827
            reinterpret_cast<GByte *>(pDstData);
2828
        for (; n < nWordCount - 7; n += 8)
1,472✔
2829
        {
2830
            __m128i xmm = _mm_loadu_si128(
1,139✔
2831
                reinterpret_cast<const __m128i *>(pSrcData + n));
1,139✔
2832
            __m128i xmm0 = _mm_unpacklo_epi16(xmm, xmm_zero);
1,139✔
2833
            __m128i xmm1 = _mm_unpackhi_epi16(xmm, xmm_zero);
1,139✔
2834
            __m128 xmm0_f = _mm_cvtepi32_ps(xmm0);
1,139✔
2835
            __m128 xmm1_f = _mm_cvtepi32_ps(xmm1);
1,139✔
2836
            _mm_storeu_ps(reinterpret_cast<float *>(pabyDstDataPtr + n * 4),
1,139✔
2837
                          xmm0_f);
2838
            _mm_storeu_ps(
2839
                reinterpret_cast<float *>(pabyDstDataPtr + n * 4 + 16), xmm1_f);
1,139✔
2840
        }
2841
        for (; n < nWordCount; n++)
1,099✔
2842
        {
2843
            pDstData[n] = pSrcData[n];
766✔
2844
        }
333✔
2845
    }
2846
    else
2847
    {
2848
        GDALCopyWordsGenericT(pSrcData, nSrcPixelStride, pDstData,
6✔
2849
                              nDstPixelStride, nWordCount);
2850
    }
2851
}
339✔
2852

2853
template <>
2854
CPL_NOINLINE void GDALCopyWordsT(const int16_t *const CPL_RESTRICT pSrcData,
1,072,750✔
2855
                                 int nSrcPixelStride,
2856
                                 float *const CPL_RESTRICT pDstData,
2857
                                 int nDstPixelStride, GPtrDiff_t nWordCount)
2858
{
2859
    if (nSrcPixelStride == static_cast<int>(sizeof(*pSrcData)) &&
1,072,750✔
2860
        nDstPixelStride == static_cast<int>(sizeof(*pDstData)))
2861
    {
2862
        decltype(nWordCount) n = 0;
82,850✔
2863
        GByte *CPL_RESTRICT pabyDstDataPtr =
82,850✔
2864
            reinterpret_cast<GByte *>(pDstData);
2865
        for (; n < nWordCount - 7; n += 8)
553,615✔
2866
        {
2867
            __m128i xmm = _mm_loadu_si128(
470,765✔
2868
                reinterpret_cast<const __m128i *>(pSrcData + n));
470,765✔
2869
            const auto sign = _mm_srai_epi16(xmm, 15);
470,765✔
2870
            __m128i xmm0 = _mm_unpacklo_epi16(xmm, sign);
470,765✔
2871
            __m128i xmm1 = _mm_unpackhi_epi16(xmm, sign);
470,765✔
2872
            __m128 xmm0_f = _mm_cvtepi32_ps(xmm0);
470,765✔
2873
            __m128 xmm1_f = _mm_cvtepi32_ps(xmm1);
470,765✔
2874
            _mm_storeu_ps(reinterpret_cast<float *>(pabyDstDataPtr + n * 4),
470,765✔
2875
                          xmm0_f);
2876
            _mm_storeu_ps(
2877
                reinterpret_cast<float *>(pabyDstDataPtr + n * 4 + 16), xmm1_f);
470,765✔
2878
        }
2879
        for (; n < nWordCount; n++)
242,713✔
2880
        {
2881
            pDstData[n] = pSrcData[n];
159,863✔
2882
        }
82,850✔
2883
    }
2884
    else
2885
    {
2886
        GDALCopyWordsGenericT(pSrcData, nSrcPixelStride, pDstData,
989,901✔
2887
                              nDstPixelStride, nWordCount);
2888
    }
2889
}
1,072,750✔
2890

2891
template <>
2892
CPL_NOINLINE void GDALCopyWordsT(const uint16_t *const CPL_RESTRICT pSrcData,
370✔
2893
                                 int nSrcPixelStride,
2894
                                 double *const CPL_RESTRICT pDstData,
2895
                                 int nDstPixelStride, GPtrDiff_t nWordCount)
2896
{
2897
    if (nSrcPixelStride == static_cast<int>(sizeof(*pSrcData)) &&
370✔
2898
        nDstPixelStride == static_cast<int>(sizeof(*pDstData)))
2899
    {
2900
        decltype(nWordCount) n = 0;
259✔
2901
        const __m128i xmm_zero = _mm_setzero_si128();
259✔
2902
        GByte *CPL_RESTRICT pabyDstDataPtr =
259✔
2903
            reinterpret_cast<GByte *>(pDstData);
2904
        for (; n < nWordCount - 7; n += 8)
693✔
2905
        {
2906
            __m128i xmm = _mm_loadu_si128(
434✔
2907
                reinterpret_cast<const __m128i *>(pSrcData + n));
434✔
2908
            __m128i xmm0 = _mm_unpacklo_epi16(xmm, xmm_zero);
434✔
2909
            __m128i xmm1 = _mm_unpackhi_epi16(xmm, xmm_zero);
434✔
2910

2911
            __m128d xmm0_low_d = _mm_cvtepi32_pd(xmm0);
434✔
2912
            __m128d xmm1_low_d = _mm_cvtepi32_pd(xmm1);
434✔
2913
            xmm0 = _mm_srli_si128(xmm0, 8);
434✔
2914
            xmm1 = _mm_srli_si128(xmm1, 8);
434✔
2915
            __m128d xmm0_high_d = _mm_cvtepi32_pd(xmm0);
434✔
2916
            __m128d xmm1_high_d = _mm_cvtepi32_pd(xmm1);
434✔
2917

2918
            _mm_storeu_pd(reinterpret_cast<double *>(pabyDstDataPtr + n * 8),
434✔
2919
                          xmm0_low_d);
2920
            _mm_storeu_pd(
2921
                reinterpret_cast<double *>(pabyDstDataPtr + n * 8 + 16),
434✔
2922
                xmm0_high_d);
2923
            _mm_storeu_pd(
2924
                reinterpret_cast<double *>(pabyDstDataPtr + n * 8 + 32),
434✔
2925
                xmm1_low_d);
2926
            _mm_storeu_pd(
2927
                reinterpret_cast<double *>(pabyDstDataPtr + n * 8 + 48),
434✔
2928
                xmm1_high_d);
2929
        }
2930
        for (; n < nWordCount; n++)
888✔
2931
        {
2932
            pDstData[n] = pSrcData[n];
629✔
2933
        }
259✔
2934
    }
2935
    else
2936
    {
2937
        GDALCopyWordsGenericT(pSrcData, nSrcPixelStride, pDstData,
111✔
2938
                              nDstPixelStride, nWordCount);
2939
    }
2940
}
370✔
2941

2942
template <>
2943
CPL_NOINLINE void GDALCopyWordsT(const int16_t *const CPL_RESTRICT pSrcData,
2,760,080✔
2944
                                 int nSrcPixelStride,
2945
                                 double *const CPL_RESTRICT pDstData,
2946
                                 int nDstPixelStride, GPtrDiff_t nWordCount)
2947
{
2948
    if (nSrcPixelStride == static_cast<int>(sizeof(*pSrcData)) &&
2,760,080✔
2949
        nDstPixelStride == static_cast<int>(sizeof(*pDstData)))
2950
    {
2951
        decltype(nWordCount) n = 0;
34,381✔
2952
        GByte *CPL_RESTRICT pabyDstDataPtr =
34,381✔
2953
            reinterpret_cast<GByte *>(pDstData);
2954
        for (; n < nWordCount - 7; n += 8)
400,726✔
2955
        {
2956
            __m128i xmm = _mm_loadu_si128(
366,508✔
2957
                reinterpret_cast<const __m128i *>(pSrcData + n));
366,508✔
2958
            const auto sign = _mm_srai_epi16(xmm, 15);
366,334✔
2959
            __m128i xmm0 = _mm_unpacklo_epi16(xmm, sign);
366,430✔
2960
            __m128i xmm1 = _mm_unpackhi_epi16(xmm, sign);
366,306✔
2961

2962
            __m128d xmm0_low_d = _mm_cvtepi32_pd(xmm0);
366,157✔
2963
            __m128d xmm1_low_d = _mm_cvtepi32_pd(xmm1);
366,132✔
2964
            xmm0 = _mm_srli_si128(xmm0, 8);
366,132✔
2965
            xmm1 = _mm_srli_si128(xmm1, 8);
366,261✔
2966
            __m128d xmm0_high_d = _mm_cvtepi32_pd(xmm0);
366,363✔
2967
            __m128d xmm1_high_d = _mm_cvtepi32_pd(xmm1);
366,345✔
2968

2969
            _mm_storeu_pd(reinterpret_cast<double *>(pabyDstDataPtr + n * 8),
366,345✔
2970
                          xmm0_low_d);
2971
            _mm_storeu_pd(
2972
                reinterpret_cast<double *>(pabyDstDataPtr + n * 8 + 16),
366,345✔
2973
                xmm0_high_d);
2974
            _mm_storeu_pd(
2975
                reinterpret_cast<double *>(pabyDstDataPtr + n * 8 + 32),
366,345✔
2976
                xmm1_low_d);
2977
            _mm_storeu_pd(
2978
                reinterpret_cast<double *>(pabyDstDataPtr + n * 8 + 48),
366,345✔
2979
                xmm1_high_d);
2980
        }
2981
        for (; n < nWordCount; n++)
251,878✔
2982
        {
2983
            pDstData[n] = pSrcData[n];
217,660✔
2984
        }
34,218✔
2985
    }
2986
    else
2987
    {
2988
        GDALCopyWordsGenericT(pSrcData, nSrcPixelStride, pDstData,
2,725,700✔
2989
                              nDstPixelStride, nWordCount);
2990
    }
2991
}
2,759,900✔
2992

2993
template <>
2994
CPL_NOINLINE void GDALCopyWordsT(const double *const CPL_RESTRICT pSrcData,
4,388,580✔
2995
                                 int nSrcPixelStride,
2996
                                 GByte *const CPL_RESTRICT pDstData,
2997
                                 int nDstPixelStride, GPtrDiff_t nWordCount)
2998
{
2999
    GDALCopyWordsT_8atatime(pSrcData, nSrcPixelStride, pDstData,
4,388,580✔
3000
                            nDstPixelStride, nWordCount);
3001
}
4,388,680✔
3002

3003
template <>
3004
CPL_NOINLINE void GDALCopyWordsT(const double *const CPL_RESTRICT pSrcData,
38,353✔
3005
                                 int nSrcPixelStride,
3006
                                 GUInt16 *const CPL_RESTRICT pDstData,
3007
                                 int nDstPixelStride, GPtrDiff_t nWordCount)
3008
{
3009
    GDALCopyWordsT_8atatime(pSrcData, nSrcPixelStride, pDstData,
38,353✔
3010
                            nDstPixelStride, nWordCount);
3011
}
38,353✔
3012

3013
template <>
3014
CPL_NOINLINE void GDALCopyWordsT(const float *const CPL_RESTRICT pSrcData,
51,506✔
3015
                                 int nSrcPixelStride,
3016
                                 double *const CPL_RESTRICT pDstData,
3017
                                 int nDstPixelStride, GPtrDiff_t nWordCount)
3018
{
3019
    GDALCopyWordsT_8atatime(pSrcData, nSrcPixelStride, pDstData,
51,506✔
3020
                            nDstPixelStride, nWordCount);
3021
}
51,506✔
3022

3023
template <>
3024
CPL_NOINLINE void GDALCopyWordsT(const double *const CPL_RESTRICT pSrcData,
121,357✔
3025
                                 int nSrcPixelStride,
3026
                                 float *const CPL_RESTRICT pDstData,
3027
                                 int nDstPixelStride, GPtrDiff_t nWordCount)
3028
{
3029
    GDALCopyWordsT_8atatime(pSrcData, nSrcPixelStride, pDstData,
121,357✔
3030
                            nDstPixelStride, nWordCount);
3031
}
121,357✔
3032

3033
template <>
3034
CPL_NOINLINE void GDALCopyWordsT(const GFloat16 *const CPL_RESTRICT pSrcData,
111✔
3035
                                 int nSrcPixelStride,
3036
                                 float *const CPL_RESTRICT pDstData,
3037
                                 int nDstPixelStride, GPtrDiff_t nWordCount)
3038
{
3039
    GDALCopyWordsT_8atatime(pSrcData, nSrcPixelStride, pDstData,
111✔
3040
                            nDstPixelStride, nWordCount);
3041
}
111✔
3042

3043
template <>
3044
CPL_NOINLINE void GDALCopyWordsT(const GFloat16 *const CPL_RESTRICT pSrcData,
532✔
3045
                                 int nSrcPixelStride,
3046
                                 double *const CPL_RESTRICT pDstData,
3047
                                 int nDstPixelStride, GPtrDiff_t nWordCount)
3048
{
3049
    GDALCopyWordsT_8atatime(pSrcData, nSrcPixelStride, pDstData,
532✔
3050
                            nDstPixelStride, nWordCount);
3051
}
532✔
3052

3053
#ifdef __F16C__
3054

3055
template <>
3056
CPL_NOINLINE void GDALCopyWordsT(const float *const CPL_RESTRICT pSrcData,
3057
                                 int nSrcPixelStride,
3058
                                 GFloat16 *const CPL_RESTRICT pDstData,
3059
                                 int nDstPixelStride, GPtrDiff_t nWordCount)
3060
{
3061
    GDALCopyWordsT_8atatime(pSrcData, nSrcPixelStride, pDstData,
3062
                            nDstPixelStride, nWordCount);
3063
}
3064

3065
template <>
3066
CPL_NOINLINE void GDALCopyWordsT(const double *const CPL_RESTRICT pSrcData,
3067
                                 int nSrcPixelStride,
3068
                                 GFloat16 *const CPL_RESTRICT pDstData,
3069
                                 int nDstPixelStride, GPtrDiff_t nWordCount)
3070
{
3071
    GDALCopyWordsT_8atatime(pSrcData, nSrcPixelStride, pDstData,
3072
                            nDstPixelStride, nWordCount);
3073
}
3074

3075
#endif  // __F16C__
3076

3077
#endif  // HAVE_SSE2
3078

3079
template <>
3080
CPL_NOINLINE void GDALCopyWordsT(const float *const CPL_RESTRICT pSrcData,
190,085✔
3081
                                 int nSrcPixelStride,
3082
                                 GByte *const CPL_RESTRICT pDstData,
3083
                                 int nDstPixelStride, GPtrDiff_t nWordCount)
3084
{
3085
    GDALCopyWordsT_8atatime(pSrcData, nSrcPixelStride, pDstData,
190,085✔
3086
                            nDstPixelStride, nWordCount);
3087
}
190,086✔
3088

3089
template <>
3090
CPL_NOINLINE void GDALCopyWordsT(const float *const CPL_RESTRICT pSrcData,
15,775✔
3091
                                 int nSrcPixelStride,
3092
                                 GInt16 *const CPL_RESTRICT pDstData,
3093
                                 int nDstPixelStride, GPtrDiff_t nWordCount)
3094
{
3095
    GDALCopyWordsT_8atatime(pSrcData, nSrcPixelStride, pDstData,
15,775✔
3096
                            nDstPixelStride, nWordCount);
3097
}
15,775✔
3098

3099
template <>
3100
CPL_NOINLINE void GDALCopyWordsT(const float *const CPL_RESTRICT pSrcData,
61,689✔
3101
                                 int nSrcPixelStride,
3102
                                 GUInt16 *const CPL_RESTRICT pDstData,
3103
                                 int nDstPixelStride, GPtrDiff_t nWordCount)
3104
{
3105
    GDALCopyWordsT_8atatime(pSrcData, nSrcPixelStride, pDstData,
61,689✔
3106
                            nDstPixelStride, nWordCount);
3107
}
61,687✔
3108

3109
/************************************************************************/
3110
/*                   GDALCopyWordsComplexT()                            */
3111
/************************************************************************/
3112
/**
3113
 * Template function, used to copy data from pSrcData into buffer
3114
 * pDstData, with stride nSrcPixelStride in the source data and
3115
 * stride nDstPixelStride in the destination data. Deals with the
3116
 * complex case, where input is complex and output is complex.
3117
 *
3118
 * @param pSrcData the source data buffer
3119
 * @param nSrcPixelStride the stride, in the buffer pSrcData for pixels
3120
 *                      of interest.
3121
 * @param pDstData the destination buffer.
3122
 * @param nDstPixelStride the stride in the buffer pDstData for pixels of
3123
 *                      interest.
3124
 * @param nWordCount the total number of pixel words to copy
3125
 *
3126
 */
3127
template <class Tin, class Tout>
3128
inline void GDALCopyWordsComplexT(const Tin *const CPL_RESTRICT pSrcData,
96,717✔
3129
                                  int nSrcPixelStride,
3130
                                  Tout *const CPL_RESTRICT pDstData,
3131
                                  int nDstPixelStride, GPtrDiff_t nWordCount)
3132
{
3133
    decltype(nWordCount) nDstOffset = 0;
96,717✔
3134
    const char *const pSrcDataPtr = reinterpret_cast<const char *>(pSrcData);
96,717✔
3135
    char *const pDstDataPtr = reinterpret_cast<char *>(pDstData);
96,717✔
3136

3137
    for (decltype(nWordCount) n = 0; n < nWordCount; n++)
5,243,171✔
3138
    {
3139
        const Tin *const pPixelIn =
5,146,449✔
3140
            reinterpret_cast<const Tin *>(pSrcDataPtr + n * nSrcPixelStride);
5,146,449✔
3141
        Tout *const pPixelOut =
5,146,449✔
3142
            reinterpret_cast<Tout *>(pDstDataPtr + nDstOffset);
5,146,449✔
3143

3144
        GDALCopyWord(pPixelIn[0], pPixelOut[0]);
5,146,449✔
3145
        GDALCopyWord(pPixelIn[1], pPixelOut[1]);
5,146,449✔
3146

3147
        nDstOffset += nDstPixelStride;
5,146,449✔
3148
    }
3149
}
96,717✔
3150

3151
/************************************************************************/
3152
/*                   GDALCopyWordsComplexOutT()                         */
3153
/************************************************************************/
3154
/**
3155
 * Template function, used to copy data from pSrcData into buffer
3156
 * pDstData, with stride nSrcPixelStride in the source data and
3157
 * stride nDstPixelStride in the destination data. Deals with the
3158
 * case where the value is real coming in, but complex going out.
3159
 *
3160
 * @param pSrcData the source data buffer
3161
 * @param nSrcPixelStride the stride, in the buffer pSrcData for pixels
3162
 *                      of interest, in bytes.
3163
 * @param pDstData the destination buffer.
3164
 * @param nDstPixelStride the stride in the buffer pDstData for pixels of
3165
 *                      interest, in bytes.
3166
 * @param nWordCount the total number of pixel words to copy
3167
 *
3168
 */
3169
template <class Tin, class Tout>
3170
inline void GDALCopyWordsComplexOutT(const Tin *const CPL_RESTRICT pSrcData,
3,877✔
3171
                                     int nSrcPixelStride,
3172
                                     Tout *const CPL_RESTRICT pDstData,
3173
                                     int nDstPixelStride, GPtrDiff_t nWordCount)
3174
{
3175
    decltype(nWordCount) nDstOffset = 0;
3,877✔
3176

3177
    const Tout tOutZero = static_cast<Tout>(0);
3,877✔
3178

3179
    const char *const pSrcDataPtr = reinterpret_cast<const char *>(pSrcData);
3,877✔
3180
    char *const pDstDataPtr = reinterpret_cast<char *>(pDstData);
3,877✔
3181

3182
    for (decltype(nWordCount) n = 0; n < nWordCount; n++)
1,155,414✔
3183
    {
3184
        const Tin tValue =
1,151,537✔
3185
            *reinterpret_cast<const Tin *>(pSrcDataPtr + n * nSrcPixelStride);
1,151,537✔
3186
        Tout *const pPixelOut =
1,151,537✔
3187
            reinterpret_cast<Tout *>(pDstDataPtr + nDstOffset);
1,151,537✔
3188
        GDALCopyWord(tValue, *pPixelOut);
1,151,537✔
3189

3190
        pPixelOut[1] = tOutZero;
1,151,537✔
3191

3192
        nDstOffset += nDstPixelStride;
1,151,537✔
3193
    }
3194
}
3,877✔
3195

3196
/************************************************************************/
3197
/*                           GDALCopyWordsFromT()                       */
3198
/************************************************************************/
3199
/**
3200
 * Template driver function. Given the input type T, call the appropriate
3201
 * GDALCopyWordsT function template for the desired output type. You should
3202
 * never call this function directly (call GDALCopyWords instead).
3203
 *
3204
 * @param pSrcData source data buffer
3205
 * @param nSrcPixelStride pixel stride in input buffer, in pixel words
3206
 * @param bInComplex input is complex
3207
 * @param pDstData destination data buffer
3208
 * @param eDstType destination data type
3209
 * @param nDstPixelStride pixel stride in output buffer, in pixel words
3210
 * @param nWordCount number of pixel words to be copied
3211
 */
3212
template <class T>
3213
inline void GDALCopyWordsFromT(const T *const CPL_RESTRICT pSrcData,
53,984,861✔
3214
                               int nSrcPixelStride, bool bInComplex,
3215
                               void *CPL_RESTRICT pDstData,
3216
                               GDALDataType eDstType, int nDstPixelStride,
3217
                               GPtrDiff_t nWordCount)
3218
{
3219
    switch (eDstType)
53,984,861✔
3220
    {
3221
        case GDT_Byte:
4,623,403✔
3222
            GDALCopyWordsT(pSrcData, nSrcPixelStride,
4,623,403✔
3223
                           static_cast<unsigned char *>(pDstData),
3224
                           nDstPixelStride, nWordCount);
3225
            break;
4,623,485✔
3226
        case GDT_Int8:
751✔
3227
            GDALCopyWordsT(pSrcData, nSrcPixelStride,
751✔
3228
                           static_cast<signed char *>(pDstData),
3229
                           nDstPixelStride, nWordCount);
3230
            break;
751✔
3231
        case GDT_UInt16:
140,690✔
3232
            GDALCopyWordsT(pSrcData, nSrcPixelStride,
140,690✔
3233
                           static_cast<unsigned short *>(pDstData),
3234
                           nDstPixelStride, nWordCount);
3235
            break;
140,688✔
3236
        case GDT_Int16:
4,162,813✔
3237
            GDALCopyWordsT(pSrcData, nSrcPixelStride,
4,162,813✔
3238
                           static_cast<short *>(pDstData), nDstPixelStride,
3239
                           nWordCount);
3240
            break;
4,162,813✔
3241
        case GDT_UInt32:
22,223✔
3242
            GDALCopyWordsT(pSrcData, nSrcPixelStride,
22,223✔
3243
                           static_cast<unsigned int *>(pDstData),
3244
                           nDstPixelStride, nWordCount);
3245
            break;
22,223✔
3246
        case GDT_Int32:
26,005,468✔
3247
            GDALCopyWordsT(pSrcData, nSrcPixelStride,
26,005,468✔
3248
                           static_cast<int *>(pDstData), nDstPixelStride,
3249
                           nWordCount);
3250
            break;
26,037,969✔
3251
        case GDT_UInt64:
809✔
3252
            GDALCopyWordsT(pSrcData, nSrcPixelStride,
809✔
3253
                           static_cast<std::uint64_t *>(pDstData),
3254
                           nDstPixelStride, nWordCount);
3255
            break;
809✔
3256
        case GDT_Int64:
5,181✔
3257
            GDALCopyWordsT(pSrcData, nSrcPixelStride,
5,181✔
3258
                           static_cast<std::int64_t *>(pDstData),
3259
                           nDstPixelStride, nWordCount);
3260
            break;
5,181✔
3261
        case GDT_Float16:
940✔
3262
            GDALCopyWordsT(pSrcData, nSrcPixelStride,
940✔
3263
                           static_cast<GFloat16 *>(pDstData), nDstPixelStride,
3264
                           nWordCount);
3265
            break;
940✔
3266
        case GDT_Float32:
3,699,564✔
3267
            GDALCopyWordsT(pSrcData, nSrcPixelStride,
3,699,564✔
3268
                           static_cast<float *>(pDstData), nDstPixelStride,
3269
                           nWordCount);
3270
            break;
3,699,564✔
3271
        case GDT_Float64:
15,195,169✔
3272
            GDALCopyWordsT(pSrcData, nSrcPixelStride,
15,195,169✔
3273
                           static_cast<double *>(pDstData), nDstPixelStride,
3274
                           nWordCount);
3275
            break;
15,195,209✔
3276
        case GDT_CInt16:
94,123✔
3277
            if (bInComplex)
94,123✔
3278
            {
3279
                GDALCopyWordsComplexT(pSrcData, nSrcPixelStride,
92,870✔
3280
                                      static_cast<short *>(pDstData),
3281
                                      nDstPixelStride, nWordCount);
3282
            }
3283
            else  // input is not complex, so we need to promote to a complex
3284
                  // buffer
3285
            {
3286
                GDALCopyWordsComplexOutT(pSrcData, nSrcPixelStride,
1,253✔
3287
                                         static_cast<short *>(pDstData),
3288
                                         nDstPixelStride, nWordCount);
3289
            }
3290
            break;
94,123✔
3291
        case GDT_CInt32:
1,052✔
3292
            if (bInComplex)
1,052✔
3293
            {
3294
                GDALCopyWordsComplexT(pSrcData, nSrcPixelStride,
421✔
3295
                                      static_cast<int *>(pDstData),
3296
                                      nDstPixelStride, nWordCount);
3297
            }
3298
            else  // input is not complex, so we need to promote to a complex
3299
                  // buffer
3300
            {
3301
                GDALCopyWordsComplexOutT(pSrcData, nSrcPixelStride,
631✔
3302
                                         static_cast<int *>(pDstData),
3303
                                         nDstPixelStride, nWordCount);
3304
            }
3305
            break;
1,052✔
3306
        case GDT_CFloat16:
281✔
3307
            if (bInComplex)
281✔
3308
            {
3309
                GDALCopyWordsComplexT(pSrcData, nSrcPixelStride,
16✔
3310
                                      static_cast<GFloat16 *>(pDstData),
3311
                                      nDstPixelStride, nWordCount);
3312
            }
3313
            else  // input is not complex, so we need to promote to a complex
3314
                  // buffer
3315
            {
3316
                GDALCopyWordsComplexOutT(pSrcData, nSrcPixelStride,
265✔
3317
                                         static_cast<GFloat16 *>(pDstData),
3318
                                         nDstPixelStride, nWordCount);
3319
            }
3320
            break;
281✔
3321
        case GDT_CFloat32:
3,359✔
3322
            if (bInComplex)
3,359✔
3323
            {
3324
                GDALCopyWordsComplexT(pSrcData, nSrcPixelStride,
2,564✔
3325
                                      static_cast<float *>(pDstData),
3326
                                      nDstPixelStride, nWordCount);
3327
            }
3328
            else  // input is not complex, so we need to promote to a complex
3329
                  // buffer
3330
            {
3331
                GDALCopyWordsComplexOutT(pSrcData, nSrcPixelStride,
795✔
3332
                                         static_cast<float *>(pDstData),
3333
                                         nDstPixelStride, nWordCount);
3334
            }
3335
            break;
3,359✔
3336
        case GDT_CFloat64:
1,779✔
3337
            if (bInComplex)
1,779✔
3338
            {
3339
                GDALCopyWordsComplexT(pSrcData, nSrcPixelStride,
846✔
3340
                                      static_cast<double *>(pDstData),
3341
                                      nDstPixelStride, nWordCount);
3342
            }
3343
            else  // input is not complex, so we need to promote to a complex
3344
                  // buffer
3345
            {
3346
                GDALCopyWordsComplexOutT(pSrcData, nSrcPixelStride,
933✔
3347
                                         static_cast<double *>(pDstData),
3348
                                         nDstPixelStride, nWordCount);
3349
            }
3350
            break;
1,779✔
3351
        case GDT_Unknown:
×
3352
        case GDT_TypeCount:
3353
            CPLAssert(false);
×
3354
    }
3355
}
54,017,582✔
3356

3357
}  // end anonymous namespace
3358

3359
/************************************************************************/
3360
/*                          GDALReplicateWord()                         */
3361
/************************************************************************/
3362

3363
template <class T>
3364
inline void GDALReplicateWordT(void *pDstData, int nDstPixelStride,
589,522✔
3365
                               GPtrDiff_t nWordCount)
3366
{
3367
    const T valSet = *static_cast<const T *>(pDstData);
589,522✔
3368
    if (nDstPixelStride == static_cast<int>(sizeof(T)))
589,522✔
3369
    {
3370
        T *pDstPtr = static_cast<T *>(pDstData) + 1;
558,702✔
3371
        while (nWordCount >= 4)
18,592,735✔
3372
        {
3373
            nWordCount -= 4;
18,034,041✔
3374
            pDstPtr[0] = valSet;
18,034,041✔
3375
            pDstPtr[1] = valSet;
18,034,041✔
3376
            pDstPtr[2] = valSet;
18,034,041✔
3377
            pDstPtr[3] = valSet;
18,034,041✔
3378
            pDstPtr += 4;
18,034,041✔
3379
        }
3380
        while (nWordCount > 0)
1,448,361✔
3381
        {
3382
            --nWordCount;
889,659✔
3383
            *pDstPtr = valSet;
889,659✔
3384
            pDstPtr++;
889,659✔
3385
        }
3386
    }
3387
    else
3388
    {
3389
        GByte *pabyDstPtr = static_cast<GByte *>(pDstData) + nDstPixelStride;
30,820✔
3390
        while (nWordCount > 0)
1,041,407✔
3391
        {
3392
            --nWordCount;
1,010,587✔
3393
            *reinterpret_cast<T *>(pabyDstPtr) = valSet;
1,010,587✔
3394
            pabyDstPtr += nDstPixelStride;
1,010,587✔
3395
        }
3396
    }
3397
}
589,522✔
3398

3399
static void GDALReplicateWord(const void *CPL_RESTRICT pSrcData,
1,000,410✔
3400
                              GDALDataType eSrcType,
3401
                              void *CPL_RESTRICT pDstData,
3402
                              GDALDataType eDstType, int nDstPixelStride,
3403
                              GPtrDiff_t nWordCount)
3404
{
3405
    /* -----------------------------------------------------------------------
3406
     */
3407
    /* Special case when the source data is always the same value */
3408
    /* (for VRTSourcedRasterBand::IRasterIO and
3409
     * VRTDerivedRasterBand::IRasterIO*/
3410
    /*  for example) */
3411
    /* -----------------------------------------------------------------------
3412
     */
3413
    // Let the general translation case do the necessary conversions
3414
    // on the first destination element.
3415
    GDALCopyWords64(pSrcData, eSrcType, 0, pDstData, eDstType, 0, 1);
1,000,410✔
3416

3417
    // Now copy the first element to the nWordCount - 1 following destination
3418
    // elements.
3419
    nWordCount--;
1,002,590✔
3420
    GByte *pabyDstWord = reinterpret_cast<GByte *>(pDstData) + nDstPixelStride;
1,002,590✔
3421

3422
    switch (eDstType)
1,002,590✔
3423
    {
3424
        case GDT_Byte:
415,480✔
3425
        case GDT_Int8:
3426
        {
3427
            if (nDstPixelStride == 1)
415,480✔
3428
            {
3429
                if (nWordCount > 0)
375,175✔
3430
                    memset(pabyDstWord,
375,175✔
3431
                           *reinterpret_cast<const GByte *>(pDstData),
375,175✔
3432
                           nWordCount);
3433
            }
3434
            else
3435
            {
3436
                GByte valSet = *reinterpret_cast<const GByte *>(pDstData);
40,305✔
3437
                while (nWordCount > 0)
23,934,200✔
3438
                {
3439
                    --nWordCount;
23,893,900✔
3440
                    *pabyDstWord = valSet;
23,893,900✔
3441
                    pabyDstWord += nDstPixelStride;
23,893,900✔
3442
                }
3443
            }
3444
            break;
415,480✔
3445
        }
3446

3447
#define CASE_DUPLICATE_SIMPLE(enum_type, c_type)                               \
3448
    case enum_type:                                                            \
3449
    {                                                                          \
3450
        GDALReplicateWordT<c_type>(pDstData, nDstPixelStride, nWordCount);     \
3451
        break;                                                                 \
3452
    }
3453

3454
            CASE_DUPLICATE_SIMPLE(GDT_UInt16, GUInt16)
34,497✔
3455
            CASE_DUPLICATE_SIMPLE(GDT_Int16, GInt16)
202,438✔
3456
            CASE_DUPLICATE_SIMPLE(GDT_UInt32, GUInt32)
56✔
3457
            CASE_DUPLICATE_SIMPLE(GDT_Int32, GInt32)
290,817✔
3458
            CASE_DUPLICATE_SIMPLE(GDT_UInt64, std::uint64_t)
21✔
3459
            CASE_DUPLICATE_SIMPLE(GDT_Int64, std::int64_t)
1,064✔
3460
            CASE_DUPLICATE_SIMPLE(GDT_Float16, GFloat16)
×
3461
            CASE_DUPLICATE_SIMPLE(GDT_Float32, float)
52,524✔
3462
            CASE_DUPLICATE_SIMPLE(GDT_Float64, double)
5,171✔
3463

3464
#define CASE_DUPLICATE_COMPLEX(enum_type, c_type)                              \
3465
    case enum_type:                                                            \
3466
    {                                                                          \
3467
        c_type valSet1 = reinterpret_cast<const c_type *>(pDstData)[0];        \
3468
        c_type valSet2 = reinterpret_cast<const c_type *>(pDstData)[1];        \
3469
        while (nWordCount > 0)                                                 \
3470
        {                                                                      \
3471
            --nWordCount;                                                      \
3472
            reinterpret_cast<c_type *>(pabyDstWord)[0] = valSet1;              \
3473
            reinterpret_cast<c_type *>(pabyDstWord)[1] = valSet2;              \
3474
            pabyDstWord += nDstPixelStride;                                    \
3475
        }                                                                      \
3476
        break;                                                                 \
3477
    }
3478

3479
            CASE_DUPLICATE_COMPLEX(GDT_CInt16, GInt16)
784✔
3480
            CASE_DUPLICATE_COMPLEX(GDT_CInt32, GInt32)
784✔
3481
            CASE_DUPLICATE_COMPLEX(GDT_CFloat16, GFloat16)
×
3482
            CASE_DUPLICATE_COMPLEX(GDT_CFloat32, float)
784✔
3483
            CASE_DUPLICATE_COMPLEX(GDT_CFloat64, double)
784✔
3484

3485
        case GDT_Unknown:
×
3486
        case GDT_TypeCount:
3487
            CPLAssert(false);
×
3488
    }
3489
}
1,006,630✔
3490

3491
/************************************************************************/
3492
/*                        GDALUnrolledCopy()                            */
3493
/************************************************************************/
3494

3495
template <class T, int srcStride, int dstStride>
3496
static inline void GDALUnrolledCopyGeneric(T *CPL_RESTRICT pDest,
3,135,693✔
3497
                                           const T *CPL_RESTRICT pSrc,
3498
                                           GPtrDiff_t nIters)
3499
{
3500
    if (nIters >= 16)
3,135,693✔
3501
    {
3502
        for (GPtrDiff_t i = nIters / 16; i != 0; i--)
135,346,257✔
3503
        {
3504
            pDest[0 * dstStride] = pSrc[0 * srcStride];
132,337,650✔
3505
            pDest[1 * dstStride] = pSrc[1 * srcStride];
132,337,650✔
3506
            pDest[2 * dstStride] = pSrc[2 * srcStride];
132,337,650✔
3507
            pDest[3 * dstStride] = pSrc[3 * srcStride];
132,337,650✔
3508
            pDest[4 * dstStride] = pSrc[4 * srcStride];
132,337,650✔
3509
            pDest[5 * dstStride] = pSrc[5 * srcStride];
132,337,650✔
3510
            pDest[6 * dstStride] = pSrc[6 * srcStride];
132,337,650✔
3511
            pDest[7 * dstStride] = pSrc[7 * srcStride];
132,337,650✔
3512
            pDest[8 * dstStride] = pSrc[8 * srcStride];
132,337,650✔
3513
            pDest[9 * dstStride] = pSrc[9 * srcStride];
132,337,650✔
3514
            pDest[10 * dstStride] = pSrc[10 * srcStride];
132,337,650✔
3515
            pDest[11 * dstStride] = pSrc[11 * srcStride];
132,337,650✔
3516
            pDest[12 * dstStride] = pSrc[12 * srcStride];
132,337,650✔
3517
            pDest[13 * dstStride] = pSrc[13 * srcStride];
132,337,650✔
3518
            pDest[14 * dstStride] = pSrc[14 * srcStride];
132,337,650✔
3519
            pDest[15 * dstStride] = pSrc[15 * srcStride];
132,337,650✔
3520
            pDest += 16 * dstStride;
132,337,650✔
3521
            pSrc += 16 * srcStride;
132,337,650✔
3522
        }
3523
        nIters = nIters % 16;
3,008,601✔
3524
    }
3525
    for (GPtrDiff_t i = 0; i < nIters; i++)
5,377,498✔
3526
    {
3527
        pDest[i * dstStride] = *pSrc;
2,241,815✔
3528
        pSrc += srcStride;
2,241,815✔
3529
    }
3530
}
3,135,693✔
3531

3532
template <class T, int srcStride, int dstStride>
3533
static inline void GDALUnrolledCopy(T *CPL_RESTRICT pDest,
3,129,593✔
3534
                                    const T *CPL_RESTRICT pSrc,
3535
                                    GPtrDiff_t nIters)
3536
{
3537
    GDALUnrolledCopyGeneric<T, srcStride, dstStride>(pDest, pSrc, nIters);
3,129,593✔
3538
}
3,129,623✔
3539

3540
#ifdef HAVE_SSE2
3541

3542
template <>
3543
void GDALUnrolledCopy<GByte, 2, 1>(GByte *CPL_RESTRICT pDest,
352,920✔
3544
                                   const GByte *CPL_RESTRICT pSrc,
3545
                                   GPtrDiff_t nIters)
3546
{
3547
    decltype(nIters) i = 0;
352,920✔
3548
    if (nIters > 16)
352,920✔
3549
    {
3550
        const __m128i xmm_mask = _mm_set1_epi16(0xff);
194,667✔
3551
        // If we were sure that there would always be 1 trailing byte, we could
3552
        // check against nIters - 15
3553
        for (; i < nIters - 16; i += 16)
2,988,110✔
3554
        {
3555
            __m128i xmm0 =
3556
                _mm_loadu_si128(reinterpret_cast<__m128i const *>(pSrc + 0));
2,793,440✔
3557
            __m128i xmm1 =
3558
                _mm_loadu_si128(reinterpret_cast<__m128i const *>(pSrc + 16));
5,586,890✔
3559
            // Set higher 8bit of each int16 packed word to 0
3560
            xmm0 = _mm_and_si128(xmm0, xmm_mask);
2,793,440✔
3561
            xmm1 = _mm_and_si128(xmm1, xmm_mask);
2,793,440✔
3562
            // Pack int16 to uint8 and merge back both vector
3563
            xmm0 = _mm_packus_epi16(xmm0, xmm1);
2,793,440✔
3564

3565
            // Store result
3566
            _mm_storeu_si128(reinterpret_cast<__m128i *>(pDest + i), xmm0);
2,793,440✔
3567

3568
            pSrc += 2 * 16;
2,793,440✔
3569
        }
3570
    }
3571
    for (; i < nIters; i++)
4,619,940✔
3572
    {
3573
        pDest[i] = *pSrc;
4,267,020✔
3574
        pSrc += 2;
4,267,020✔
3575
    }
3576
}
352,920✔
3577

3578
#ifdef HAVE_SSSE3_AT_COMPILE_TIME
3579

3580
template <>
3581
void GDALUnrolledCopy<GByte, 3, 1>(GByte *CPL_RESTRICT pDest,
191,860✔
3582
                                   const GByte *CPL_RESTRICT pSrc,
3583
                                   GPtrDiff_t nIters)
3584
{
3585
    if (nIters > 16 && CPLHaveRuntimeSSSE3())
191,860✔
3586
    {
3587
        GDALUnrolledCopy_GByte_3_1_SSSE3(pDest, pSrc, nIters);
185,760✔
3588
    }
3589
    else
3590
    {
3591
        GDALUnrolledCopyGeneric<GByte, 3, 1>(pDest, pSrc, nIters);
6,100✔
3592
    }
3593
}
191,860✔
3594

3595
#endif
3596

3597
template <>
3598
void GDALUnrolledCopy<GByte, 4, 1>(GByte *CPL_RESTRICT pDest,
106,238✔
3599
                                   const GByte *CPL_RESTRICT pSrc,
3600
                                   GPtrDiff_t nIters)
3601
{
3602
    decltype(nIters) i = 0;
106,238✔
3603
    if (nIters > 16)
106,238✔
3604
    {
3605
        const __m128i xmm_mask = _mm_set1_epi32(0xff);
100,945✔
3606
        // If we were sure that there would always be 3 trailing bytes, we could
3607
        // check against nIters - 15
3608
        for (; i < nIters - 16; i += 16)
9,914,960✔
3609
        {
3610
            __m128i xmm0 =
3611
                _mm_loadu_si128(reinterpret_cast<__m128i const *>(pSrc + 0));
9,813,880✔
3612
            __m128i xmm1 =
3613
                _mm_loadu_si128(reinterpret_cast<__m128i const *>(pSrc + 16));
9,813,880✔
3614
            __m128i xmm2 =
3615
                _mm_loadu_si128(reinterpret_cast<__m128i const *>(pSrc + 32));
9,813,880✔
3616
            __m128i xmm3 =
3617
                _mm_loadu_si128(reinterpret_cast<__m128i const *>(pSrc + 48));
19,627,800✔
3618
            // Set higher 24bit of each int32 packed word to 0
3619
            xmm0 = _mm_and_si128(xmm0, xmm_mask);
9,813,880✔
3620
            xmm1 = _mm_and_si128(xmm1, xmm_mask);
9,813,880✔
3621
            xmm2 = _mm_and_si128(xmm2, xmm_mask);
9,813,880✔
3622
            xmm3 = _mm_and_si128(xmm3, xmm_mask);
9,813,880✔
3623
            // Pack int32 to int16
3624
            xmm0 = _mm_packs_epi32(xmm0, xmm1);
9,813,730✔
3625
            xmm2 = _mm_packs_epi32(xmm2, xmm3);
9,813,780✔
3626
            // Pack int16 to uint8
3627
            xmm0 = _mm_packus_epi16(xmm0, xmm2);
9,814,020✔
3628

3629
            // Store result
3630
            _mm_storeu_si128(reinterpret_cast<__m128i *>(pDest + i), xmm0);
9,814,020✔
3631

3632
            pSrc += 4 * 16;
9,814,020✔
3633
        }
3634
    }
3635
    for (; i < nIters; i++)
1,135,930✔
3636
    {
3637
        pDest[i] = *pSrc;
1,029,550✔
3638
        pSrc += 4;
1,029,550✔
3639
    }
3640
}
106,379✔
3641
#endif  // HAVE_SSE2
3642

3643
/************************************************************************/
3644
/*                         GDALFastCopy()                               */
3645
/************************************************************************/
3646

3647
template <class T>
3648
static inline void GDALFastCopy(T *CPL_RESTRICT pDest, int nDestStride,
40,245,900✔
3649
                                const T *CPL_RESTRICT pSrc, int nSrcStride,
3650
                                GPtrDiff_t nIters)
3651
{
3652
    constexpr int sizeofT = static_cast<int>(sizeof(T));
40,245,900✔
3653
    if (nIters == 1)
40,245,900✔
3654
    {
3655
        *pDest = *pSrc;
22,108,700✔
3656
    }
3657
    else if (nDestStride == sizeofT)
18,137,221✔
3658
    {
3659
        if (nSrcStride == sizeofT)
14,933,058✔
3660
        {
3661
            memcpy(pDest, pSrc, nIters * sizeof(T));
14,076,207✔
3662
        }
3663
        else if (nSrcStride == 2 * sizeofT)
856,917✔
3664
        {
3665
            GDALUnrolledCopy<T, 2, 1>(pDest, pSrc, nIters);
355,879✔
3666
        }
3667
        else if (nSrcStride == 3 * sizeofT)
501,038✔
3668
        {
3669
            GDALUnrolledCopy<T, 3, 1>(pDest, pSrc, nIters);
288,432✔
3670
        }
3671
        else if (nSrcStride == 4 * sizeofT)
212,606✔
3672
        {
3673
            GDALUnrolledCopy<T, 4, 1>(pDest, pSrc, nIters);
110,221✔
3674
        }
3675
        else
3676
        {
3677
            while (nIters-- > 0)
17,218,690✔
3678
            {
3679
                *pDest = *pSrc;
17,116,250✔
3680
                pSrc += nSrcStride / sizeofT;
17,116,250✔
3681
                pDest++;
17,116,250✔
3682
            }
3683
        }
3684
    }
3685
    else if (nSrcStride == sizeofT)
3,204,163✔
3686
    {
3687
        if (nDestStride == 2 * sizeofT)
3,189,696✔
3688
        {
3689
            GDALUnrolledCopy<T, 1, 2>(pDest, pSrc, nIters);
158,669✔
3690
        }
3691
        else if (nDestStride == 3 * sizeofT)
3,031,025✔
3692
        {
3693
            GDALUnrolledCopy<T, 1, 3>(pDest, pSrc, nIters);
2,144,771✔
3694
        }
3695
        else if (nDestStride == 4 * sizeofT)
886,256✔
3696
        {
3697
            GDALUnrolledCopy<T, 1, 4>(pDest, pSrc, nIters);
722,642✔
3698
        }
3699
        else
3700
        {
3701
            while (nIters-- > 0)
17,106,060✔
3702
            {
3703
                *pDest = *pSrc;
16,942,410✔
3704
                pSrc++;
16,942,410✔
3705
                pDest += nDestStride / sizeofT;
16,942,410✔
3706
            }
3707
        }
3708
    }
3709
    else
3710
    {
3711
        while (nIters-- > 0)
1,221,569✔
3712
        {
3713
            *pDest = *pSrc;
1,207,102✔
3714
            pSrc += nSrcStride / sizeofT;
1,207,102✔
3715
            pDest += nDestStride / sizeofT;
1,207,102✔
3716
        }
3717
    }
3718
}
40,246,000✔
3719

3720
/************************************************************************/
3721
/*                         GDALFastCopyByte()                           */
3722
/************************************************************************/
3723

3724
static void GDALFastCopyByte(const GByte *CPL_RESTRICT pSrcData,
326,246✔
3725
                             int nSrcPixelStride, GByte *CPL_RESTRICT pDstData,
3726
                             int nDstPixelStride, GPtrDiff_t nWordCount)
3727
{
3728
    GDALFastCopy(pDstData, nDstPixelStride, pSrcData, nSrcPixelStride,
326,246✔
3729
                 nWordCount);
3730
}
326,246✔
3731

3732
/************************************************************************/
3733
/*                           GDALCopyWords()                            */
3734
/************************************************************************/
3735

3736
/**
3737
 * Copy pixel words from buffer to buffer.
3738
 *
3739
 * @see GDALCopyWords64()
3740
 */
3741
void CPL_STDCALL GDALCopyWords(const void *CPL_RESTRICT pSrcData,
87,037,500✔
3742
                               GDALDataType eSrcType, int nSrcPixelStride,
3743
                               void *CPL_RESTRICT pDstData,
3744
                               GDALDataType eDstType, int nDstPixelStride,
3745
                               int nWordCount)
3746
{
3747
    GDALCopyWords64(pSrcData, eSrcType, nSrcPixelStride, pDstData, eDstType,
87,037,500✔
3748
                    nDstPixelStride, nWordCount);
3749
}
87,031,600✔
3750

3751
/************************************************************************/
3752
/*                          GDALCopyWords64()                           */
3753
/************************************************************************/
3754

3755
/**
3756
 * Copy pixel words from buffer to buffer.
3757
 *
3758
 * This function is used to copy pixel word values from one memory buffer
3759
 * to another, with support for conversion between data types, and differing
3760
 * step factors. The data type conversion is done using the following
3761
 * rules:
3762
 * <ul>
3763
 * <li>Values assigned to a lower range integer type are clipped. For
3764
 * instance assigning GDT_Int16 values to a GDT_Byte buffer will cause values
3765
 * less than 0 to be set to 0, and values larger than 255 to be set to 255.
3766
 * </li>
3767
 * <li>
3768
 * Assignment from floating point to integer rounds to closest integer.
3769
 * +Infinity is mapped to the largest integer. -Infinity is mapped to the
3770
 * smallest integer. NaN is mapped to 0.
3771
 * </li>
3772
 * <li>
3773
 * Assignment from non-complex to complex will result in the imaginary part
3774
 * being set to zero on output.
3775
 * </li>
3776
 * <li> Assignment from complex to
3777
 * non-complex will result in the complex portion being lost and the real
3778
 * component being preserved (<i>not magnitude!</i>).
3779
 * </li>
3780
 * </ul>
3781
 *
3782
 * No assumptions are made about the source or destination words occurring
3783
 * on word boundaries.  It is assumed that all values are in native machine
3784
 * byte order.
3785
 *
3786
 * @param pSrcData Pointer to source data to be converted.
3787
 * @param eSrcType the source data type (see GDALDataType enum)
3788
 * @param nSrcPixelStride Source pixel stride (i.e. distance between 2 words),
3789
 * in bytes
3790
 * @param pDstData Pointer to buffer where destination data should go
3791
 * @param eDstType the destination data type (see GDALDataType enum)
3792
 * @param nDstPixelStride Destination pixel stride (i.e. distance between 2
3793
 * words), in bytes
3794
 * @param nWordCount number of words to be copied
3795
 *
3796
 * @note
3797
 * When adding a new data type to GDAL, you must do the following to
3798
 * support it properly within the GDALCopyWords function:
3799
 * 1. Add the data type to the switch on eSrcType in GDALCopyWords.
3800
 *    This should invoke the appropriate GDALCopyWordsFromT wrapper.
3801
 * 2. Add the data type to the switch on eDstType in GDALCopyWordsFromT.
3802
 *    This should call the appropriate GDALCopyWordsT template.
3803
 * 3. If appropriate, overload the appropriate CopyWord template in the
3804
 *    above namespace. This will ensure that any conversion issues are
3805
 *    handled (cases like the float -> int32 case, where the min/max
 *    values are subject to roundoff error).
3807
 */
3808

3809
void CPL_STDCALL GDALCopyWords64(const void *CPL_RESTRICT pSrcData,
109,031,000✔
3810
                                 GDALDataType eSrcType, int nSrcPixelStride,
3811
                                 void *CPL_RESTRICT pDstData,
3812
                                 GDALDataType eDstType, int nDstPixelStride,
3813
                                 GPtrDiff_t nWordCount)
3814

3815
{
3816
    // On platforms where alignment matters, be careful
3817
    const int nSrcDataTypeSize = GDALGetDataTypeSizeBytes(eSrcType);
109,031,000✔
3818
    const int nDstDataTypeSize = GDALGetDataTypeSizeBytes(eDstType);
108,973,000✔
3819
    if (CPL_UNLIKELY(nSrcDataTypeSize == 0 || nDstDataTypeSize == 0))
108,929,000✔
3820
    {
3821
        CPLError(CE_Failure, CPLE_NotSupported,
2✔
3822
                 "GDALCopyWords64(): unsupported GDT_Unknown/GDT_TypeCount "
3823
                 "argument");
3824
        return;
2✔
3825
    }
3826
    if (!(eSrcType == eDstType && nSrcPixelStride == nDstPixelStride) &&
108,929,000✔
3827
        ((reinterpret_cast<uintptr_t>(pSrcData) % nSrcDataTypeSize) != 0 ||
58,710,200✔
3828
         (reinterpret_cast<uintptr_t>(pDstData) % nDstDataTypeSize) != 0 ||
58,736,300✔
3829
         (nSrcPixelStride % nSrcDataTypeSize) != 0 ||
58,732,600✔
3830
         (nDstPixelStride % nDstDataTypeSize) != 0))
58,735,700✔
3831
    {
3832
        if (eSrcType == eDstType)
905✔
3833
        {
3834
            for (decltype(nWordCount) i = 0; i < nWordCount; i++)
34,800✔
3835
            {
3836
                memcpy(static_cast<GByte *>(pDstData) + nDstPixelStride * i,
34,000✔
3837
                       static_cast<const GByte *>(pSrcData) +
3838
                           nSrcPixelStride * i,
34,000✔
3839
                       nDstDataTypeSize);
3840
            }
3841
        }
3842
        else
3843
        {
3844
            const auto getAlignedPtr = [](GByte *ptr, int align)
210✔
3845
            {
3846
                return ptr +
3847
                       ((align - (reinterpret_cast<uintptr_t>(ptr) % align)) %
210✔
3848
                        align);
210✔
3849
            };
3850

3851
            // The largest we need is for CFloat64 (16 bytes), so 32 bytes to
3852
            // be sure to get correctly aligned pointer.
3853
            constexpr size_t SIZEOF_CFLOAT64 = 2 * sizeof(double);
105✔
3854
            GByte abySrcBuffer[2 * SIZEOF_CFLOAT64];
3855
            GByte abyDstBuffer[2 * SIZEOF_CFLOAT64];
3856
            GByte *pabySrcBuffer =
3857
                getAlignedPtr(abySrcBuffer, nSrcDataTypeSize);
105✔
3858
            GByte *pabyDstBuffer =
3859
                getAlignedPtr(abyDstBuffer, nDstDataTypeSize);
105✔
3860
            for (decltype(nWordCount) i = 0; i < nWordCount; i++)
3,360✔
3861
            {
3862
                memcpy(pabySrcBuffer,
3,255✔
3863
                       static_cast<const GByte *>(pSrcData) +
3864
                           nSrcPixelStride * i,
3,255✔
3865
                       nSrcDataTypeSize);
3866
                GDALCopyWords64(pabySrcBuffer, eSrcType, 0, pabyDstBuffer,
3,255✔
3867
                                eDstType, 0, 1);
3868
                memcpy(static_cast<GByte *>(pDstData) + nDstPixelStride * i,
3,255✔
3869
                       pabyDstBuffer, nDstDataTypeSize);
3870
            }
3871
        }
3872
        return;
905✔
3873
    }
3874

3875
    // Deal with the case where we're replicating a single word into the
3876
    // provided buffer
3877
    if (nSrcPixelStride == 0 && nWordCount > 1)
108,928,000✔
3878
    {
3879
        GDALReplicateWord(pSrcData, eSrcType, pDstData, eDstType,
999,654✔
3880
                          nDstPixelStride, nWordCount);
3881
        return;
1,006,140✔
3882
    }
3883

3884
    if (eSrcType == eDstType)
107,928,000✔
3885
    {
3886
        if (eSrcType == GDT_Byte || eSrcType == GDT_Int8)
54,204,600✔
3887
        {
3888
            GDALFastCopy(static_cast<GByte *>(pDstData), nDstPixelStride,
19,112,600✔
3889
                         static_cast<const GByte *>(pSrcData), nSrcPixelStride,
3890
                         nWordCount);
3891
            return;
19,111,700✔
3892
        }
3893

3894
        if (nSrcDataTypeSize == 2 && (nSrcPixelStride % 2) == 0 &&
35,092,000✔
3895
            (nDstPixelStride % 2) == 0)
20,808,500✔
3896
        {
3897
            GDALFastCopy(static_cast<short *>(pDstData), nDstPixelStride,
20,808,500✔
3898
                         static_cast<const short *>(pSrcData), nSrcPixelStride,
3899
                         nWordCount);
3900
            return;
20,808,500✔
3901
        }
3902

3903
        if (nWordCount == 1)
14,283,500✔
3904
        {
3905
#if defined(CSA_BUILD) || defined(__COVERITY__)
3906
            // Avoid false positives...
3907
            memcpy(pDstData, pSrcData, nSrcDataTypeSize);
3908
#else
3909
            if (nSrcDataTypeSize == 2)
13,892,800✔
3910
                memcpy(pDstData, pSrcData, 2);
×
3911
            else if (nSrcDataTypeSize == 4)
13,892,800✔
3912
                memcpy(pDstData, pSrcData, 4);
13,809,100✔
3913
            else if (nSrcDataTypeSize == 8)
83,682✔
3914
                memcpy(pDstData, pSrcData, 8);
67,162✔
3915
            else /* if( eSrcType == GDT_CFloat64 ) */
3916
                memcpy(pDstData, pSrcData, 16);
16,520✔
3917
#endif
3918
            return;
13,892,800✔
3919
        }
3920

3921
        // Let memcpy() handle the case where we're copying a packed buffer
3922
        // of pixels.
3923
        if (nSrcPixelStride == nDstPixelStride)
390,706✔
3924
        {
3925
            if (nSrcPixelStride == nSrcDataTypeSize)
262,459✔
3926
            {
3927
                memcpy(pDstData, pSrcData, nWordCount * nSrcDataTypeSize);
262,391✔
3928
                return;
262,391✔
3929
            }
3930
        }
3931
    }
3932

3933
    // Handle the more general case -- deals with conversion of data types
3934
    // directly.
3935
    switch (eSrcType)
53,852,000✔
3936
    {
3937
        case GDT_Byte:
15,438,400✔
3938
            GDALCopyWordsFromT<unsigned char>(
15,438,400✔
3939
                static_cast<const unsigned char *>(pSrcData), nSrcPixelStride,
3940
                false, pDstData, eDstType, nDstPixelStride, nWordCount);
3941
            break;
15,489,100✔
3942
        case GDT_Int8:
1,225✔
3943
            GDALCopyWordsFromT<signed char>(
1,225✔
3944
                static_cast<const signed char *>(pSrcData), nSrcPixelStride,
3945
                false, pDstData, eDstType, nDstPixelStride, nWordCount);
3946
            break;
1,225✔
3947
        case GDT_UInt16:
53,322✔
3948
            GDALCopyWordsFromT<unsigned short>(
53,322✔
3949
                static_cast<const unsigned short *>(pSrcData), nSrcPixelStride,
3950
                false, pDstData, eDstType, nDstPixelStride, nWordCount);
3951
            break;
53,322✔
3952
        case GDT_Int16:
4,350,230✔
3953
            GDALCopyWordsFromT<short>(static_cast<const short *>(pSrcData),
4,350,230✔
3954
                                      nSrcPixelStride, false, pDstData,
3955
                                      eDstType, nDstPixelStride, nWordCount);
3956
            break;
4,350,240✔
3957
        case GDT_UInt32:
7,066✔
3958
            GDALCopyWordsFromT<unsigned int>(
7,066✔
3959
                static_cast<const unsigned int *>(pSrcData), nSrcPixelStride,
3960
                false, pDstData, eDstType, nDstPixelStride, nWordCount);
3961
            break;
7,066✔
3962
        case GDT_Int32:
12,255,000✔
3963
            GDALCopyWordsFromT<int>(static_cast<const int *>(pSrcData),
12,255,000✔
3964
                                    nSrcPixelStride, false, pDstData, eDstType,
3965
                                    nDstPixelStride, nWordCount);
3966
            break;
12,255,000✔
3967
        case GDT_UInt64:
1,635✔
3968
            GDALCopyWordsFromT<std::uint64_t>(
1,635✔
3969
                static_cast<const std::uint64_t *>(pSrcData), nSrcPixelStride,
3970
                false, pDstData, eDstType, nDstPixelStride, nWordCount);
3971
            break;
1,635✔
3972
        case GDT_Int64:
10,978✔
3973
            GDALCopyWordsFromT<std::int64_t>(
10,978✔
3974
                static_cast<const std::int64_t *>(pSrcData), nSrcPixelStride,
3975
                false, pDstData, eDstType, nDstPixelStride, nWordCount);
3976
            break;
10,978✔
3977
        case GDT_Float16:
1,074✔
3978
            GDALCopyWordsFromT<GFloat16>(
1,074✔
3979
                static_cast<const GFloat16 *>(pSrcData), nSrcPixelStride, false,
3980
                pDstData, eDstType, nDstPixelStride, nWordCount);
3981
            break;
1,074✔
3982
        case GDT_Float32:
395,147✔
3983
            GDALCopyWordsFromT<float>(static_cast<const float *>(pSrcData),
395,147✔
3984
                                      nSrcPixelStride, false, pDstData,
3985
                                      eDstType, nDstPixelStride, nWordCount);
3986
            break;
395,149✔
3987
        case GDT_Float64:
20,665,500✔
3988
            GDALCopyWordsFromT<double>(static_cast<const double *>(pSrcData),
20,665,500✔
3989
                                       nSrcPixelStride, false, pDstData,
3990
                                       eDstType, nDstPixelStride, nWordCount);
3991
            break;
20,665,600✔
3992
        case GDT_CInt16:
478,131✔
3993
            GDALCopyWordsFromT<short>(static_cast<const short *>(pSrcData),
478,131✔
3994
                                      nSrcPixelStride, true, pDstData, eDstType,
3995
                                      nDstPixelStride, nWordCount);
3996
            break;
478,131✔
3997
        case GDT_CInt32:
546✔
3998
            GDALCopyWordsFromT<int>(static_cast<const int *>(pSrcData),
546✔
3999
                                    nSrcPixelStride, true, pDstData, eDstType,
4000
                                    nDstPixelStride, nWordCount);
4001
            break;
546✔
4002
        case GDT_CFloat16:
396✔
4003
            GDALCopyWordsFromT<GFloat16>(
396✔
4004
                static_cast<const GFloat16 *>(pSrcData), nSrcPixelStride, true,
4005
                pDstData, eDstType, nDstPixelStride, nWordCount);
4006
            break;
396✔
4007
        case GDT_CFloat32:
1,537✔
4008
            GDALCopyWordsFromT<float>(static_cast<const float *>(pSrcData),
1,537✔
4009
                                      nSrcPixelStride, true, pDstData, eDstType,
4010
                                      nDstPixelStride, nWordCount);
4011
            break;
1,537✔
4012
        case GDT_CFloat64:
276,380✔
4013
            GDALCopyWordsFromT<double>(static_cast<const double *>(pSrcData),
276,380✔
4014
                                       nSrcPixelStride, true, pDstData,
4015
                                       eDstType, nDstPixelStride, nWordCount);
4016
            break;
276,380✔
4017
        case GDT_Unknown:
×
4018
        case GDT_TypeCount:
4019
            CPLAssert(false);
×
4020
    }
4021
}
4022

4023
/************************************************************************/
4024
/*                            GDALCopyBits()                            */
4025
/************************************************************************/
4026

4027
/**
4028
 * Bitwise word copying.
4029
 *
4030
 * A function for moving sets of partial bytes around.  Loosely
4031
 * speaking this is a bitwise analog to GDALCopyWords().
4032
 *
4033
 * It copies nStepCount "words" where each word is nBitCount bits long.
4034
 * The nSrcStep and nDstStep are the number of bits from the start of one
4035
 * word to the next (same as nBitCount if they are packed).  The nSrcOffset
4036
 * and nDstOffset are the offset into the source and destination buffers
4037
 * to start at, also measured in bits.
4038
 *
4039
 * All bit offsets are assumed to start from the high order bit in a byte
4040
 * (i.e. most significant bit first).  Currently this function is not very
4041
 * optimized, but it may be improved for some common cases in the future
4042
 * as needed.
4043
 *
4044
 * @param pabySrcData the source data buffer.
4045
 * @param nSrcOffset the offset (in bits) in pabySrcData to the start of the
4046
 * first word to copy.
4047
 * @param nSrcStep the offset in bits from the start one source word to the
4048
 * start of the next.
4049
 * @param pabyDstData the destination data buffer.
4050
 * @param nDstOffset the offset (in bits) in pabyDstData to the start of the
4051
 * first word to copy over.
4052
 * @param nDstStep the offset in bits from the start one word to the
4053
 * start of the next.
4054
 * @param nBitCount the number of bits in a word to be copied.
4055
 * @param nStepCount the number of words to copy.
4056
 */
4057

4058
void GDALCopyBits(const GByte *pabySrcData, int nSrcOffset, int nSrcStep,
×
4059
                  GByte *pabyDstData, int nDstOffset, int nDstStep,
4060
                  int nBitCount, int nStepCount)
4061

4062
{
4063
    VALIDATE_POINTER0(pabySrcData, "GDALCopyBits");
×
4064

4065
    for (int iStep = 0; iStep < nStepCount; iStep++)
×
4066
    {
4067
        for (int iBit = 0; iBit < nBitCount; iBit++)
×
4068
        {
4069
            if (pabySrcData[nSrcOffset >> 3] & (0x80 >> (nSrcOffset & 7)))
×
4070
                pabyDstData[nDstOffset >> 3] |= (0x80 >> (nDstOffset & 7));
×
4071
            else
4072
                pabyDstData[nDstOffset >> 3] &= ~(0x80 >> (nDstOffset & 7));
×
4073

4074
            nSrcOffset++;
×
4075
            nDstOffset++;
×
4076
        }
4077

4078
        nSrcOffset += (nSrcStep - nBitCount);
×
4079
        nDstOffset += (nDstStep - nBitCount);
×
4080
    }
4081
}
4082

4083
/************************************************************************/
4084
/*                  GDALBandGetBestOverviewLevel()                      */
4085
/*                                                                      */
4086
/* Returns the best overview level to satisfy the query or -1 if none   */
4087
/* Also updates nXOff, nYOff, nXSize, nYSize and psExtraArg when        */
4088
/* returning a valid overview level                                     */
4089
/************************************************************************/
4090

4091
// Variant of GDALBandGetBestOverviewLevel2() for callers that have no
// GDALRasterIOExtraArg: it forwards every argument unchanged and passes a
// null extra-argument pointer.
int GDALBandGetBestOverviewLevel(GDALRasterBand *poBand, int &nXOff, int &nYOff,
                                 int &nXSize, int &nYSize, int nBufXSize,
                                 int nBufYSize)
{
    const int nOvrLevel = GDALBandGetBestOverviewLevel2(
        poBand, nXOff, nYOff, nXSize, nYSize, nBufXSize, nBufYSize, nullptr);
    return nOvrLevel;
}
4098

4099
int GDALBandGetBestOverviewLevel2(GDALRasterBand *poBand, int &nXOff,
523,804✔
4100
                                  int &nYOff, int &nXSize, int &nYSize,
4101
                                  int nBufXSize, int nBufYSize,
4102
                                  GDALRasterIOExtraArg *psExtraArg)
4103
{
4104
    if (psExtraArg != nullptr && psExtraArg->nVersion > 1 &&
523,804✔
4105
        psExtraArg->bUseOnlyThisScale)
523,804✔
4106
        return -1;
109✔
4107
    /* -------------------------------------------------------------------- */
4108
    /*      Compute the desired downsampling factor.  It is                 */
4109
    /*      based on the least reduced axis, and represents the number      */
4110
    /*      of source pixels to one destination pixel.                      */
4111
    /* -------------------------------------------------------------------- */
4112
    const double dfDesiredDownsamplingFactor =
523,695✔
4113
        ((nXSize / static_cast<double>(nBufXSize)) <
523,695✔
4114
             (nYSize / static_cast<double>(nBufYSize)) ||
361,357✔
4115
         nBufYSize == 1)
4116
            ? nXSize / static_cast<double>(nBufXSize)
752,080✔
4117
            : nYSize / static_cast<double>(nBufYSize);
132,972✔
4118

4119
    /* -------------------------------------------------------------------- */
4120
    /*      Find the overview level that largest downsampling factor (most  */
4121
    /*      downsampled) that is still less than (or only a little more)    */
4122
    /*      downsampled than the request.                                   */
4123
    /* -------------------------------------------------------------------- */
4124
    const int nOverviewCount = poBand->GetOverviewCount();
523,695✔
4125
    GDALRasterBand *poBestOverview = nullptr;
523,695✔
4126
    double dfBestDownsamplingFactor = 0;
523,695✔
4127
    int nBestOverviewLevel = -1;
523,695✔
4128

4129
    const char *pszOversampligThreshold =
4130
        CPLGetConfigOption("GDAL_OVERVIEW_OVERSAMPLING_THRESHOLD", nullptr);
523,695✔
4131

4132
    // Note: keep this logic for overview selection in sync between
4133
    // gdalwarp_lib.cpp and rasterio.cpp
4134
    // Cf https://github.com/OSGeo/gdal/pull/9040#issuecomment-1898524693
4135
    const double dfOversamplingThreshold =
4136
        pszOversampligThreshold ? CPLAtof(pszOversampligThreshold)
1,047,380✔
4137
        : psExtraArg && psExtraArg->eResampleAlg != GRIORA_NearestNeighbour
523,686✔
4138
            ? 1.0
1,047,370✔
4139
            : 1.2;
523,695✔
4140
    for (int iOverview = 0; iOverview < nOverviewCount; iOverview++)
526,390✔
4141
    {
4142
        GDALRasterBand *poOverview = poBand->GetOverview(iOverview);
5,547✔
4143
        if (poOverview == nullptr ||
11,094✔
4144
            poOverview->GetXSize() > poBand->GetXSize() ||
11,093✔
4145
            poOverview->GetYSize() > poBand->GetYSize())
5,546✔
4146
        {
4147
            continue;
1✔
4148
        }
4149

4150
        // Compute downsampling factor of this overview
4151
        const double dfDownsamplingFactor = std::min(
4152
            poBand->GetXSize() / static_cast<double>(poOverview->GetXSize()),
5,546✔
4153
            poBand->GetYSize() / static_cast<double>(poOverview->GetYSize()));
11,092✔
4154

4155
        // Is it nearly the requested factor and better (lower) than
4156
        // the current best factor?
4157
        // Use an epsilon because of numerical instability.
4158
        constexpr double EPSILON = 1e-1;
5,546✔
4159
        if (dfDownsamplingFactor >=
5,654✔
4160
                dfDesiredDownsamplingFactor * dfOversamplingThreshold +
5,546✔
4161
                    EPSILON ||
5,438✔
4162
            dfDownsamplingFactor <= dfBestDownsamplingFactor)
4163
        {
4164
            continue;
108✔
4165
        }
4166

4167
        // Ignore AVERAGE_BIT2GRAYSCALE overviews for RasterIO purposes.
4168
        const char *pszResampling = poOverview->GetMetadataItem("RESAMPLING");
5,438✔
4169

4170
        if (pszResampling != nullptr &&
5,438✔
4171
            STARTS_WITH_CI(pszResampling, "AVERAGE_BIT2"))
71✔
4172
            continue;
16✔
4173

4174
        // OK, this is our new best overview.
4175
        poBestOverview = poOverview;
5,422✔
4176
        nBestOverviewLevel = iOverview;
5,422✔
4177
        dfBestDownsamplingFactor = dfDownsamplingFactor;
5,422✔
4178

4179
        if (std::abs(dfDesiredDownsamplingFactor - dfDownsamplingFactor) <
5,422✔
4180
            EPSILON)
4181
        {
4182
            break;
2,852✔
4183
        }
4184
    }
4185

4186
    /* -------------------------------------------------------------------- */
4187
    /*      If we didn't find an overview that helps us, just return        */
4188
    /*      indicating failure and the full resolution image will be used.  */
4189
    /* -------------------------------------------------------------------- */
4190
    if (nBestOverviewLevel < 0)
523,695✔
4191
        return -1;
520,771✔
4192

4193
    /* -------------------------------------------------------------------- */
4194
    /*      Recompute the source window in terms of the selected            */
4195
    /*      overview.                                                       */
4196
    /* -------------------------------------------------------------------- */
4197
    const double dfXFactor =
4198
        poBand->GetXSize() / static_cast<double>(poBestOverview->GetXSize());
2,924✔
4199
    const double dfYFactor =
4200
        poBand->GetYSize() / static_cast<double>(poBestOverview->GetYSize());
2,924✔
4201
    CPLDebug("GDAL", "Selecting overview %d x %d", poBestOverview->GetXSize(),
2,924✔
4202
             poBestOverview->GetYSize());
4203

4204
    const int nOXOff = std::min(poBestOverview->GetXSize() - 1,
8,772✔
4205
                                static_cast<int>(nXOff / dfXFactor + 0.5));
2,924✔
4206
    const int nOYOff = std::min(poBestOverview->GetYSize() - 1,
8,772✔
4207
                                static_cast<int>(nYOff / dfYFactor + 0.5));
2,924✔
4208
    int nOXSize = std::max(1, static_cast<int>(nXSize / dfXFactor + 0.5));
2,924✔
4209
    int nOYSize = std::max(1, static_cast<int>(nYSize / dfYFactor + 0.5));
2,924✔
4210
    if (nOXOff + nOXSize > poBestOverview->GetXSize())
2,924✔
4211
        nOXSize = poBestOverview->GetXSize() - nOXOff;
×
4212
    if (nOYOff + nOYSize > poBestOverview->GetYSize())
2,924✔
4213
        nOYSize = poBestOverview->GetYSize() - nOYOff;
2✔
4214

4215
    if (psExtraArg)
2,924✔
4216
    {
4217
        if (psExtraArg->bFloatingPointWindowValidity)
2,924✔
4218
        {
4219
            psExtraArg->dfXOff /= dfXFactor;
50✔
4220
            psExtraArg->dfXSize /= dfXFactor;
50✔
4221
            psExtraArg->dfYOff /= dfYFactor;
50✔
4222
            psExtraArg->dfYSize /= dfYFactor;
50✔
4223
        }
4224
        else if (psExtraArg->eResampleAlg != GRIORA_NearestNeighbour)
2,874✔
4225
        {
4226
            psExtraArg->bFloatingPointWindowValidity = true;
16✔
4227
            psExtraArg->dfXOff = nXOff / dfXFactor;
16✔
4228
            psExtraArg->dfXSize = nXSize / dfXFactor;
16✔
4229
            psExtraArg->dfYOff = nYOff / dfYFactor;
16✔
4230
            psExtraArg->dfYSize = nYSize / dfYFactor;
16✔
4231
        }
4232
    }
4233

4234
    nXOff = nOXOff;
2,924✔
4235
    nYOff = nOYOff;
2,924✔
4236
    nXSize = nOXSize;
2,924✔
4237
    nYSize = nOYSize;
2,924✔
4238

4239
    return nBestOverviewLevel;
2,924✔
4240
}
4241

4242
/************************************************************************/
4243
/*                          OverviewRasterIO()                          */
4244
/*                                                                      */
4245
/*      Special work function to utilize available overviews to         */
4246
/*      more efficiently satisfy downsampled requests.  It will         */
4247
/*      return CE_Failure if there are no appropriate overviews         */
4248
/*      available but it doesn't emit any error messages.               */
4249
/************************************************************************/
4250

4251
//! @cond Doxygen_Suppress
4252
CPLErr GDALRasterBand::OverviewRasterIO(
    GDALRWFlag eRWFlag, int nXOff, int nYOff, int nXSize, int nYSize,
    void *pData, int nBufXSize, int nBufYSize, GDALDataType eBufType,
    GSpacing nPixelSpace, GSpacing nLineSpace, GDALRasterIOExtraArg *psExtraArg)

{
    // The overview selection may rewrite the extra arguments, so it is
    // given a local copy rather than the caller's structure.
    GDALRasterIOExtraArg sExtraArg;
    GDALCopyRasterIOExtraArg(&sExtraArg, psExtraArg);

    // On success the window arguments are re-expressed in the pixel space
    // of the selected overview.
    const int nOverview = GDALBandGetBestOverviewLevel2(
        this, nXOff, nYOff, nXSize, nYSize, nBufXSize, nBufYSize, &sExtraArg);

    /* -------------------------------------------------------------------- */
    /*      Fail (without emitting an error) if no overview was selected    */
    /*      or if the selected one cannot be fetched.                       */
    /* -------------------------------------------------------------------- */
    GDALRasterBand *poOverviewBand =
        nOverview >= 0 ? GetOverview(nOverview) : nullptr;
    if (poOverviewBand == nullptr)
        return CE_Failure;

    /* -------------------------------------------------------------------- */
    /*      Redirect the request to the overview band.                      */
    /* -------------------------------------------------------------------- */
    return poOverviewBand->RasterIO(eRWFlag, nXOff, nYOff, nXSize, nYSize,
                                    pData, nBufXSize, nBufYSize, eBufType,
                                    nPixelSpace, nLineSpace, &sExtraArg);
}
4277

4278
/************************************************************************/
4279
/*                      TryOverviewRasterIO()                           */
4280
/************************************************************************/
4281

4282
CPLErr GDALRasterBand::TryOverviewRasterIO(
    GDALRWFlag eRWFlag, int nXOff, int nYOff, int nXSize, int nYSize,
    void *pData, int nBufXSize, int nBufYSize, GDALDataType eBufType,
    GSpacing nPixelSpace, GSpacing nLineSpace, GDALRasterIOExtraArg *psExtraArg,
    int *pbTried)
{
    // The overview selection works on copies of the extra arguments and of
    // the window.
    GDALRasterIOExtraArg sExtraArg;
    GDALCopyRasterIOExtraArg(&sExtraArg, psExtraArg);

    int nOvrXOff = nXOff;
    int nOvrYOff = nYOff;
    int nOvrXSize = nXSize;
    int nOvrYSize = nYSize;

    const int iOvrLevel = GDALBandGetBestOverviewLevel2(
        this, nOvrXOff, nOvrYOff, nOvrXSize, nOvrYSize, nBufXSize, nBufYSize,
        &sExtraArg);

    GDALRasterBand *poOverviewBand =
        iOvrLevel >= 0 ? GetOverview(iOvrLevel) : nullptr;
    if (poOverviewBand == nullptr)
    {
        // No usable overview: report that nothing was attempted.
        *pbTried = FALSE;
        return CE_None;
    }

    // Serve the request from the overview, using the window expressed in
    // overview pixels.
    *pbTried = TRUE;
    return poOverviewBand->RasterIO(eRWFlag, nOvrXOff, nOvrYOff, nOvrXSize,
                                    nOvrYSize, pData, nBufXSize, nBufYSize,
                                    eBufType, nPixelSpace, nLineSpace,
                                    &sExtraArg);
}
4316

4317
/************************************************************************/
4318
/*                      TryOverviewRasterIO()                           */
4319
/************************************************************************/
4320

4321
// Attempts to serve a dataset-level RasterIO() request from an overview
// dataset.
//
// The overview level is chosen from the overviews of the first band
// (papoBands[0]).  When an overview band with an attached dataset is found,
// *pbTried is set to TRUE and the result of RasterIO() on that dataset is
// returned.  Otherwise *pbTried is set to FALSE and CE_None is returned.
CPLErr GDALDataset::TryOverviewRasterIO(
    GDALRWFlag eRWFlag, int nXOff, int nYOff, int nXSize, int nYSize,
    void *pData, int nBufXSize, int nBufYSize, GDALDataType eBufType,
    int nBandCount, const int *panBandMap, GSpacing nPixelSpace,
    GSpacing nLineSpace, GSpacing nBandSpace, GDALRasterIOExtraArg *psExtraArg,
    int *pbTried)
{
    // The overview selection rewrites the window and the extra arguments,
    // so it is given copies of both.
    int nXOffMod = nXOff;
    int nYOffMod = nYOff;
    int nXSizeMod = nXSize;
    int nYSizeMod = nYSize;
    GDALRasterIOExtraArg sExtraArg;
    GDALCopyRasterIOExtraArg(&sExtraArg, psExtraArg);

    const int iOvrLevel = GDALBandGetBestOverviewLevel2(
        papoBands[0], nXOffMod, nYOffMod, nXSizeMod, nYSizeMod, nBufXSize,
        nBufYSize, &sExtraArg);

    // Look up the overview band and its dataset once, and reuse the
    // pointers: the pointer that is null-checked is the one dereferenced.
    GDALRasterBand *poOverviewBand =
        iOvrLevel >= 0 ? papoBands[0]->GetOverview(iOvrLevel) : nullptr;
    GDALDataset *poOverviewDS =
        poOverviewBand != nullptr ? poOverviewBand->GetDataset() : nullptr;

    if (poOverviewDS == nullptr)
    {
        *pbTried = FALSE;
        return CE_None;
    }

    *pbTried = TRUE;
    return poOverviewDS->RasterIO(
        eRWFlag, nXOffMod, nYOffMod, nXSizeMod, nYSizeMod, pData, nBufXSize,
        nBufYSize, eBufType, nBandCount, panBandMap, nPixelSpace, nLineSpace,
        nBandSpace, &sExtraArg);
}
4354

4355
/************************************************************************/
4356
/*                  GDALDatasetGetBestOverviewLevel()                   */
4357
/*                                                                      */
4358
/* Returns the best overview level to satisfy the query or -1 if none   */
4359
/* Also updates nXOff, nYOff, nXSize, nYSize when returning a valid     */
4360
/* overview level                                                       */
4361
/************************************************************************/
4362

4363
static int GDALDatasetGetBestOverviewLevel(GDALDataset *poDS, int &nXOff,
4✔
4364
                                           int &nYOff, int &nXSize, int &nYSize,
4365
                                           int nBufXSize, int nBufYSize,
4366
                                           int nBandCount,
4367
                                           const int *panBandMap,
4368
                                           GDALRasterIOExtraArg *psExtraArg)
4369
{
4370
    int nOverviewCount = 0;
4✔
4371
    GDALRasterBand *poFirstBand = nullptr;
4✔
4372

4373
    /* -------------------------------------------------------------------- */
4374
    /* Check that all bands have the same number of overviews and           */
4375
    /* that they have all the same size and block dimensions                */
4376
    /* -------------------------------------------------------------------- */
4377
    for (int iBand = 0; iBand < nBandCount; iBand++)
12✔
4378
    {
4379
        GDALRasterBand *poBand = poDS->GetRasterBand(panBandMap[iBand]);
8✔
4380
        if (poBand == nullptr)
8✔
4381
            return -1;
×
4382
        if (iBand == 0)
8✔
4383
        {
4384
            poFirstBand = poBand;
4✔
4385
            nOverviewCount = poBand->GetOverviewCount();
4✔
4386
        }
4387
        else if (nOverviewCount != poBand->GetOverviewCount())
4✔
4388
        {
4389
            CPLDebug("GDAL", "GDALDataset::GetBestOverviewLevel() ... "
×
4390
                             "mismatched overview count, use std method.");
4391
            return -1;
×
4392
        }
4393
        else
4394
        {
4395
            for (int iOverview = 0; iOverview < nOverviewCount; iOverview++)
4✔
4396
            {
4397
                GDALRasterBand *poOvrBand = poBand->GetOverview(iOverview);
×
4398
                GDALRasterBand *poOvrFirstBand =
4399
                    poFirstBand->GetOverview(iOverview);
×
4400
                if (poOvrBand == nullptr || poOvrFirstBand == nullptr)
×
4401
                    continue;
×
4402

4403
                if (poOvrFirstBand->GetXSize() != poOvrBand->GetXSize() ||
×
4404
                    poOvrFirstBand->GetYSize() != poOvrBand->GetYSize())
×
4405
                {
4406
                    CPLDebug("GDAL",
×
4407
                             "GDALDataset::GetBestOverviewLevel() ... "
4408
                             "mismatched overview sizes, use std method.");
4409
                    return -1;
×
4410
                }
4411
                int nBlockXSizeFirst = 0;
×
4412
                int nBlockYSizeFirst = 0;
×
4413
                poOvrFirstBand->GetBlockSize(&nBlockXSizeFirst,
×
4414
                                             &nBlockYSizeFirst);
4415

4416
                int nBlockXSizeCurrent = 0;
×
4417
                int nBlockYSizeCurrent = 0;
×
4418
                poOvrBand->GetBlockSize(&nBlockXSizeCurrent,
×
4419
                                        &nBlockYSizeCurrent);
4420

4421
                if (nBlockXSizeFirst != nBlockXSizeCurrent ||
×
4422
                    nBlockYSizeFirst != nBlockYSizeCurrent)
×
4423
                {
4424
                    CPLDebug("GDAL", "GDALDataset::GetBestOverviewLevel() ... "
×
4425
                                     "mismatched block sizes, use std method.");
4426
                    return -1;
×
4427
                }
4428
            }
4429
        }
4430
    }
4431
    if (poFirstBand == nullptr)
4✔
4432
        return -1;
×
4433

4434
    return GDALBandGetBestOverviewLevel2(poFirstBand, nXOff, nYOff, nXSize,
4✔
4435
                                         nYSize, nBufXSize, nBufYSize,
4436
                                         psExtraArg);
4✔
4437
}
4438

4439
/************************************************************************/
4440
/*                         BlockBasedRasterIO()                         */
4441
/*                                                                      */
4442
/*      This convenience function implements a dataset level            */
4443
/*      RasterIO() interface based on calling down to fetch blocks,     */
4444
/*      much like the GDALRasterBand::IRasterIO(), but it handles       */
4445
/*      all bands at once, so that a format driver that handles a       */
4446
/*      request for different bands of the same block efficiently       */
4447
/*      (i.e. without re-reading interleaved data) will perform well.   */
4448
/*                                                                      */
4449
/*      This method is intended to be called by an overridden           */
4450
/*      IRasterIO() method in the driver specific GDALDataset           */
4451
/*      derived class.                                                  */
4452
/*                                                                      */
4453
/*      Default internal implementation of RasterIO() ... utilizes      */
4454
/*      the Block access methods to satisfy the request.  This would    */
4455
/*      normally only be overridden by formats with overviews.          */
4456
/*                                                                      */
4457
/*      To keep things relatively simple, this method does not          */
4458
/*      currently take advantage of some special cases addressed in     */
4459
/*      GDALRasterBand::IRasterIO(), so it is likely best to only       */
4460
/*      call it when you know it will help.  That is in cases where     */
4461
/*      data is at 1:1 to the buffer, and you know the driver is        */
4462
/*      implementing interleaved IO efficiently on a block by block     */
4463
/*      basis. Overviews will be used when possible.                    */
4464
/************************************************************************/
4465

4466
CPLErr GDALDataset::BlockBasedRasterIO(
63,912✔
4467
    GDALRWFlag eRWFlag, int nXOff, int nYOff, int nXSize, int nYSize,
4468
    void *pData, int nBufXSize, int nBufYSize, GDALDataType eBufType,
4469
    int nBandCount, const int *panBandMap, GSpacing nPixelSpace,
4470
    GSpacing nLineSpace, GSpacing nBandSpace, GDALRasterIOExtraArg *psExtraArg)
4471

4472
{
4473
    CPLAssert(nullptr != pData);
63,912✔
4474

4475
    GByte **papabySrcBlock = nullptr;
63,912✔
4476
    GDALRasterBlock *poBlock = nullptr;
63,912✔
4477
    GDALRasterBlock **papoBlocks = nullptr;
63,912✔
4478
    int nLBlockX = -1;
63,912✔
4479
    int nLBlockY = -1;
63,912✔
4480
    int iBufYOff;
4481
    int iBufXOff;
4482
    int nBlockXSize = 1;
63,912✔
4483
    int nBlockYSize = 1;
63,912✔
4484
    CPLErr eErr = CE_None;
63,912✔
4485
    GDALDataType eDataType = GDT_Byte;
63,912✔
4486

4487
    const bool bUseIntegerRequestCoords =
63,912✔
4488
        (!psExtraArg->bFloatingPointWindowValidity ||
63,942✔
4489
         (nXOff == psExtraArg->dfXOff && nYOff == psExtraArg->dfYOff &&
30✔
4490
          nXSize == psExtraArg->dfXSize && nYSize == psExtraArg->dfYSize));
28✔
4491

4492
    /* -------------------------------------------------------------------- */
4493
    /*      Ensure that all bands share a common block size and data type.  */
4494
    /* -------------------------------------------------------------------- */
4495
    for (int iBand = 0; iBand < nBandCount; iBand++)
303,011✔
4496
    {
4497
        GDALRasterBand *poBand = GetRasterBand(panBandMap[iBand]);
239,098✔
4498

4499
        if (iBand == 0)
239,098✔
4500
        {
4501
            poBand->GetBlockSize(&nBlockXSize, &nBlockYSize);
63,911✔
4502
            eDataType = poBand->GetRasterDataType();
63,912✔
4503
        }
4504
        else
4505
        {
4506
            int nThisBlockXSize = 0;
175,187✔
4507
            int nThisBlockYSize = 0;
175,187✔
4508
            poBand->GetBlockSize(&nThisBlockXSize, &nThisBlockYSize);
175,187✔
4509
            if (nThisBlockXSize != nBlockXSize ||
175,188✔
4510
                nThisBlockYSize != nBlockYSize)
175,188✔
4511
            {
4512
                CPLDebug("GDAL", "GDALDataset::BlockBasedRasterIO() ... "
×
4513
                                 "mismatched block sizes, use std method.");
4514
                return BandBasedRasterIO(eRWFlag, nXOff, nYOff, nXSize, nYSize,
×
4515
                                         pData, nBufXSize, nBufYSize, eBufType,
4516
                                         nBandCount, panBandMap, nPixelSpace,
4517
                                         nLineSpace, nBandSpace, psExtraArg);
×
4518
            }
4519

4520
            if (eDataType != poBand->GetRasterDataType() &&
175,188✔
4521
                (nXSize != nBufXSize || nYSize != nBufYSize))
×
4522
            {
4523
                CPLDebug("GDAL", "GDALDataset::BlockBasedRasterIO() ... "
1✔
4524
                                 "mismatched band data types, use std method.");
4525
                return BandBasedRasterIO(eRWFlag, nXOff, nYOff, nXSize, nYSize,
×
4526
                                         pData, nBufXSize, nBufYSize, eBufType,
4527
                                         nBandCount, panBandMap, nPixelSpace,
4528
                                         nLineSpace, nBandSpace, psExtraArg);
×
4529
            }
4530
        }
4531
    }
4532

4533
    /* ==================================================================== */
4534
    /*      In this special case at full resolution we step through in      */
4535
    /*      blocks, turning the request over to the per-band                */
4536
    /*      IRasterIO(), but ensuring that all bands of one block are       */
4537
    /*      called before proceeding to the next.                           */
4538
    /* ==================================================================== */
4539

4540
    if (nXSize == nBufXSize && nYSize == nBufYSize && bUseIntegerRequestCoords)
63,913✔
4541
    {
4542
        GDALRasterIOExtraArg sDummyExtraArg;
4543
        INIT_RASTERIO_EXTRA_ARG(sDummyExtraArg);
63,909✔
4544

4545
        int nChunkYSize = 0;
63,909✔
4546
        int nChunkXSize = 0;
63,909✔
4547

4548
        for (iBufYOff = 0; iBufYOff < nBufYSize; iBufYOff += nChunkYSize)
210,046✔
4549
        {
4550
            const int nChunkYOff = iBufYOff + nYOff;
147,152✔
4551
            nChunkYSize = nBlockYSize - (nChunkYOff % nBlockYSize);
147,152✔
4552
            if (nChunkYOff + nChunkYSize > nYOff + nYSize)
147,152✔
4553
                nChunkYSize = (nYOff + nYSize) - nChunkYOff;
59,103✔
4554

4555
            for (iBufXOff = 0; iBufXOff < nBufXSize; iBufXOff += nChunkXSize)
816,579✔
4556
            {
4557
                const int nChunkXOff = iBufXOff + nXOff;
670,441✔
4558
                nChunkXSize = nBlockXSize - (nChunkXOff % nBlockXSize);
670,441✔
4559
                if (nChunkXOff + nChunkXSize > nXOff + nXSize)
670,441✔
4560
                    nChunkXSize = (nXOff + nXSize) - nChunkXOff;
70,151✔
4561

4562
                GByte *pabyChunkData =
670,441✔
4563
                    static_cast<GByte *>(pData) + iBufXOff * nPixelSpace +
670,441✔
4564
                    static_cast<GPtrDiff_t>(iBufYOff) * nLineSpace;
670,441✔
4565

4566
                for (int iBand = 0; iBand < nBandCount; iBand++)
3,265,410✔
4567
                {
4568
                    GDALRasterBand *poBand = GetRasterBand(panBandMap[iBand]);
2,595,980✔
4569

4570
                    eErr = poBand->IRasterIO(
5,191,960✔
4571
                        eRWFlag, nChunkXOff, nChunkYOff, nChunkXSize,
4572
                        nChunkYSize,
4573
                        pabyChunkData +
2,595,980✔
4574
                            static_cast<GPtrDiff_t>(iBand) * nBandSpace,
2,595,980✔
4575
                        nChunkXSize, nChunkYSize, eBufType, nPixelSpace,
4576
                        nLineSpace, &sDummyExtraArg);
2,595,980✔
4577
                    if (eErr != CE_None)
2,595,980✔
4578
                        return eErr;
1,014✔
4579
                }
4580
            }
4581

4582
            if (psExtraArg->pfnProgress != nullptr &&
164,926✔
4583
                !psExtraArg->pfnProgress(
18,788✔
4584
                    1.0 * std::min(nBufYSize, iBufYOff + nChunkYSize) /
164,926✔
4585
                        nBufYSize,
4586
                    "", psExtraArg->pProgressData))
4587
            {
UNCOV
4588
                return CE_Failure;
×
4589
            }
4590
        }
4591

4592
        return CE_None;
62,894✔
4593
    }
4594

4595
    /* Below code is not compatible with that case. It would need a complete */
4596
    /* separate code like done in GDALRasterBand::IRasterIO. */
4597
    if (eRWFlag == GF_Write && (nBufXSize < nXSize || nBufYSize < nYSize))
4✔
4598
    {
4599
        return BandBasedRasterIO(eRWFlag, nXOff, nYOff, nXSize, nYSize, pData,
×
4600
                                 nBufXSize, nBufYSize, eBufType, nBandCount,
4601
                                 panBandMap, nPixelSpace, nLineSpace,
4602
                                 nBandSpace, psExtraArg);
×
4603
    }
4604

4605
    /* We could have a smarter implementation, but that will do for now */
4606
    if (psExtraArg->eResampleAlg != GRIORA_NearestNeighbour &&
4✔
4607
        (nBufXSize != nXSize || nBufYSize != nYSize))
×
4608
    {
4609
        return BandBasedRasterIO(eRWFlag, nXOff, nYOff, nXSize, nYSize, pData,
×
4610
                                 nBufXSize, nBufYSize, eBufType, nBandCount,
4611
                                 panBandMap, nPixelSpace, nLineSpace,
4612
                                 nBandSpace, psExtraArg);
×
4613
    }
4614

4615
    /* ==================================================================== */
4616
    /*      Loop reading required source blocks to satisfy output           */
4617
    /*      request.  This is the most general implementation.              */
4618
    /* ==================================================================== */
4619

4620
    const int nBandDataSize = GDALGetDataTypeSizeBytes(eDataType);
4✔
4621

4622
    papabySrcBlock =
4623
        static_cast<GByte **>(CPLCalloc(sizeof(GByte *), nBandCount));
4✔
4624
    papoBlocks =
4625
        static_cast<GDALRasterBlock **>(CPLCalloc(sizeof(void *), nBandCount));
4✔
4626

4627
    /* -------------------------------------------------------------------- */
4628
    /*      Select an overview level if appropriate.                        */
4629
    /* -------------------------------------------------------------------- */
4630

4631
    GDALRasterIOExtraArg sExtraArg;
4632
    GDALCopyRasterIOExtraArg(&sExtraArg, psExtraArg);
4✔
4633
    const int nOverviewLevel = GDALDatasetGetBestOverviewLevel(
4✔
4634
        this, nXOff, nYOff, nXSize, nYSize, nBufXSize, nBufYSize, nBandCount,
4635
        panBandMap, &sExtraArg);
4636
    if (nOverviewLevel >= 0)
4✔
4637
    {
4638
        GetRasterBand(panBandMap[0])
2✔
4639
            ->GetOverview(nOverviewLevel)
2✔
4640
            ->GetBlockSize(&nBlockXSize, &nBlockYSize);
2✔
4641
    }
4642

4643
    double dfXOff = nXOff;
4✔
4644
    double dfYOff = nYOff;
4✔
4645
    double dfXSize = nXSize;
4✔
4646
    double dfYSize = nYSize;
4✔
4647
    if (sExtraArg.bFloatingPointWindowValidity)
4✔
4648
    {
4649
        dfXOff = sExtraArg.dfXOff;
2✔
4650
        dfYOff = sExtraArg.dfYOff;
2✔
4651
        dfXSize = sExtraArg.dfXSize;
2✔
4652
        dfYSize = sExtraArg.dfYSize;
2✔
4653
    }
4654

4655
    /* -------------------------------------------------------------------- */
4656
    /*      Compute stepping increment.                                     */
4657
    /* -------------------------------------------------------------------- */
4658
    const double dfSrcXInc = dfXSize / static_cast<double>(nBufXSize);
4✔
4659
    const double dfSrcYInc = dfYSize / static_cast<double>(nBufYSize);
4✔
4660

4661
    constexpr double EPS = 1e-10;
4✔
4662
    /* -------------------------------------------------------------------- */
4663
    /*      Loop over buffer computing source locations.                    */
4664
    /* -------------------------------------------------------------------- */
4665
    for (iBufYOff = 0; iBufYOff < nBufYSize; iBufYOff++)
36✔
4666
    {
4667
        GPtrDiff_t iSrcOffset;
4668

4669
        // Add small epsilon to avoid some numeric precision issues.
4670
        const double dfSrcY = (iBufYOff + 0.5) * dfSrcYInc + dfYOff + EPS;
32✔
4671
        const int iSrcY = static_cast<int>(std::min(
32✔
4672
            std::max(0.0, dfSrcY), static_cast<double>(nRasterYSize - 1)));
32✔
4673

4674
        GPtrDiff_t iBufOffset = static_cast<GPtrDiff_t>(iBufYOff) *
32✔
4675
                                static_cast<GPtrDiff_t>(nLineSpace);
4676

4677
        for (iBufXOff = 0; iBufXOff < nBufXSize; iBufXOff++)
302✔
4678
        {
4679
            const double dfSrcX = (iBufXOff + 0.5) * dfSrcXInc + dfXOff + EPS;
270✔
4680
            const int iSrcX = static_cast<int>(std::min(
270✔
4681
                std::max(0.0, dfSrcX), static_cast<double>(nRasterXSize - 1)));
270✔
4682

4683
            // FIXME: this code likely doesn't work if the dirty block gets
4684
            // flushed to disk before being completely written. In the meantime,
4685
            // bJustInitialize should probably be set to FALSE even if it is not
4686
            // ideal performance wise, and for lossy compression
4687

4688
            /* --------------------------------------------------------------------
4689
             */
4690
            /*      Ensure we have the appropriate block loaded. */
4691
            /* --------------------------------------------------------------------
4692
             */
4693
            if (iSrcX < nLBlockX * nBlockXSize ||
270✔
4694
                iSrcX - nBlockXSize >= nLBlockX * nBlockXSize ||
270✔
4695
                iSrcY < nLBlockY * nBlockYSize ||
266✔
4696
                iSrcY - nBlockYSize >= nLBlockY * nBlockYSize)
266✔
4697
            {
4698
                nLBlockX = iSrcX / nBlockXSize;
4✔
4699
                nLBlockY = iSrcY / nBlockYSize;
4✔
4700

4701
                const bool bJustInitialize =
4✔
4702
                    eRWFlag == GF_Write && nYOff <= nLBlockY * nBlockYSize &&
×
4703
                    nYOff + nYSize - nBlockYSize >= nLBlockY * nBlockYSize &&
×
4704
                    nXOff <= nLBlockX * nBlockXSize &&
4✔
4705
                    nXOff + nXSize - nBlockXSize >= nLBlockX * nBlockXSize;
×
4706
                /*bool bMemZeroBuffer = FALSE;
4707
                if( eRWFlag == GF_Write && !bJustInitialize &&
4708
                    nXOff <= nLBlockX * nBlockXSize &&
4709
                    nYOff <= nLBlockY * nBlockYSize &&
4710
                    (nXOff + nXSize >= (nLBlockX+1) * nBlockXSize ||
4711
                     (nXOff + nXSize == GetRasterXSize() &&
4712
                     (nLBlockX+1) * nBlockXSize > GetRasterXSize())) &&
4713
                    (nYOff + nYSize >= (nLBlockY+1) * nBlockYSize ||
4714
                     (nYOff + nYSize == GetRasterYSize() &&
4715
                     (nLBlockY+1) * nBlockYSize > GetRasterYSize())) )
4716
                {
4717
                    bJustInitialize = TRUE;
4718
                    bMemZeroBuffer = TRUE;
4719
                }*/
4720
                for (int iBand = 0; iBand < nBandCount; iBand++)
12✔
4721
                {
4722
                    GDALRasterBand *poBand = GetRasterBand(panBandMap[iBand]);
8✔
4723
                    if (nOverviewLevel >= 0)
8✔
4724
                        poBand = poBand->GetOverview(nOverviewLevel);
2✔
4725
                    poBlock = poBand->GetLockedBlockRef(nLBlockX, nLBlockY,
16✔
4726
                                                        bJustInitialize);
8✔
4727
                    if (poBlock == nullptr)
8✔
4728
                    {
4729
                        eErr = CE_Failure;
×
4730
                        goto CleanupAndReturn;
×
4731
                    }
4732

4733
                    if (eRWFlag == GF_Write)
8✔
4734
                        poBlock->MarkDirty();
×
4735

4736
                    if (papoBlocks[iBand] != nullptr)
8✔
4737
                        papoBlocks[iBand]->DropLock();
×
4738

4739
                    papoBlocks[iBand] = poBlock;
8✔
4740

4741
                    papabySrcBlock[iBand] =
8✔
4742
                        static_cast<GByte *>(poBlock->GetDataRef());
8✔
4743
                    /*if( bMemZeroBuffer )
4744
                    {
4745
                        memset(papabySrcBlock[iBand], 0,
4746
                            static_cast<GPtrDiff_t>(nBandDataSize) * nBlockXSize
4747
                    * nBlockYSize);
4748
                    }*/
4749
                }
4750
            }
4751

4752
            /* --------------------------------------------------------------------
4753
             */
4754
            /*      Copy over this pixel of data. */
4755
            /* --------------------------------------------------------------------
4756
             */
4757
            iSrcOffset = (static_cast<GPtrDiff_t>(iSrcX) -
270✔
4758
                          static_cast<GPtrDiff_t>(nLBlockX) * nBlockXSize +
270✔
4759
                          (static_cast<GPtrDiff_t>(iSrcY) -
270✔
4760
                           static_cast<GPtrDiff_t>(nLBlockY) * nBlockYSize) *
270✔
4761
                              nBlockXSize) *
270✔
4762
                         nBandDataSize;
270✔
4763

4764
            for (int iBand = 0; iBand < nBandCount; iBand++)
980✔
4765
            {
4766
                GByte *pabySrcBlock = papabySrcBlock[iBand];
710✔
4767
                GPtrDiff_t iBandBufOffset =
710✔
4768
                    iBufOffset + static_cast<GPtrDiff_t>(iBand) *
710✔
4769
                                     static_cast<GPtrDiff_t>(nBandSpace);
4770

4771
                if (eDataType == eBufType)
710✔
4772
                {
4773
                    if (eRWFlag == GF_Read)
710✔
4774
                        memcpy(static_cast<GByte *>(pData) + iBandBufOffset,
710✔
4775
                               pabySrcBlock + iSrcOffset, nBandDataSize);
710✔
4776
                    else
4777
                        memcpy(pabySrcBlock + iSrcOffset,
×
4778
                               static_cast<const GByte *>(pData) +
4779
                                   iBandBufOffset,
×
4780
                               nBandDataSize);
4781
                }
4782
                else
4783
                {
4784
                    /* type to type conversion ... ouch, this is expensive way
4785
                       of handling single words */
4786

4787
                    if (eRWFlag == GF_Read)
×
4788
                        GDALCopyWords64(pabySrcBlock + iSrcOffset, eDataType, 0,
×
4789
                                        static_cast<GByte *>(pData) +
4790
                                            iBandBufOffset,
×
4791
                                        eBufType, 0, 1);
4792
                    else
4793
                        GDALCopyWords64(static_cast<const GByte *>(pData) +
×
4794
                                            iBandBufOffset,
×
4795
                                        eBufType, 0, pabySrcBlock + iSrcOffset,
×
4796
                                        eDataType, 0, 1);
4797
                }
4798
            }
4799

4800
            iBufOffset += static_cast<int>(nPixelSpace);
270✔
4801
        }
4802
    }
4803

4804
    /* -------------------------------------------------------------------- */
4805
    /*      CleanupAndReturn.                                               */
4806
    /* -------------------------------------------------------------------- */
4807
CleanupAndReturn:
4✔
4808
    CPLFree(papabySrcBlock);
4✔
4809
    if (papoBlocks != nullptr)
4✔
4810
    {
4811
        for (int iBand = 0; iBand < nBandCount; iBand++)
12✔
4812
        {
4813
            if (papoBlocks[iBand] != nullptr)
8✔
4814
                papoBlocks[iBand]->DropLock();
8✔
4815
        }
4816
        CPLFree(papoBlocks);
4✔
4817
    }
4818

4819
    return eErr;
4✔
4820
}
4821

4822
//! @endcond
4823

4824
/************************************************************************/
4825
/*                  GDALCopyWholeRasterGetSwathSize()                   */
4826
/************************************************************************/
4827

4828
static void GDALCopyWholeRasterGetSwathSize(GDALRasterBand *poSrcPrototypeBand,
3,158✔
4829
                                            GDALRasterBand *poDstPrototypeBand,
4830
                                            int nBandCount,
4831
                                            int bDstIsCompressed,
4832
                                            int bInterleave, int *pnSwathCols,
4833
                                            int *pnSwathLines)
4834
{
4835
    GDALDataType eDT = poDstPrototypeBand->GetRasterDataType();
3,158✔
4836
    int nSrcBlockXSize = 0;
3,158✔
4837
    int nSrcBlockYSize = 0;
3,158✔
4838
    int nBlockXSize = 0;
3,158✔
4839
    int nBlockYSize = 0;
3,158✔
4840

4841
    int nXSize = poSrcPrototypeBand->GetXSize();
3,158✔
4842
    int nYSize = poSrcPrototypeBand->GetYSize();
3,158✔
4843

4844
    poSrcPrototypeBand->GetBlockSize(&nSrcBlockXSize, &nSrcBlockYSize);
3,158✔
4845
    poDstPrototypeBand->GetBlockSize(&nBlockXSize, &nBlockYSize);
3,158✔
4846

4847
    const int nMaxBlockXSize = std::max(nBlockXSize, nSrcBlockXSize);
3,158✔
4848
    const int nMaxBlockYSize = std::max(nBlockYSize, nSrcBlockYSize);
3,158✔
4849

4850
    int nPixelSize = GDALGetDataTypeSizeBytes(eDT);
3,158✔
4851
    if (bInterleave)
3,158✔
4852
        nPixelSize *= nBandCount;
549✔
4853

4854
    // aim for one row of blocks.  Do not settle for less.
4855
    int nSwathCols = nXSize;
3,158✔
4856
    int nSwathLines = nMaxBlockYSize;
3,158✔
4857

4858
    const char *pszSrcCompression =
4859
        poSrcPrototypeBand->GetMetadataItem("COMPRESSION", "IMAGE_STRUCTURE");
3,158✔
4860
    if (pszSrcCompression == nullptr)
3,158✔
4861
    {
4862
        auto poSrcDS = poSrcPrototypeBand->GetDataset();
3,132✔
4863
        if (poSrcDS)
3,132✔
4864
            pszSrcCompression =
4865
                poSrcDS->GetMetadataItem("COMPRESSION", "IMAGE_STRUCTURE");
3,126✔
4866
    }
4867

4868
    /* -------------------------------------------------------------------- */
4869
    /*      What will our swath size be?                                    */
4870
    /* -------------------------------------------------------------------- */
4871
    // When writing interleaved data in a compressed format, we want to be sure
4872
    // that each block will only be written once, so the swath size must not be
4873
    // greater than the block cache.
4874
    const char *pszSwathSize = CPLGetConfigOption("GDAL_SWATH_SIZE", nullptr);
3,158✔
4875
    int nTargetSwathSize;
4876
    if (pszSwathSize != nullptr)
3,158✔
4877
        nTargetSwathSize = static_cast<int>(
×
4878
            std::min(GIntBig(INT_MAX), CPLAtoGIntBig(pszSwathSize)));
×
4879
    else
4880
    {
4881
        // As a default, take one 1/4 of the cache size.
4882
        nTargetSwathSize = static_cast<int>(
3,158✔
4883
            std::min(GIntBig(INT_MAX), GDALGetCacheMax64() / 4));
3,158✔
4884

4885
        // but if the minimum idal swath buf size is less, then go for it to
4886
        // avoid unnecessarily abusing RAM usage.
4887
        // but try to use 10 MB at least.
4888
        GIntBig nIdealSwathBufSize =
3,158✔
4889
            static_cast<GIntBig>(nSwathCols) * nSwathLines * nPixelSize;
3,158✔
4890
        int nMinTargetSwathSize = 10 * 1000 * 1000;
3,158✔
4891

4892
        if ((poSrcPrototypeBand->GetSuggestedBlockAccessPattern() &
3,158✔
4893
             GSBAP_LARGEST_CHUNK_POSSIBLE) != 0)
3,158✔
4894
        {
4895
            nMinTargetSwathSize = nTargetSwathSize;
2✔
4896
        }
4897

4898
        if (nIdealSwathBufSize < nTargetSwathSize &&
3,158✔
4899
            nIdealSwathBufSize < nMinTargetSwathSize)
3,148✔
4900
        {
4901
            nIdealSwathBufSize = nMinTargetSwathSize;
3,145✔
4902
        }
4903

4904
        if (pszSrcCompression != nullptr &&
3,158✔
4905
            EQUAL(pszSrcCompression, "JPEG2000") &&
180✔
4906
            (!bDstIsCompressed || ((nSrcBlockXSize % nBlockXSize) == 0 &&
×
4907
                                   (nSrcBlockYSize % nBlockYSize) == 0)))
×
4908
        {
4909
            nIdealSwathBufSize =
2✔
4910
                std::max(nIdealSwathBufSize, static_cast<GIntBig>(nSwathCols) *
4✔
4911
                                                 nSrcBlockYSize * nPixelSize);
2✔
4912
        }
4913
        if (nTargetSwathSize > nIdealSwathBufSize)
3,158✔
4914
            nTargetSwathSize = static_cast<int>(
3,144✔
4915
                std::min(GIntBig(INT_MAX), nIdealSwathBufSize));
3,144✔
4916
    }
4917

4918
    if (nTargetSwathSize < 1000000)
3,158✔
4919
        nTargetSwathSize = 1000000;
8✔
4920

4921
    /* But let's check that  */
4922
    if (bDstIsCompressed && bInterleave &&
3,376✔
4923
        nTargetSwathSize > GDALGetCacheMax64())
218✔
4924
    {
4925
        CPLError(CE_Warning, CPLE_AppDefined,
×
4926
                 "When translating into a compressed interleave format, "
4927
                 "the block cache size (" CPL_FRMT_GIB ") "
4928
                 "should be at least the size of the swath (%d) "
4929
                 "(GDAL_SWATH_SIZE config. option)",
4930
                 GDALGetCacheMax64(), nTargetSwathSize);
4931
    }
4932

4933
#define IS_DIVIDER_OF(x, y) ((y) % (x) == 0)
4934
#define ROUND_TO(x, y) (((x) / (y)) * (y))
4935

4936
    // if both input and output datasets are tiled, that the tile dimensions
4937
    // are "compatible", try to stick  to a swath dimension that is a multiple
4938
    // of input and output block dimensions.
4939
    if (nBlockXSize != nXSize && nSrcBlockXSize != nXSize &&
3,158✔
4940
        IS_DIVIDER_OF(nBlockXSize, nMaxBlockXSize) &&
37✔
4941
        IS_DIVIDER_OF(nSrcBlockXSize, nMaxBlockXSize) &&
37✔
4942
        IS_DIVIDER_OF(nBlockYSize, nMaxBlockYSize) &&
37✔
4943
        IS_DIVIDER_OF(nSrcBlockYSize, nMaxBlockYSize))
37✔
4944
    {
4945
        if (static_cast<GIntBig>(nMaxBlockXSize) * nMaxBlockYSize *
37✔
4946
                nPixelSize <=
37✔
4947
            static_cast<GIntBig>(nTargetSwathSize))
37✔
4948
        {
4949
            nSwathCols = nTargetSwathSize / (nMaxBlockYSize * nPixelSize);
37✔
4950
            nSwathCols = ROUND_TO(nSwathCols, nMaxBlockXSize);
37✔
4951
            if (nSwathCols == 0)
37✔
4952
                nSwathCols = nMaxBlockXSize;
×
4953
            if (nSwathCols > nXSize)
37✔
4954
                nSwathCols = nXSize;
35✔
4955
            nSwathLines = nMaxBlockYSize;
37✔
4956

4957
            if (static_cast<GIntBig>(nSwathCols) * nSwathLines * nPixelSize >
37✔
4958
                static_cast<GIntBig>(nTargetSwathSize))
37✔
4959
            {
4960
                nSwathCols = nXSize;
×
4961
                nSwathLines = nBlockYSize;
×
4962
            }
4963
        }
4964
    }
4965

4966
    const GIntBig nMemoryPerCol = static_cast<GIntBig>(nSwathCols) * nPixelSize;
3,158✔
4967
    const GIntBig nSwathBufSize = nMemoryPerCol * nSwathLines;
3,158✔
4968
    if (nSwathBufSize > static_cast<GIntBig>(nTargetSwathSize))
3,158✔
4969
    {
4970
        nSwathLines = static_cast<int>(nTargetSwathSize / nMemoryPerCol);
1✔
4971
        if (nSwathLines == 0)
1✔
4972
            nSwathLines = 1;
1✔
4973

4974
        CPLDebug(
1✔
4975
            "GDAL",
4976
            "GDALCopyWholeRasterGetSwathSize(): adjusting to %d line swath "
4977
            "since requirement (" CPL_FRMT_GIB " bytes) exceed target swath "
4978
            "size (%d bytes) (GDAL_SWATH_SIZE config. option)",
4979
            nSwathLines, nBlockYSize * nMemoryPerCol, nTargetSwathSize);
1✔
4980
    }
4981
    // If we are processing single scans, try to handle several at once.
4982
    // If we are handling swaths already, only grow the swath if a row
4983
    // of blocks is substantially less than our target buffer size.
4984
    else if (nSwathLines == 1 ||
3,157✔
4985
             nMemoryPerCol * nSwathLines <
2,617✔
4986
                 static_cast<GIntBig>(nTargetSwathSize) / 10)
2,617✔
4987
    {
4988
        nSwathLines = std::min(
3,129✔
4989
            nYSize,
4990
            std::max(1, static_cast<int>(nTargetSwathSize / nMemoryPerCol)));
3,129✔
4991

4992
        /* If possible try to align to source and target block height */
4993
        if ((nSwathLines % nMaxBlockYSize) != 0 &&
3,129✔
4994
            nSwathLines > nMaxBlockYSize &&
251✔
4995
            IS_DIVIDER_OF(nBlockYSize, nMaxBlockYSize) &&
251✔
4996
            IS_DIVIDER_OF(nSrcBlockYSize, nMaxBlockYSize))
222✔
4997
            nSwathLines = ROUND_TO(nSwathLines, nMaxBlockYSize);
202✔
4998
    }
4999

5000
    if (pszSrcCompression != nullptr && EQUAL(pszSrcCompression, "JPEG2000") &&
3,158✔
5001
        (!bDstIsCompressed || (IS_DIVIDER_OF(nBlockXSize, nSrcBlockXSize) &&
×
5002
                               IS_DIVIDER_OF(nBlockYSize, nSrcBlockYSize))))
×
5003
    {
5004
        // Typical use case: converting from Pleaiades that is 2048x2048 tiled.
5005
        if (nSwathLines < nSrcBlockYSize)
2✔
5006
        {
5007
            nSwathLines = nSrcBlockYSize;
×
5008

5009
            // Number of pixels that can be read/write simultaneously.
5010
            nSwathCols = nTargetSwathSize / (nSrcBlockXSize * nPixelSize);
×
5011
            nSwathCols = ROUND_TO(nSwathCols, nSrcBlockXSize);
×
5012
            if (nSwathCols == 0)
×
5013
                nSwathCols = nSrcBlockXSize;
×
5014
            if (nSwathCols > nXSize)
×
5015
                nSwathCols = nXSize;
×
5016

5017
            CPLDebug(
×
5018
                "GDAL",
5019
                "GDALCopyWholeRasterGetSwathSize(): because of compression and "
5020
                "too high block, "
5021
                "use partial width at one time");
5022
        }
5023
        else if ((nSwathLines % nSrcBlockYSize) != 0)
2✔
5024
        {
5025
            /* Round on a multiple of nSrcBlockYSize */
5026
            nSwathLines = ROUND_TO(nSwathLines, nSrcBlockYSize);
×
5027
            CPLDebug(
×
5028
                "GDAL",
5029
                "GDALCopyWholeRasterGetSwathSize(): because of compression, "
5030
                "round nSwathLines to block height : %d",
5031
                nSwathLines);
5032
        }
5033
    }
5034
    else if (bDstIsCompressed)
3,156✔
5035
    {
5036
        if (nSwathLines < nBlockYSize)
408✔
5037
        {
5038
            nSwathLines = nBlockYSize;
146✔
5039

5040
            // Number of pixels that can be read/write simultaneously.
5041
            nSwathCols = nTargetSwathSize / (nSwathLines * nPixelSize);
146✔
5042
            nSwathCols = ROUND_TO(nSwathCols, nBlockXSize);
146✔
5043
            if (nSwathCols == 0)
146✔
5044
                nSwathCols = nBlockXSize;
×
5045
            if (nSwathCols > nXSize)
146✔
5046
                nSwathCols = nXSize;
146✔
5047

5048
            CPLDebug(
146✔
5049
                "GDAL",
5050
                "GDALCopyWholeRasterGetSwathSize(): because of compression and "
5051
                "too high block, "
5052
                "use partial width at one time");
5053
        }
5054
        else if ((nSwathLines % nBlockYSize) != 0)
262✔
5055
        {
5056
            // Round on a multiple of nBlockYSize.
5057
            nSwathLines = ROUND_TO(nSwathLines, nBlockYSize);
9✔
5058
            CPLDebug(
9✔
5059
                "GDAL",
5060
                "GDALCopyWholeRasterGetSwathSize(): because of compression, "
5061
                "round nSwathLines to block height : %d",
5062
                nSwathLines);
5063
        }
5064
    }
5065

5066
    *pnSwathCols = nSwathCols;
3,158✔
5067
    *pnSwathLines = nSwathLines;
3,158✔
5068
}
3,158✔
5069

5070
/************************************************************************/
5071
/*                     GDALDatasetCopyWholeRaster()                     */
5072
/************************************************************************/
5073

5074
/**
5075
 * \brief Copy all dataset raster data.
5076
 *
5077
 * This function copies the complete raster contents of one dataset to
5078
 * another similarly configured dataset.  The source and destination
5079
 * dataset must have the same number of bands, and the same width
5080
 * and height.  The bands do not have to have the same data type.
5081
 *
5082
 * This function is primarily intended to support implementation of
5083
 * driver specific CreateCopy() functions.  It implements efficient copying,
5084
 * in particular "chunking" the copy in substantial blocks and, if appropriate,
5085
 * performing the transfer in a pixel interleaved fashion.
5086
 *
5087
 * Currently the only papszOptions values supported are:
5088
 * <ul>
5089
 * <li>"INTERLEAVE=PIXEL/BAND" to force pixel (resp. band) interleaved read and
5090
 * write access pattern (this does not modify the layout of the destination
5091
 * data)</li> <li>"COMPRESSED=YES" to force alignment on target dataset block
5092
 * sizes to achieve best compression.</li> <li>"SKIP_HOLES=YES" to skip chunks
5093
 * for which GDALGetDataCoverageStatus() returns GDAL_DATA_COVERAGE_STATUS_EMPTY
5094
 * (GDAL &gt;= 2.2)</li>
5095
 * </ul>
5096
 * More options may be supported in the future.
5097
 *
5098
 * @param hSrcDS the source dataset
5099
 * @param hDstDS the destination dataset
5100
 * @param papszOptions transfer hints in "StringList" Name=Value format.
5101
 * @param pfnProgress progress reporting function.
5102
 * @param pProgressData callback data for progress function.
5103
 *
5104
 * @return CE_None on success, or CE_Failure on failure.
5105
 */
5106

5107
CPLErr CPL_STDCALL GDALDatasetCopyWholeRaster(GDALDatasetH hSrcDS,
3,131✔
5108
                                              GDALDatasetH hDstDS,
5109
                                              CSLConstList papszOptions,
5110
                                              GDALProgressFunc pfnProgress,
5111
                                              void *pProgressData)
5112

5113
{
5114
    VALIDATE_POINTER1(hSrcDS, "GDALDatasetCopyWholeRaster", CE_Failure);
3,131✔
5115
    VALIDATE_POINTER1(hDstDS, "GDALDatasetCopyWholeRaster", CE_Failure);
3,131✔
5116

5117
    GDALDataset *poSrcDS = GDALDataset::FromHandle(hSrcDS);
3,131✔
5118
    GDALDataset *poDstDS = GDALDataset::FromHandle(hDstDS);
3,131✔
5119

5120
    if (pfnProgress == nullptr)
3,131✔
5121
        pfnProgress = GDALDummyProgress;
×
5122

5123
    /* -------------------------------------------------------------------- */
5124
    /*      Confirm the datasets match in size and band counts.             */
5125
    /* -------------------------------------------------------------------- */
5126
    const int nXSize = poDstDS->GetRasterXSize();
3,131✔
5127
    const int nYSize = poDstDS->GetRasterYSize();
3,131✔
5128
    const int nBandCount = poDstDS->GetRasterCount();
3,131✔
5129

5130
    if (poSrcDS->GetRasterXSize() != nXSize ||
3,131✔
5131
        poSrcDS->GetRasterYSize() != nYSize ||
6,262✔
5132
        poSrcDS->GetRasterCount() != nBandCount)
3,131✔
5133
    {
5134
        CPLError(CE_Failure, CPLE_AppDefined,
×
5135
                 "Input and output dataset sizes or band counts do not\n"
5136
                 "match in GDALDatasetCopyWholeRaster()");
5137
        return CE_Failure;
×
5138
    }
5139

5140
    /* -------------------------------------------------------------------- */
5141
    /*      Report preliminary (0) progress.                                */
5142
    /* -------------------------------------------------------------------- */
5143
    if (!pfnProgress(0.0, nullptr, pProgressData))
3,131✔
5144
    {
5145
        CPLError(CE_Failure, CPLE_UserInterrupt,
1✔
5146
                 "User terminated CreateCopy()");
5147
        return CE_Failure;
1✔
5148
    }
5149

5150
    /* -------------------------------------------------------------------- */
5151
    /*      Get our prototype band, and assume the others are similarly     */
5152
    /*      configured.                                                     */
5153
    /* -------------------------------------------------------------------- */
5154
    if (nBandCount == 0)
3,130✔
5155
        return CE_None;
×
5156

5157
    GDALRasterBand *poSrcPrototypeBand = poSrcDS->GetRasterBand(1);
3,130✔
5158
    GDALRasterBand *poDstPrototypeBand = poDstDS->GetRasterBand(1);
3,130✔
5159
    GDALDataType eDT = poDstPrototypeBand->GetRasterDataType();
3,130✔
5160

5161
    /* -------------------------------------------------------------------- */
5162
    /*      Do we want to try and do the operation in a pixel               */
5163
    /*      interleaved fashion?                                            */
5164
    /* -------------------------------------------------------------------- */
5165
    bool bInterleave = false;
3,130✔
5166
    const char *pszInterleave =
5167
        poSrcDS->GetMetadataItem("INTERLEAVE", "IMAGE_STRUCTURE");
3,130✔
5168
    if (pszInterleave != nullptr &&
3,130✔
5169
        (EQUAL(pszInterleave, "PIXEL") || EQUAL(pszInterleave, "LINE")))
2,775✔
5170
        bInterleave = true;
184✔
5171

5172
    pszInterleave = poDstDS->GetMetadataItem("INTERLEAVE", "IMAGE_STRUCTURE");
3,130✔
5173
    if (pszInterleave != nullptr &&
3,130✔
5174
        (EQUAL(pszInterleave, "PIXEL") || EQUAL(pszInterleave, "LINE")))
2,675✔
5175
        bInterleave = true;
496✔
5176

5177
    pszInterleave = CSLFetchNameValue(papszOptions, "INTERLEAVE");
3,130✔
5178
    if (pszInterleave != nullptr && EQUAL(pszInterleave, "PIXEL"))
3,130✔
5179
        bInterleave = true;
5✔
5180
    else if (pszInterleave != nullptr && EQUAL(pszInterleave, "BAND"))
3,125✔
5181
        bInterleave = false;
13✔
5182
    // attributes is specific to the TileDB driver
5183
    else if (pszInterleave != nullptr && EQUAL(pszInterleave, "ATTRIBUTES"))
3,112✔
5184
        bInterleave = true;
4✔
5185
    else if (pszInterleave != nullptr)
3,108✔
5186
    {
5187
        CPLError(CE_Warning, CPLE_NotSupported,
×
5188
                 "Unsupported value for option INTERLEAVE");
5189
    }
5190

5191
    // If the destination is compressed, we must try to write blocks just once,
5192
    // to save disk space (GTiff case for example), and to avoid data loss
5193
    // (JPEG compression for example).
5194
    bool bDstIsCompressed = false;
3,130✔
5195
    const char *pszDstCompressed =
5196
        CSLFetchNameValue(papszOptions, "COMPRESSED");
3,130✔
5197
    if (pszDstCompressed != nullptr && CPLTestBool(pszDstCompressed))
3,130✔
5198
        bDstIsCompressed = true;
383✔
5199

5200
    /* -------------------------------------------------------------------- */
5201
    /*      What will our swath size be?                                    */
5202
    /* -------------------------------------------------------------------- */
5203

5204
    int nSwathCols = 0;
3,130✔
5205
    int nSwathLines = 0;
3,130✔
5206
    GDALCopyWholeRasterGetSwathSize(poSrcPrototypeBand, poDstPrototypeBand,
3,130✔
5207
                                    nBandCount, bDstIsCompressed, bInterleave,
5208
                                    &nSwathCols, &nSwathLines);
5209

5210
    int nPixelSize = GDALGetDataTypeSizeBytes(eDT);
3,130✔
5211
    if (bInterleave)
3,130✔
5212
        nPixelSize *= nBandCount;
549✔
5213

5214
    void *pSwathBuf = VSI_MALLOC3_VERBOSE(nSwathCols, nSwathLines, nPixelSize);
3,130✔
5215
    if (pSwathBuf == nullptr)
3,130✔
5216
    {
5217
        return CE_Failure;
×
5218
    }
5219

5220
    CPLDebug("GDAL",
3,130✔
5221
             "GDALDatasetCopyWholeRaster(): %d*%d swaths, bInterleave=%d",
5222
             nSwathCols, nSwathLines, static_cast<int>(bInterleave));
5223

5224
    // Advise the source raster that we are going to read it completely
5225
    // Note: this might already have been done by GDALCreateCopy() in the
5226
    // likely case this function is indirectly called by it
5227
    poSrcDS->AdviseRead(0, 0, nXSize, nYSize, nXSize, nYSize, eDT, nBandCount,
3,130✔
5228
                        nullptr, nullptr);
3,130✔
5229

5230
    /* ==================================================================== */
5231
    /*      Band oriented (uninterleaved) case.                             */
5232
    /* ==================================================================== */
5233
    CPLErr eErr = CE_None;
3,130✔
5234
    const bool bCheckHoles =
5235
        CPLTestBool(CSLFetchNameValueDef(papszOptions, "SKIP_HOLES", "NO"));
3,130✔
5236

5237
    if (!bInterleave)
3,130✔
5238
    {
5239
        GDALRasterIOExtraArg sExtraArg;
5240
        INIT_RASTERIO_EXTRA_ARG(sExtraArg);
2,581✔
5241
        CPL_IGNORE_RET_VAL(sExtraArg.pfnProgress);  // to make cppcheck happy
2,581✔
5242

5243
        const GIntBig nTotalBlocks = static_cast<GIntBig>(nBandCount) *
7,743✔
5244
                                     DIV_ROUND_UP(nYSize, nSwathLines) *
2,581✔
5245
                                     DIV_ROUND_UP(nXSize, nSwathCols);
2,581✔
5246
        GIntBig nBlocksDone = 0;
2,581✔
5247

5248
        for (int iBand = 0; iBand < nBandCount && eErr == CE_None; iBand++)
7,551✔
5249
        {
5250
            int nBand = iBand + 1;
4,970✔
5251

5252
            for (int iY = 0; iY < nYSize && eErr == CE_None; iY += nSwathLines)
10,255✔
5253
            {
5254
                int nThisLines = nSwathLines;
5,285✔
5255

5256
                if (iY + nThisLines > nYSize)
5,285✔
5257
                    nThisLines = nYSize - iY;
375✔
5258

5259
                for (int iX = 0; iX < nXSize && eErr == CE_None;
10,570✔
5260
                     iX += nSwathCols)
5,285✔
5261
                {
5262
                    int nThisCols = nSwathCols;
5,285✔
5263

5264
                    if (iX + nThisCols > nXSize)
5,285✔
5265
                        nThisCols = nXSize - iX;
×
5266

5267
                    int nStatus = GDAL_DATA_COVERAGE_STATUS_DATA;
5,285✔
5268
                    if (bCheckHoles)
5,285✔
5269
                    {
5270
                        nStatus = poSrcDS->GetRasterBand(nBand)
5271
                                      ->GetDataCoverageStatus(
3,636✔
5272
                                          iX, iY, nThisCols, nThisLines,
5273
                                          GDAL_DATA_COVERAGE_STATUS_DATA);
5274
                    }
5275
                    if (nStatus & GDAL_DATA_COVERAGE_STATUS_DATA)
5,285✔
5276
                    {
5277
                        sExtraArg.pfnProgress = GDALScaledProgress;
5,281✔
5278
                        sExtraArg.pProgressData = GDALCreateScaledProgress(
10,562✔
5279
                            nBlocksDone / static_cast<double>(nTotalBlocks),
5,281✔
5280
                            (nBlocksDone + 0.5) /
5,281✔
5281
                                static_cast<double>(nTotalBlocks),
5,281✔
5282
                            pfnProgress, pProgressData);
5283
                        if (sExtraArg.pProgressData == nullptr)
5,281✔
5284
                            sExtraArg.pfnProgress = nullptr;
1,619✔
5285

5286
                        eErr = poSrcDS->RasterIO(GF_Read, iX, iY, nThisCols,
5,281✔
5287
                                                 nThisLines, pSwathBuf,
5288
                                                 nThisCols, nThisLines, eDT, 1,
5289
                                                 &nBand, 0, 0, 0, &sExtraArg);
5290

5291
                        GDALDestroyScaledProgress(sExtraArg.pProgressData);
5,281✔
5292

5293
                        if (eErr == CE_None)
5,281✔
5294
                            eErr = poDstDS->RasterIO(
5,274✔
5295
                                GF_Write, iX, iY, nThisCols, nThisLines,
5296
                                pSwathBuf, nThisCols, nThisLines, eDT, 1,
5297
                                &nBand, 0, 0, 0, nullptr);
5298
                    }
5299

5300
                    nBlocksDone++;
5,285✔
5301
                    if (eErr == CE_None &&
10,528✔
5302
                        !pfnProgress(nBlocksDone /
5,243✔
5303
                                         static_cast<double>(nTotalBlocks),
5,243✔
5304
                                     nullptr, pProgressData))
5305
                    {
5306
                        eErr = CE_Failure;
2✔
5307
                        CPLError(CE_Failure, CPLE_UserInterrupt,
2✔
5308
                                 "User terminated CreateCopy()");
5309
                    }
5310
                }
5311
            }
5312
        }
5313
    }
5314

5315
    /* ==================================================================== */
5316
    /*      Pixel interleaved case.                                         */
5317
    /* ==================================================================== */
5318
    else /* if( bInterleave ) */
5319
    {
5320
        GDALRasterIOExtraArg sExtraArg;
5321
        INIT_RASTERIO_EXTRA_ARG(sExtraArg);
549✔
5322
        CPL_IGNORE_RET_VAL(sExtraArg.pfnProgress);  // to make cppcheck happy
549✔
5323

5324
        const GIntBig nTotalBlocks =
549✔
5325
            static_cast<GIntBig>(DIV_ROUND_UP(nYSize, nSwathLines)) *
549✔
5326
            DIV_ROUND_UP(nXSize, nSwathCols);
549✔
5327
        GIntBig nBlocksDone = 0;
549✔
5328

5329
        for (int iY = 0; iY < nYSize && eErr == CE_None; iY += nSwathLines)
1,313✔
5330
        {
5331
            int nThisLines = nSwathLines;
764✔
5332

5333
            if (iY + nThisLines > nYSize)
764✔
5334
                nThisLines = nYSize - iY;
191✔
5335

5336
            for (int iX = 0; iX < nXSize && eErr == CE_None; iX += nSwathCols)
1,533✔
5337
            {
5338
                int nThisCols = nSwathCols;
769✔
5339

5340
                if (iX + nThisCols > nXSize)
769✔
5341
                    nThisCols = nXSize - iX;
3✔
5342

5343
                int nStatus = GDAL_DATA_COVERAGE_STATUS_DATA;
769✔
5344
                if (bCheckHoles)
769✔
5345
                {
5346
                    nStatus = 0;
539✔
5347
                    for (int iBand = 0; iBand < nBandCount; iBand++)
592✔
5348
                    {
5349
                        nStatus |= poSrcDS->GetRasterBand(iBand + 1)
573✔
5350
                                       ->GetDataCoverageStatus(
573✔
5351
                                           iX, iY, nThisCols, nThisLines,
5352
                                           GDAL_DATA_COVERAGE_STATUS_DATA);
5353
                        if (nStatus & GDAL_DATA_COVERAGE_STATUS_DATA)
573✔
5354
                            break;
520✔
5355
                    }
5356
                }
5357
                if (nStatus & GDAL_DATA_COVERAGE_STATUS_DATA)
769✔
5358
                {
5359
                    sExtraArg.pfnProgress = GDALScaledProgress;
750✔
5360
                    sExtraArg.pProgressData = GDALCreateScaledProgress(
1,500✔
5361
                        nBlocksDone / static_cast<double>(nTotalBlocks),
750✔
5362
                        (nBlocksDone + 0.5) / static_cast<double>(nTotalBlocks),
750✔
5363
                        pfnProgress, pProgressData);
5364
                    if (sExtraArg.pProgressData == nullptr)
750✔
5365
                        sExtraArg.pfnProgress = nullptr;
343✔
5366

5367
                    eErr = poSrcDS->RasterIO(GF_Read, iX, iY, nThisCols,
750✔
5368
                                             nThisLines, pSwathBuf, nThisCols,
5369
                                             nThisLines, eDT, nBandCount,
5370
                                             nullptr, 0, 0, 0, &sExtraArg);
5371

5372
                    GDALDestroyScaledProgress(sExtraArg.pProgressData);
750✔
5373

5374
                    if (eErr == CE_None)
750✔
5375
                        eErr = poDstDS->RasterIO(
749✔
5376
                            GF_Write, iX, iY, nThisCols, nThisLines, pSwathBuf,
5377
                            nThisCols, nThisLines, eDT, nBandCount, nullptr, 0,
5378
                            0, 0, nullptr);
5379
                }
5380

5381
                nBlocksDone++;
769✔
5382
                if (eErr == CE_None &&
1,534✔
5383
                    !pfnProgress(nBlocksDone /
765✔
5384
                                     static_cast<double>(nTotalBlocks),
765✔
5385
                                 nullptr, pProgressData))
5386
                {
5387
                    eErr = CE_Failure;
1✔
5388
                    CPLError(CE_Failure, CPLE_UserInterrupt,
1✔
5389
                             "User terminated CreateCopy()");
5390
                }
5391
            }
5392
        }
5393
    }
5394

5395
    /* -------------------------------------------------------------------- */
5396
    /*      Cleanup                                                         */
5397
    /* -------------------------------------------------------------------- */
5398
    CPLFree(pSwathBuf);
3,130✔
5399

5400
    return eErr;
3,130✔
5401
}
5402

5403
/************************************************************************/
5404
/*                     GDALRasterBandCopyWholeRaster()                  */
5405
/************************************************************************/
5406

5407
/**
5408
 * \brief Copy a whole raster band
5409
 *
5410
 * This function copies the complete raster contents of one band to
5411
 * another similarly configured band.  The source and destination
5412
 * bands must have the same width and height.  The bands do not have
5413
 * to have the same data type.
5414
 *
5415
 * It implements efficient copying, in particular "chunking" the copy in
5416
 * substantial blocks.
5417
 *
5418
 * Currently the only papszOptions value supported are :
5419
 * <ul>
5420
 * <li>"COMPRESSED=YES" to force alignment on target dataset block sizes to
5421
 * achieve best compression.</li>
5422
 * <li>"SKIP_HOLES=YES" to skip chunks for which GDALGetDataCoverageStatus()
5423
 * returns GDAL_DATA_COVERAGE_STATUS_EMPTY (GDAL &gt;= 2.2)</li>
5424
 * </ul>
5425
 *
5426
 * @param hSrcBand the source band
5427
 * @param hDstBand the destination band
5428
 * @param papszOptions transfer hints in "StringList" Name=Value format.
5429
 * @param pfnProgress progress reporting function.
5430
 * @param pProgressData callback data for progress function.
5431
 *
5432
 * @return CE_None on success, or CE_Failure on failure.
5433
 */
5434

5435
CPLErr CPL_STDCALL GDALRasterBandCopyWholeRaster(
28✔
5436
    GDALRasterBandH hSrcBand, GDALRasterBandH hDstBand,
5437
    const char *const *const papszOptions, GDALProgressFunc pfnProgress,
5438
    void *pProgressData)
5439

5440
{
5441
    VALIDATE_POINTER1(hSrcBand, "GDALRasterBandCopyWholeRaster", CE_Failure);
28✔
5442
    VALIDATE_POINTER1(hDstBand, "GDALRasterBandCopyWholeRaster", CE_Failure);
28✔
5443

5444
    GDALRasterBand *poSrcBand = GDALRasterBand::FromHandle(hSrcBand);
28✔
5445
    GDALRasterBand *poDstBand = GDALRasterBand::FromHandle(hDstBand);
28✔
5446
    CPLErr eErr = CE_None;
28✔
5447

5448
    if (pfnProgress == nullptr)
28✔
5449
        pfnProgress = GDALDummyProgress;
2✔
5450

5451
    /* -------------------------------------------------------------------- */
5452
    /*      Confirm the datasets match in size and band counts.             */
5453
    /* -------------------------------------------------------------------- */
5454
    int nXSize = poSrcBand->GetXSize();
28✔
5455
    int nYSize = poSrcBand->GetYSize();
28✔
5456

5457
    if (poDstBand->GetXSize() != nXSize || poDstBand->GetYSize() != nYSize)
28✔
5458
    {
5459
        CPLError(CE_Failure, CPLE_AppDefined,
×
5460
                 "Input and output band sizes do not\n"
5461
                 "match in GDALRasterBandCopyWholeRaster()");
5462
        return CE_Failure;
×
5463
    }
5464

5465
    /* -------------------------------------------------------------------- */
5466
    /*      Report preliminary (0) progress.                                */
5467
    /* -------------------------------------------------------------------- */
5468
    if (!pfnProgress(0.0, nullptr, pProgressData))
28✔
5469
    {
5470
        CPLError(CE_Failure, CPLE_UserInterrupt,
×
5471
                 "User terminated CreateCopy()");
5472
        return CE_Failure;
×
5473
    }
5474

5475
    GDALDataType eDT = poDstBand->GetRasterDataType();
28✔
5476

5477
    // If the destination is compressed, we must try to write blocks just once,
5478
    // to save disk space (GTiff case for example), and to avoid data loss
5479
    // (JPEG compression for example).
5480
    bool bDstIsCompressed = false;
28✔
5481
    const char *pszDstCompressed =
5482
        CSLFetchNameValue(const_cast<char **>(papszOptions), "COMPRESSED");
28✔
5483
    if (pszDstCompressed != nullptr && CPLTestBool(pszDstCompressed))
28✔
5484
        bDstIsCompressed = true;
25✔
5485

5486
    /* -------------------------------------------------------------------- */
5487
    /*      What will our swath size be?                                    */
5488
    /* -------------------------------------------------------------------- */
5489

5490
    int nSwathCols = 0;
28✔
5491
    int nSwathLines = 0;
28✔
5492
    GDALCopyWholeRasterGetSwathSize(poSrcBand, poDstBand, 1, bDstIsCompressed,
28✔
5493
                                    FALSE, &nSwathCols, &nSwathLines);
5494

5495
    const int nPixelSize = GDALGetDataTypeSizeBytes(eDT);
28✔
5496

5497
    void *pSwathBuf = VSI_MALLOC3_VERBOSE(nSwathCols, nSwathLines, nPixelSize);
28✔
5498
    if (pSwathBuf == nullptr)
28✔
5499
    {
5500
        return CE_Failure;
×
5501
    }
5502

5503
    CPLDebug("GDAL", "GDALRasterBandCopyWholeRaster(): %d*%d swaths",
28✔
5504
             nSwathCols, nSwathLines);
5505

5506
    const bool bCheckHoles =
5507
        CPLTestBool(CSLFetchNameValueDef(papszOptions, "SKIP_HOLES", "NO"));
28✔
5508

5509
    // Advise the source raster that we are going to read it completely
5510
    poSrcBand->AdviseRead(0, 0, nXSize, nYSize, nXSize, nYSize, eDT, nullptr);
28✔
5511

5512
    /* ==================================================================== */
5513
    /*      Band oriented (uninterleaved) case.                             */
5514
    /* ==================================================================== */
5515

5516
    for (int iY = 0; iY < nYSize && eErr == CE_None; iY += nSwathLines)
70✔
5517
    {
5518
        int nThisLines = nSwathLines;
42✔
5519

5520
        if (iY + nThisLines > nYSize)
42✔
5521
            nThisLines = nYSize - iY;
8✔
5522

5523
        for (int iX = 0; iX < nXSize && eErr == CE_None; iX += nSwathCols)
84✔
5524
        {
5525
            int nThisCols = nSwathCols;
42✔
5526

5527
            if (iX + nThisCols > nXSize)
42✔
5528
                nThisCols = nXSize - iX;
×
5529

5530
            int nStatus = GDAL_DATA_COVERAGE_STATUS_DATA;
42✔
5531
            if (bCheckHoles)
42✔
5532
            {
5533
                nStatus = poSrcBand->GetDataCoverageStatus(
×
5534
                    iX, iY, nThisCols, nThisLines,
5535
                    GDAL_DATA_COVERAGE_STATUS_DATA);
5536
            }
5537
            if (nStatus & GDAL_DATA_COVERAGE_STATUS_DATA)
42✔
5538
            {
5539
                eErr = poSrcBand->RasterIO(GF_Read, iX, iY, nThisCols,
42✔
5540
                                           nThisLines, pSwathBuf, nThisCols,
5541
                                           nThisLines, eDT, 0, 0, nullptr);
5542

5543
                if (eErr == CE_None)
42✔
5544
                    eErr = poDstBand->RasterIO(GF_Write, iX, iY, nThisCols,
42✔
5545
                                               nThisLines, pSwathBuf, nThisCols,
5546
                                               nThisLines, eDT, 0, 0, nullptr);
5547
            }
5548

5549
            if (eErr == CE_None &&
84✔
5550
                !pfnProgress((iY + nThisLines) / static_cast<float>(nYSize),
42✔
5551
                             nullptr, pProgressData))
5552
            {
5553
                eErr = CE_Failure;
×
5554
                CPLError(CE_Failure, CPLE_UserInterrupt,
×
5555
                         "User terminated CreateCopy()");
5556
            }
5557
        }
5558
    }
5559

5560
    /* -------------------------------------------------------------------- */
5561
    /*      Cleanup                                                         */
5562
    /* -------------------------------------------------------------------- */
5563
    CPLFree(pSwathBuf);
28✔
5564

5565
    return eErr;
28✔
5566
}
5567

5568
/************************************************************************/
5569
/*                      GDALCopyRasterIOExtraArg ()                     */
5570
/************************************************************************/
5571

5572
void GDALCopyRasterIOExtraArg(GDALRasterIOExtraArg *psDestArg,
526,923✔
5573
                              GDALRasterIOExtraArg *psSrcArg)
5574
{
5575
    INIT_RASTERIO_EXTRA_ARG(*psDestArg);
526,923✔
5576
    if (psSrcArg)
526,923✔
5577
    {
5578
        psDestArg->eResampleAlg = psSrcArg->eResampleAlg;
526,923✔
5579
        psDestArg->pfnProgress = psSrcArg->pfnProgress;
526,923✔
5580
        psDestArg->pProgressData = psSrcArg->pProgressData;
526,923✔
5581
        psDestArg->bFloatingPointWindowValidity =
526,923✔
5582
            psSrcArg->bFloatingPointWindowValidity;
526,923✔
5583
        if (psSrcArg->bFloatingPointWindowValidity)
526,923✔
5584
        {
5585
            psDestArg->dfXOff = psSrcArg->dfXOff;
204,140✔
5586
            psDestArg->dfYOff = psSrcArg->dfYOff;
204,140✔
5587
            psDestArg->dfXSize = psSrcArg->dfXSize;
204,140✔
5588
            psDestArg->dfYSize = psSrcArg->dfYSize;
204,140✔
5589
        }
5590
        if (psSrcArg->nVersion >= 2)
526,923✔
5591
        {
5592
            psDestArg->bUseOnlyThisScale = psSrcArg->bUseOnlyThisScale;
526,923✔
5593
        }
5594
    }
5595
}
526,923✔
5596

5597
/************************************************************************/
5598
/*                         HasOnlyNoData()                              */
5599
/************************************************************************/
5600

5601
/** Generic nodata test: a sample matches when it compares equal to the
 * nodata value with operator==. */
template <class T> static inline bool IsEqualToNoData(T value, T noDataValue)
{
    const bool bMatches = (value == noDataValue);
    return bMatches;
}
5605

5606
/** GFloat16 nodata test: when the nodata value is NaN, a sample matches if it
 * is NaN too (operator== would never match); otherwise plain equality. */
template <> bool IsEqualToNoData<GFloat16>(GFloat16 value, GFloat16 noDataValue)
{
    // Unqualified calls so that an isnan() overload for GFloat16 can be
    // picked up alongside the std:: ones.
    using std::isnan;
    if (isnan(noDataValue))
        return isnan(value);
    return value == noDataValue;
}
5611

5612
/** float nodata test: when the nodata value is NaN, a sample matches if it is
 * NaN too (operator== would never match); otherwise plain equality. */
template <> bool IsEqualToNoData<float>(float value, float noDataValue)
{
    if (std::isnan(noDataValue))
        return std::isnan(value);
    return value == noDataValue;
}
5616

5617
/** double nodata test: when the nodata value is NaN, a sample matches if it
 * is NaN too (operator== would never match); otherwise plain equality. */
template <> bool IsEqualToNoData<double>(double value, double noDataValue)
{
    if (std::isnan(noDataValue))
        return std::isnan(value);
    return value == noDataValue;
}
5621

5622
/**
 * Check whether every sample of a window of a buffer equals the nodata value,
 * as decided by IsEqualToNoData().
 *
 * Samples are addressed as
 * pBuffer[(iY * nLineStride + iX) * nComponents + iBand], i.e. components of
 * a pixel are contiguous and nLineStride is expressed in pixels.
 *
 * @param pBuffer first sample of the window.
 * @param noDataValue value every sample is compared against.
 * @param nWidth number of pixels per line of the window.
 * @param nHeight number of lines of the window.
 * @param nLineStride distance, in pixels, between the start of two
 * consecutive lines.
 * @param nComponents number of samples per pixel.
 *
 * @return true if all samples match the nodata value (which is trivially the
 * case for an empty window), false as soon as one does not.
 */
template <class T>
static bool HasOnlyNoDataT(const T *pBuffer, T noDataValue, size_t nWidth,
                           size_t nHeight, size_t nLineStride,
                           size_t nComponents)
{
    // An empty window contains no sample that could differ from nodata.
    // Return before the probes below evaluate nWidth - 1 / nHeight - 1,
    // which would wrap around on unsigned zero and index far outside of the
    // buffer.
    if (nWidth == 0 || nHeight == 0 || nComponents == 0)
        return true;

    // Fast test: check the 4 corners and the middle pixel.
    // Pixel offsets, in probing order: top-left, top-right, middle,
    // bottom-left, bottom-right.
    const size_t anProbePixels[] = {
        0, nWidth - 1, (nHeight - 1) / 2 * nLineStride + (nWidth - 1) / 2,
        (nHeight - 1) * nLineStride, (nHeight - 1) * nLineStride + nWidth - 1};

    for (size_t iBand = 0; iBand < nComponents; iBand++)
    {
        for (const size_t nPixel : anProbePixels)
        {
            if (!IsEqualToNoData(pBuffer[nPixel * nComponents + iBand],
                                 noDataValue))
            {
                return false;
            }
        }
    }

    // Test all pixels.
    const size_t nSamplesPerLine = nWidth * nComponents;
    for (size_t iY = 0; iY < nHeight; iY++)
    {
        const T *pBufferLine = pBuffer + iY * nLineStride * nComponents;
        for (size_t iX = 0; iX < nSamplesPerLine; iX++)
        {
            if (!IsEqualToNoData(pBufferLine[iX], noDataValue))
            {
                return false;
            }
        }
    }
    return true;
}
5665

5666
/************************************************************************/
5667
/*                    GDALBufferHasOnlyNoData()                         */
5668
/************************************************************************/
5669

5670
bool GDALBufferHasOnlyNoData(const void *pBuffer, double dfNoDataValue,
42,606✔
5671
                             size_t nWidth, size_t nHeight, size_t nLineStride,
5672
                             size_t nComponents, int nBitsPerSample,
5673
                             GDALBufferSampleFormat nSampleFormat)
5674
{
5675
    // In the case where the nodata is 0, we can compare several bytes at
5676
    // once. Select the largest natural integer type for the architecture.
5677
#if SIZEOF_VOIDP >= 8 || defined(__x86_64__)
5678
    // We test __x86_64__ for x32 arch where SIZEOF_VOIDP == 4
5679
    typedef std::uint64_t WordType;
5680
#else
5681
    typedef std::uint32_t WordType;
5682
#endif
5683
    if (dfNoDataValue == 0.0 && nWidth == nLineStride &&
42,606✔
5684
        // Do not use this optimized code path for floating point numbers,
5685
        // as it can't detect negative zero.
5686
        nSampleFormat != GSF_FLOATING_POINT)
5687
    {
5688
        const GByte *pabyBuffer = static_cast<const GByte *>(pBuffer);
26,715✔
5689
        const size_t nSize =
26,715✔
5690
            (nWidth * nHeight * nComponents * nBitsPerSample + 7) / 8;
26,715✔
5691
        size_t i = 0;
26,715✔
5692
        const size_t nInitialIters =
5693
            std::min(sizeof(WordType) -
53,430✔
5694
                         static_cast<size_t>(
26,715✔
5695
                             reinterpret_cast<std::uintptr_t>(pabyBuffer) %
5696
                             sizeof(WordType)),
5697
                     nSize);
26,715✔
5698
        for (; i < nInitialIters; i++)
220,399✔
5699
        {
5700
            if (pabyBuffer[i])
198,058✔
5701
                return false;
4,374✔
5702
        }
5703
        for (; i + sizeof(WordType) - 1 < nSize; i += sizeof(WordType))
16,516,400✔
5704
        {
5705
            if (*(reinterpret_cast<const WordType *>(pabyBuffer + i)))
16,501,200✔
5706
                return false;
7,197✔
5707
        }
5708
        for (; i < nSize; i++)
52,533✔
5709
        {
5710
            if (pabyBuffer[i])
37,394✔
5711
                return false;
5✔
5712
        }
5713
        return true;
15,139✔
5714
    }
5715

5716
    if (nBitsPerSample == 8 && nSampleFormat == GSF_UNSIGNED_INT)
15,891✔
5717
    {
5718
        return GDALIsValueInRange<uint8_t>(dfNoDataValue) &&
22,272✔
5719
               HasOnlyNoDataT(static_cast<const uint8_t *>(pBuffer),
11,136✔
5720
                              static_cast<uint8_t>(dfNoDataValue), nWidth,
11,136✔
5721
                              nHeight, nLineStride, nComponents);
11,136✔
5722
    }
5723
    if (nBitsPerSample == 8 && nSampleFormat == GSF_SIGNED_INT)
4,755✔
5724
    {
5725
        // Use unsigned implementation by converting the nodatavalue to
5726
        // unsigned
5727
        return GDALIsValueInRange<int8_t>(dfNoDataValue) &&
63✔
5728
               HasOnlyNoDataT(
31✔
5729
                   static_cast<const uint8_t *>(pBuffer),
5730
                   static_cast<uint8_t>(static_cast<int8_t>(dfNoDataValue)),
31✔
5731
                   nWidth, nHeight, nLineStride, nComponents);
32✔
5732
    }
5733
    if (nBitsPerSample == 16 && nSampleFormat == GSF_UNSIGNED_INT)
4,723✔
5734
    {
5735
        return GDALIsValueInRange<uint16_t>(dfNoDataValue) &&
21✔
5736
               HasOnlyNoDataT(static_cast<const uint16_t *>(pBuffer),
10✔
5737
                              static_cast<uint16_t>(dfNoDataValue), nWidth,
10✔
5738
                              nHeight, nLineStride, nComponents);
11✔
5739
    }
5740
    if (nBitsPerSample == 16 && nSampleFormat == GSF_SIGNED_INT)
4,712✔
5741
    {
5742
        // Use unsigned implementation by converting the nodatavalue to
5743
        // unsigned
5744
        return GDALIsValueInRange<int16_t>(dfNoDataValue) &&
99✔
5745
               HasOnlyNoDataT(
49✔
5746
                   static_cast<const uint16_t *>(pBuffer),
5747
                   static_cast<uint16_t>(static_cast<int16_t>(dfNoDataValue)),
49✔
5748
                   nWidth, nHeight, nLineStride, nComponents);
50✔
5749
    }
5750
    if (nBitsPerSample == 32 && nSampleFormat == GSF_UNSIGNED_INT)
4,662✔
5751
    {
5752
        return GDALIsValueInRange<uint32_t>(dfNoDataValue) &&
73✔
5753
               HasOnlyNoDataT(static_cast<const uint32_t *>(pBuffer),
36✔
5754
                              static_cast<uint32_t>(dfNoDataValue), nWidth,
5755
                              nHeight, nLineStride, nComponents);
37✔
5756
    }
5757
    if (nBitsPerSample == 32 && nSampleFormat == GSF_SIGNED_INT)
4,625✔
5758
    {
5759
        // Use unsigned implementation by converting the nodatavalue to
5760
        // unsigned
5761
        return GDALIsValueInRange<int32_t>(dfNoDataValue) &&
23✔
5762
               HasOnlyNoDataT(
11✔
5763
                   static_cast<const uint32_t *>(pBuffer),
5764
                   static_cast<uint32_t>(static_cast<int32_t>(dfNoDataValue)),
11✔
5765
                   nWidth, nHeight, nLineStride, nComponents);
12✔
5766
    }
5767
    if (nBitsPerSample == 64 && nSampleFormat == GSF_UNSIGNED_INT)
4,613✔
5768
    {
5769
        return GDALIsValueInRange<uint64_t>(dfNoDataValue) &&
56✔
5770
               HasOnlyNoDataT(static_cast<const uint64_t *>(pBuffer),
28✔
5771
                              static_cast<uint64_t>(dfNoDataValue), nWidth,
5772
                              nHeight, nLineStride, nComponents);
28✔
5773
    }
5774
    if (nBitsPerSample == 64 && nSampleFormat == GSF_SIGNED_INT)
4,585✔
5775
    {
5776
        // Use unsigned implementation by converting the nodatavalue to
5777
        // unsigned
5778
        return GDALIsValueInRange<int64_t>(dfNoDataValue) &&
×
5779
               HasOnlyNoDataT(
×
5780
                   static_cast<const uint64_t *>(pBuffer),
5781
                   static_cast<uint64_t>(static_cast<int64_t>(dfNoDataValue)),
×
5782
                   nWidth, nHeight, nLineStride, nComponents);
×
5783
    }
5784
    if (nBitsPerSample == 16 && nSampleFormat == GSF_FLOATING_POINT)
4,585✔
5785
    {
5786
        return (std::isnan(dfNoDataValue) ||
×
5787
                GDALIsValueInRange<GFloat16>(dfNoDataValue)) &&
×
5788
               HasOnlyNoDataT(static_cast<const GFloat16 *>(pBuffer),
×
5789
                              static_cast<GFloat16>(dfNoDataValue), nWidth,
5790
                              nHeight, nLineStride, nComponents);
×
5791
    }
5792
    if (nBitsPerSample == 32 && nSampleFormat == GSF_FLOATING_POINT)
4,585✔
5793
    {
5794
        return (std::isnan(dfNoDataValue) ||
754✔
5795
                GDALIsValueInRange<float>(dfNoDataValue)) &&
1,507✔
5796
               HasOnlyNoDataT(static_cast<const float *>(pBuffer),
753✔
5797
                              static_cast<float>(dfNoDataValue), nWidth,
5798
                              nHeight, nLineStride, nComponents);
754✔
5799
    }
5800
    if (nBitsPerSample == 64 && nSampleFormat == GSF_FLOATING_POINT)
3,831✔
5801
    {
5802
        return HasOnlyNoDataT(static_cast<const double *>(pBuffer),
3,831✔
5803
                              dfNoDataValue, nWidth, nHeight, nLineStride,
5804
                              nComponents);
3,831✔
5805
    }
5806
    return false;
×
5807
}
5808

5809
#ifdef HAVE_SSE2
5810

5811
/************************************************************************/
5812
/*                    GDALDeinterleave3Byte()                           */
5813
/************************************************************************/
5814

5815
#if defined(__GNUC__) && !defined(__clang__)
5816
__attribute__((optimize("no-tree-vectorize")))
5817
#endif
5818
static void
5819
GDALDeinterleave3Byte(const GByte *CPL_RESTRICT pabySrc,
156,912✔
5820
                      GByte *CPL_RESTRICT pabyDest0,
5821
                      GByte *CPL_RESTRICT pabyDest1,
5822
                      GByte *CPL_RESTRICT pabyDest2, size_t nIters)
5823
#ifdef USE_NEON_OPTIMIZATIONS
5824
{
5825
    return GDALDeinterleave3Byte_SSSE3(pabySrc, pabyDest0, pabyDest1, pabyDest2,
5826
                                       nIters);
5827
}
5828
#else
5829
{
5830
#ifdef HAVE_SSSE3_AT_COMPILE_TIME
5831
    if (CPLHaveRuntimeSSSE3())
156,912✔
5832
    {
5833
        return GDALDeinterleave3Byte_SSSE3(pabySrc, pabyDest0, pabyDest1,
156,922✔
5834
                                           pabyDest2, nIters);
156,916✔
5835
    }
5836
#endif
5837

5838
    size_t i = 0;
1✔
5839
    if (((reinterpret_cast<uintptr_t>(pabySrc) |
1✔
5840
          reinterpret_cast<uintptr_t>(pabyDest0) |
1✔
5841
          reinterpret_cast<uintptr_t>(pabyDest1) |
1✔
5842
          reinterpret_cast<uintptr_t>(pabyDest2)) %
1✔
5843
         sizeof(unsigned int)) == 0)
5844
    {
5845
        // Slightly better than GCC autovectorizer
5846
        for (size_t j = 0; i + 3 < nIters; i += 4, ++j)
17✔
5847
        {
5848
            unsigned int word0 =
15✔
5849
                *reinterpret_cast<const unsigned int *>(pabySrc + 3 * i);
15✔
5850
            unsigned int word1 =
15✔
5851
                *reinterpret_cast<const unsigned int *>(pabySrc + 3 * i + 4);
15✔
5852
            unsigned int word2 =
15✔
5853
                *reinterpret_cast<const unsigned int *>(pabySrc + 3 * i + 8);
15✔
5854
            reinterpret_cast<unsigned int *>(pabyDest0)[j] =
15✔
5855
                (word0 & 0xff) | ((word0 >> 24) << 8) | (word1 & 0x00ff0000) |
15✔
5856
                ((word2 >> 8) << 24);
15✔
5857
            reinterpret_cast<unsigned int *>(pabyDest1)[j] =
15✔
5858
                ((word0 >> 8) & 0xff) | ((word1 & 0xff) << 8) |
15✔
5859
                (((word1 >> 24)) << 16) | ((word2 >> 16) << 24);
15✔
5860
            pabyDest2[j * 4] = static_cast<GByte>(word0 >> 16);
15✔
5861
            pabyDest2[j * 4 + 1] = static_cast<GByte>(word1 >> 8);
15✔
5862
            pabyDest2[j * 4 + 2] = static_cast<GByte>(word2);
15✔
5863
            pabyDest2[j * 4 + 3] = static_cast<GByte>(word2 >> 24);
15✔
5864
        }
5865
    }
5866
#if defined(__clang__)
5867
#pragma clang loop vectorize(disable)
5868
#endif
5869
    for (; i < nIters; ++i)
2✔
5870
    {
5871
        pabyDest0[i] = pabySrc[3 * i + 0];
1✔
5872
        pabyDest1[i] = pabySrc[3 * i + 1];
1✔
5873
        pabyDest2[i] = pabySrc[3 * i + 2];
1✔
5874
    }
5875
}
5876
#endif
5877

5878
/************************************************************************/
5879
/*                    GDALDeinterleave4Byte()                           */
5880
/************************************************************************/
5881

5882
#if !defined(__GNUC__) || defined(__clang__)
5883

5884
/************************************************************************/
5885
/*                         deinterleave()                               */
5886
/************************************************************************/
5887

5888
/** Extract one byte from each of 16 packed 32-bit words and return the
 *  16 bytes as a single vector.
 *
 *  @tparam SHIFT When true, the four input registers are first shifted right
 *                by 8 bits (per 32-bit word). The shift is applied to the
 *                caller's registers, so successive calls walk through the
 *                successive bytes of each word.
 *  @tparam MASK  When true, only the low byte of each 32-bit word is kept
 *                before packing. Callers may pass false when the upper 24
 *                bits are already zero.
 *
 *  @param xmm0_ori First group of four 32-bit words (modified if SHIFT).
 *  @param xmm1_ori Second group of four 32-bit words (modified if SHIFT).
 *  @param xmm2_ori Third group of four 32-bit words (modified if SHIFT).
 *  @param xmm3_ori Fourth group of four 32-bit words (modified if SHIFT).
 *  @return The selected byte of each of the 16 words, in input order.
 */
template <bool SHIFT, bool MASK>
inline __m128i deinterleave(__m128i &xmm0_ori, __m128i &xmm1_ori,
                            __m128i &xmm2_ori, __m128i &xmm3_ori)
{
    if constexpr (SHIFT)
    {
        // Bring the next byte of every 32-bit word into the low position.
        xmm0_ori = _mm_srli_epi32(xmm0_ori, 8);
        xmm1_ori = _mm_srli_epi32(xmm1_ori, 8);
        xmm2_ori = _mm_srli_epi32(xmm2_ori, 8);
        xmm3_ori = _mm_srli_epi32(xmm3_ori, 8);
    }

    __m128i xmmWords0 = xmm0_ori;
    __m128i xmmWords1 = xmm1_ori;
    __m128i xmmWords2 = xmm2_ori;
    __m128i xmmWords3 = xmm3_ori;
    if constexpr (MASK)
    {
        // Set higher 24bit of each int32 packed word to 0
        const __m128i xmmLowByteMask = _mm_set1_epi32(0xff);
        xmmWords0 = _mm_and_si128(xmmWords0, xmmLowByteMask);
        xmmWords1 = _mm_and_si128(xmmWords1, xmmLowByteMask);
        xmmWords2 = _mm_and_si128(xmmWords2, xmmLowByteMask);
        xmmWords3 = _mm_and_si128(xmmWords3, xmmLowByteMask);
    }

    // Pack int32 to int16
    const __m128i xmmHalves01 = _mm_packs_epi32(xmmWords0, xmmWords1);
    const __m128i xmmHalves23 = _mm_packs_epi32(xmmWords2, xmmWords3);

    // Pack int16 to uint8
    return _mm_packus_epi16(xmmHalves01, xmmHalves23);
}
5926

5927
static void GDALDeinterleave4Byte(const GByte *CPL_RESTRICT pabySrc,
5928
                                  GByte *CPL_RESTRICT pabyDest0,
5929
                                  GByte *CPL_RESTRICT pabyDest1,
5930
                                  GByte *CPL_RESTRICT pabyDest2,
5931
                                  GByte *CPL_RESTRICT pabyDest3, size_t nIters)
5932
#ifdef USE_NEON_OPTIMIZATIONS
5933
{
5934
    return GDALDeinterleave4Byte_SSSE3(pabySrc, pabyDest0, pabyDest1, pabyDest2,
5935
                                       pabyDest3, nIters);
5936
}
5937
#else
5938
{
5939
#ifdef HAVE_SSSE3_AT_COMPILE_TIME
5940
    if (CPLHaveRuntimeSSSE3())
5941
    {
5942
        return GDALDeinterleave4Byte_SSSE3(pabySrc, pabyDest0, pabyDest1,
5943
                                           pabyDest2, pabyDest3, nIters);
5944
    }
5945
#endif
5946

5947
    // Not the optimal SSE2-only code, as gcc auto-vectorizer manages to
5948
    // do something slightly better.
5949
    size_t i = 0;
5950
    for (; i + 15 < nIters; i += 16)
5951
    {
5952
        __m128i xmm0_ori = _mm_loadu_si128(
5953
            reinterpret_cast<__m128i const *>(pabySrc + 4 * i + 0));
5954
        __m128i xmm1_ori = _mm_loadu_si128(
5955
            reinterpret_cast<__m128i const *>(pabySrc + 4 * i + 16));
5956
        __m128i xmm2_ori = _mm_loadu_si128(
5957
            reinterpret_cast<__m128i const *>(pabySrc + 4 * i + 32));
5958
        __m128i xmm3_ori = _mm_loadu_si128(
5959
            reinterpret_cast<__m128i const *>(pabySrc + 4 * i + 48));
5960

5961
        _mm_storeu_si128(
5962
            reinterpret_cast<__m128i *>(pabyDest0 + i),
5963
            deinterleave<false, true>(xmm0_ori, xmm1_ori, xmm2_ori, xmm3_ori));
5964
        _mm_storeu_si128(
5965
            reinterpret_cast<__m128i *>(pabyDest1 + i),
5966
            deinterleave<true, true>(xmm0_ori, xmm1_ori, xmm2_ori, xmm3_ori));
5967
        _mm_storeu_si128(
5968
            reinterpret_cast<__m128i *>(pabyDest2 + i),
5969
            deinterleave<true, true>(xmm0_ori, xmm1_ori, xmm2_ori, xmm3_ori));
5970
        _mm_storeu_si128(
5971
            reinterpret_cast<__m128i *>(pabyDest3 + i),
5972
            deinterleave<true, false>(xmm0_ori, xmm1_ori, xmm2_ori, xmm3_ori));
5973
    }
5974

5975
#if defined(__clang__)
5976
#pragma clang loop vectorize(disable)
5977
#endif
5978
    for (; i < nIters; ++i)
5979
    {
5980
        pabyDest0[i] = pabySrc[4 * i + 0];
5981
        pabyDest1[i] = pabySrc[4 * i + 1];
5982
        pabyDest2[i] = pabySrc[4 * i + 2];
5983
        pabyDest3[i] = pabySrc[4 * i + 3];
5984
    }
5985
}
5986
#endif
5987
#else
5988
// GCC autovectorizer does an excellent job
5989
__attribute__((optimize("tree-vectorize"))) static void GDALDeinterleave4Byte(
61,592✔
5990
    const GByte *CPL_RESTRICT pabySrc, GByte *CPL_RESTRICT pabyDest0,
5991
    GByte *CPL_RESTRICT pabyDest1, GByte *CPL_RESTRICT pabyDest2,
5992
    GByte *CPL_RESTRICT pabyDest3, size_t nIters)
5993
{
5994
    for (size_t i = 0; i < nIters; ++i)
528,568,000✔
5995
    {
5996
        pabyDest0[i] = pabySrc[4 * i + 0];
528,507,000✔
5997
        pabyDest1[i] = pabySrc[4 * i + 1];
528,507,000✔
5998
        pabyDest2[i] = pabySrc[4 * i + 2];
528,507,000✔
5999
        pabyDest3[i] = pabySrc[4 * i + 3];
528,507,000✔
6000
    }
6001
}
61,592✔
6002
#endif
6003

6004
#else
6005

6006
/************************************************************************/
6007
/*                    GDALDeinterleave3Byte()                           */
6008
/************************************************************************/
6009

6010
// TODO: Enabling below could help on non-Intel architectures where GCC knows
6011
// how to auto-vectorize
6012
// #if defined(__GNUC__)
6013
//__attribute__((optimize("tree-vectorize")))
6014
// #endif
6015
static void GDALDeinterleave3Byte(const GByte *CPL_RESTRICT pabySrc,
6016
                                  GByte *CPL_RESTRICT pabyDest0,
6017
                                  GByte *CPL_RESTRICT pabyDest1,
6018
                                  GByte *CPL_RESTRICT pabyDest2, size_t nIters)
6019
{
6020
    for (size_t i = 0; i < nIters; ++i)
6021
    {
6022
        pabyDest0[i] = pabySrc[3 * i + 0];
6023
        pabyDest1[i] = pabySrc[3 * i + 1];
6024
        pabyDest2[i] = pabySrc[3 * i + 2];
6025
    }
6026
}
6027

6028
/************************************************************************/
6029
/*                    GDALDeinterleave4Byte()                           */
6030
/************************************************************************/
6031

6032
// TODO: Enabling below could help on non-Intel architectures where gcc knows
6033
// how to auto-vectorize
6034
// #if defined(__GNUC__)
6035
//__attribute__((optimize("tree-vectorize")))
6036
// #endif
6037
static void GDALDeinterleave4Byte(const GByte *CPL_RESTRICT pabySrc,
6038
                                  GByte *CPL_RESTRICT pabyDest0,
6039
                                  GByte *CPL_RESTRICT pabyDest1,
6040
                                  GByte *CPL_RESTRICT pabyDest2,
6041
                                  GByte *CPL_RESTRICT pabyDest3, size_t nIters)
6042
{
6043
    for (size_t i = 0; i < nIters; ++i)
6044
    {
6045
        pabyDest0[i] = pabySrc[4 * i + 0];
6046
        pabyDest1[i] = pabySrc[4 * i + 1];
6047
        pabyDest2[i] = pabySrc[4 * i + 2];
6048
        pabyDest3[i] = pabySrc[4 * i + 3];
6049
    }
6050
}
6051

6052
#endif
6053

6054
/************************************************************************/
6055
/*                      GDALDeinterleave()                              */
6056
/************************************************************************/
6057

6058
/*! Copy values from a pixel-interleave buffer to multiple per-component
6059
    buffers.
6060

6061
    In pseudo-code
6062
    \verbatim
6063
    for(size_t i = 0; i < nIters; ++i)
6064
        for(int iComp = 0; iComp < nComponents; iComp++ )
6065
            ppDestBuffer[iComp][i] = pSourceBuffer[nComponents * i + iComp]
6066
    \endverbatim
6067

6068
    The implementation is optimized for a few cases, like de-interleaving
6069
    of 3 or 4-components Byte buffers.
6070

6071
    \since GDAL 3.6
6072
 */
6073
void GDALDeinterleave(const void *pSourceBuffer, GDALDataType eSourceDT,
218,858✔
6074
                      int nComponents, void **ppDestBuffer,
6075
                      GDALDataType eDestDT, size_t nIters)
6076
{
6077
    if (eSourceDT == eDestDT)
218,858✔
6078
    {
6079
        if (eSourceDT == GDT_Byte || eSourceDT == GDT_Int8)
218,837✔
6080
        {
6081
            if (nComponents == 3)
218,516✔
6082
            {
6083
                const GByte *CPL_RESTRICT pabySrc =
156,914✔
6084
                    static_cast<const GByte *>(pSourceBuffer);
6085
                GByte *CPL_RESTRICT pabyDest0 =
156,914✔
6086
                    static_cast<GByte *>(ppDestBuffer[0]);
6087
                GByte *CPL_RESTRICT pabyDest1 =
156,914✔
6088
                    static_cast<GByte *>(ppDestBuffer[1]);
6089
                GByte *CPL_RESTRICT pabyDest2 =
156,914✔
6090
                    static_cast<GByte *>(ppDestBuffer[2]);
6091
                GDALDeinterleave3Byte(pabySrc, pabyDest0, pabyDest1, pabyDest2,
156,914✔
6092
                                      nIters);
6093
                return;
156,855✔
6094
            }
6095
            else if (nComponents == 4)
61,602✔
6096
            {
6097
                const GByte *CPL_RESTRICT pabySrc =
61,592✔
6098
                    static_cast<const GByte *>(pSourceBuffer);
6099
                GByte *CPL_RESTRICT pabyDest0 =
61,592✔
6100
                    static_cast<GByte *>(ppDestBuffer[0]);
6101
                GByte *CPL_RESTRICT pabyDest1 =
61,592✔
6102
                    static_cast<GByte *>(ppDestBuffer[1]);
6103
                GByte *CPL_RESTRICT pabyDest2 =
61,592✔
6104
                    static_cast<GByte *>(ppDestBuffer[2]);
6105
                GByte *CPL_RESTRICT pabyDest3 =
61,592✔
6106
                    static_cast<GByte *>(ppDestBuffer[3]);
6107
                GDALDeinterleave4Byte(pabySrc, pabyDest0, pabyDest1, pabyDest2,
61,592✔
6108
                                      pabyDest3, nIters);
6109
                return;
61,592✔
6110
            }
10✔
6111
        }
6112
#if ((defined(__GNUC__) && !defined(__clang__)) ||                             \
6113
     defined(__INTEL_CLANG_COMPILER)) &&                                       \
6114
    defined(HAVE_SSE2) && defined(HAVE_SSSE3_AT_COMPILE_TIME)
6115
        else if ((eSourceDT == GDT_Int16 || eSourceDT == GDT_UInt16) &&
642✔
6116
                 CPLHaveRuntimeSSSE3())
321✔
6117
        {
6118
            if (nComponents == 3)
321✔
6119
            {
6120
                const GUInt16 *CPL_RESTRICT panSrc =
126✔
6121
                    static_cast<const GUInt16 *>(pSourceBuffer);
6122
                GUInt16 *CPL_RESTRICT panDest0 =
126✔
6123
                    static_cast<GUInt16 *>(ppDestBuffer[0]);
6124
                GUInt16 *CPL_RESTRICT panDest1 =
126✔
6125
                    static_cast<GUInt16 *>(ppDestBuffer[1]);
6126
                GUInt16 *CPL_RESTRICT panDest2 =
126✔
6127
                    static_cast<GUInt16 *>(ppDestBuffer[2]);
6128
                GDALDeinterleave3UInt16_SSSE3(panSrc, panDest0, panDest1,
126✔
6129
                                              panDest2, nIters);
6130
                return;
126✔
6131
            }
6132
#if !defined(__INTEL_CLANG_COMPILER)
6133
            // ICC autovectorizer doesn't do a good job, at least with icx
6134
            // 2022.1.0.20220316
6135
            else if (nComponents == 4)
195✔
6136
            {
6137
                const GUInt16 *CPL_RESTRICT panSrc =
195✔
6138
                    static_cast<const GUInt16 *>(pSourceBuffer);
6139
                GUInt16 *CPL_RESTRICT panDest0 =
195✔
6140
                    static_cast<GUInt16 *>(ppDestBuffer[0]);
6141
                GUInt16 *CPL_RESTRICT panDest1 =
195✔
6142
                    static_cast<GUInt16 *>(ppDestBuffer[1]);
6143
                GUInt16 *CPL_RESTRICT panDest2 =
195✔
6144
                    static_cast<GUInt16 *>(ppDestBuffer[2]);
6145
                GUInt16 *CPL_RESTRICT panDest3 =
195✔
6146
                    static_cast<GUInt16 *>(ppDestBuffer[3]);
6147
                GDALDeinterleave4UInt16_SSSE3(panSrc, panDest0, panDest1,
195✔
6148
                                              panDest2, panDest3, nIters);
6149
                return;
195✔
6150
            }
6151
#endif
6152
        }
6153
#endif
6154
    }
6155

6156
    const int nSourceDTSize = GDALGetDataTypeSizeBytes(eSourceDT);
31✔
6157
    const int nDestDTSize = GDALGetDataTypeSizeBytes(eDestDT);
29✔
6158
    for (int iComp = 0; iComp < nComponents; iComp++)
108✔
6159
    {
6160
        GDALCopyWords64(static_cast<const GByte *>(pSourceBuffer) +
79✔
6161
                            iComp * nSourceDTSize,
79✔
6162
                        eSourceDT, nComponents * nSourceDTSize,
6163
                        ppDestBuffer[iComp], eDestDT, nDestDTSize, nIters);
79✔
6164
    }
6165
}
6166

6167
/************************************************************************/
6168
/*                    GDALTranspose2DSingleToSingle()                   */
6169
/************************************************************************/
6170
/**
6171
 * Transpose a 2D array of non-complex values, in a efficient (cache-oblivious) way.
6172
 *
6173
 * @param pSrc Source array of height = nSrcHeight and width = nSrcWidth.
6174
 * @param pDst Destination transposed array of height = nSrcWidth and width = nSrcHeight.
6175
 * @param nSrcWidth Width of pSrc array.
6176
 * @param nSrcHeight Height of pSrc array.
6177
 */
6178

6179
template <class DST, class SRC>
6180
void GDALTranspose2DSingleToSingle(const SRC *CPL_RESTRICT pSrc,
145✔
6181
                                   DST *CPL_RESTRICT pDst, size_t nSrcWidth,
6182
                                   size_t nSrcHeight)
6183
{
6184
    constexpr size_t blocksize = 32;
145✔
6185
    for (size_t i = 0; i < nSrcHeight; i += blocksize)
315✔
6186
    {
6187
        const size_t max_k = std::min(i + blocksize, nSrcHeight);
170✔
6188
        for (size_t j = 0; j < nSrcWidth; j += blocksize)
390✔
6189
        {
6190
            // transpose the block beginning at [i,j]
6191
            const size_t max_l = std::min(j + blocksize, nSrcWidth);
220✔
6192
            for (size_t k = i; k < max_k; ++k)
2,509✔
6193
            {
6194
                for (size_t l = j; l < max_l; ++l)
41,017✔
6195
                {
6196
                    GDALCopyWord(pSrc[l + k * nSrcWidth],
38,728✔
6197
                                 pDst[k + l * nSrcHeight]);
38,728✔
6198
                }
6199
            }
6200
        }
6201
    }
6202
}
145✔
6203

6204
/************************************************************************/
6205
/*                   GDALTranspose2DComplexToComplex()                  */
6206
/************************************************************************/
6207
/**
6208
 * Transpose a 2D array of complex values into an array of complex values,
6209
 * in a efficient (cache-oblivious) way.
6210
 *
6211
 * @param pSrc Source array of height = nSrcHeight and width = nSrcWidth.
6212
 * @param pDst Destination transposed array of height = nSrcWidth and width = nSrcHeight.
6213
 * @param nSrcWidth Width of pSrc array.
6214
 * @param nSrcHeight Height of pSrc array.
6215
 */
6216
template <class DST, class SRC>
6217
void GDALTranspose2DComplexToComplex(const SRC *CPL_RESTRICT pSrc,
25✔
6218
                                     DST *CPL_RESTRICT pDst, size_t nSrcWidth,
6219
                                     size_t nSrcHeight)
6220
{
6221
    constexpr size_t blocksize = 32;
25✔
6222
    for (size_t i = 0; i < nSrcHeight; i += blocksize)
50✔
6223
    {
6224
        const size_t max_k = std::min(i + blocksize, nSrcHeight);
25✔
6225
        for (size_t j = 0; j < nSrcWidth; j += blocksize)
50✔
6226
        {
6227
            // transpose the block beginning at [i,j]
6228
            const size_t max_l = std::min(j + blocksize, nSrcWidth);
25✔
6229
            for (size_t k = i; k < max_k; ++k)
75✔
6230
            {
6231
                for (size_t l = j; l < max_l; ++l)
200✔
6232
                {
6233
                    GDALCopyWord(pSrc[2 * (l + k * nSrcWidth) + 0],
150✔
6234
                                 pDst[2 * (k + l * nSrcHeight) + 0]);
150✔
6235
                    GDALCopyWord(pSrc[2 * (l + k * nSrcWidth) + 1],
150✔
6236
                                 pDst[2 * (k + l * nSrcHeight) + 1]);
150✔
6237
                }
6238
            }
6239
        }
6240
    }
6241
}
25✔
6242

6243
/************************************************************************/
6244
/*                   GDALTranspose2DComplexToSingle()                  */
6245
/************************************************************************/
6246
/**
6247
 * Transpose a 2D array of complex values into an array of non-complex values,
6248
 * in a efficient (cache-oblivious) way.
6249
 *
6250
 * @param pSrc Source array of height = nSrcHeight and width = nSrcWidth.
6251
 * @param pDst Destination transposed array of height = nSrcWidth and width = nSrcHeight.
6252
 * @param nSrcWidth Width of pSrc array.
6253
 * @param nSrcHeight Height of pSrc array.
6254
 */
6255
template <class DST, class SRC>
6256
void GDALTranspose2DComplexToSingle(const SRC *CPL_RESTRICT pSrc,
55✔
6257
                                    DST *CPL_RESTRICT pDst, size_t nSrcWidth,
6258
                                    size_t nSrcHeight)
6259
{
6260
    constexpr size_t blocksize = 32;
55✔
6261
    for (size_t i = 0; i < nSrcHeight; i += blocksize)
110✔
6262
    {
6263
        const size_t max_k = std::min(i + blocksize, nSrcHeight);
55✔
6264
        for (size_t j = 0; j < nSrcWidth; j += blocksize)
110✔
6265
        {
6266
            // transpose the block beginning at [i,j]
6267
            const size_t max_l = std::min(j + blocksize, nSrcWidth);
55✔
6268
            for (size_t k = i; k < max_k; ++k)
165✔
6269
            {
6270
                for (size_t l = j; l < max_l; ++l)
440✔
6271
                {
6272
                    GDALCopyWord(pSrc[2 * (l + k * nSrcWidth) + 0],
330✔
6273
                                 pDst[k + l * nSrcHeight]);
330✔
6274
                }
6275
            }
6276
        }
6277
    }
6278
}
55✔
6279

6280
/************************************************************************/
6281
/*                   GDALTranspose2DSingleToComplex()                  */
6282
/************************************************************************/
6283
/**
6284
 * Transpose a 2D array of non-complex values into an array of complex values,
6285
 * in a efficient (cache-oblivious) way.
6286
 *
6287
 * @param pSrc Source array of height = nSrcHeight and width = nSrcWidth.
6288
 * @param pDst Destination transposed array of height = nSrcWidth and width = nSrcHeight.
6289
 * @param nSrcWidth Width of pSrc array.
6290
 * @param nSrcHeight Height of pSrc array.
6291
 */
6292
template <class DST, class SRC>
6293
void GDALTranspose2DSingleToComplex(const SRC *CPL_RESTRICT pSrc,
55✔
6294
                                    DST *CPL_RESTRICT pDst, size_t nSrcWidth,
6295
                                    size_t nSrcHeight)
6296
{
6297
    constexpr size_t blocksize = 32;
55✔
6298
    for (size_t i = 0; i < nSrcHeight; i += blocksize)
110✔
6299
    {
6300
        const size_t max_k = std::min(i + blocksize, nSrcHeight);
55✔
6301
        for (size_t j = 0; j < nSrcWidth; j += blocksize)
110✔
6302
        {
6303
            // transpose the block beginning at [i,j]
6304
            const size_t max_l = std::min(j + blocksize, nSrcWidth);
55✔
6305
            for (size_t k = i; k < max_k; ++k)
165✔
6306
            {
6307
                for (size_t l = j; l < max_l; ++l)
440✔
6308
                {
6309
                    GDALCopyWord(pSrc[l + k * nSrcWidth],
330✔
6310
                                 pDst[2 * (k + l * nSrcHeight) + 0]);
330✔
6311
                    pDst[2 * (k + l * nSrcHeight) + 1] = 0;
330✔
6312
                }
6313
            }
6314
        }
6315
    }
6316
}
55✔
6317

6318
/************************************************************************/
6319
/*                        GDALTranspose2D()                             */
6320
/************************************************************************/
6321

6322
template <class DST, bool DST_IS_COMPLEX>
6323
static void GDALTranspose2D(const void *pSrc, GDALDataType eSrcType, DST *pDst,
280✔
6324
                            size_t nSrcWidth, size_t nSrcHeight)
6325
{
6326
#define CALL_GDALTranspose2D_internal(SRC_TYPE)                                \
6327
    do                                                                         \
6328
    {                                                                          \
6329
        if constexpr (DST_IS_COMPLEX)                                          \
6330
        {                                                                      \
6331
            GDALTranspose2DSingleToComplex(                                    \
6332
                static_cast<const SRC_TYPE *>(pSrc), pDst, nSrcWidth,          \
6333
                nSrcHeight);                                                   \
6334
        }                                                                      \
6335
        else                                                                   \
6336
        {                                                                      \
6337
            GDALTranspose2DSingleToSingle(static_cast<const SRC_TYPE *>(pSrc), \
6338
                                          pDst, nSrcWidth, nSrcHeight);        \
6339
        }                                                                      \
6340
    } while (0)
6341

6342
#define CALL_GDALTranspose2DComplex_internal(SRC_TYPE)                         \
6343
    do                                                                         \
6344
    {                                                                          \
6345
        if constexpr (DST_IS_COMPLEX)                                          \
6346
        {                                                                      \
6347
            GDALTranspose2DComplexToComplex(                                   \
6348
                static_cast<const SRC_TYPE *>(pSrc), pDst, nSrcWidth,          \
6349
                nSrcHeight);                                                   \
6350
        }                                                                      \
6351
        else                                                                   \
6352
        {                                                                      \
6353
            GDALTranspose2DComplexToSingle(                                    \
6354
                static_cast<const SRC_TYPE *>(pSrc), pDst, nSrcWidth,          \
6355
                nSrcHeight);                                                   \
6356
        }                                                                      \
6357
    } while (0)
6358

6359
    // clang-format off
6360
    switch (eSrcType)
280✔
6361
    {
6362
        case GDT_Byte:     CALL_GDALTranspose2D_internal(uint8_t); break;
16✔
6363
        case GDT_Int8:     CALL_GDALTranspose2D_internal(int8_t); break;
15✔
6364
        case GDT_UInt16:   CALL_GDALTranspose2D_internal(uint16_t); break;
24✔
6365
        case GDT_Int16:    CALL_GDALTranspose2D_internal(int16_t); break;
16✔
6366
        case GDT_UInt32:   CALL_GDALTranspose2D_internal(uint32_t); break;
24✔
6367
        case GDT_Int32:    CALL_GDALTranspose2D_internal(int32_t); break;
16✔
6368
        case GDT_UInt64:   CALL_GDALTranspose2D_internal(uint64_t); break;
16✔
6369
        case GDT_Int64:    CALL_GDALTranspose2D_internal(int64_t); break;
16✔
6370
        case GDT_Float16:  CALL_GDALTranspose2D_internal(GFloat16); break;
16✔
6371
        case GDT_Float32:  CALL_GDALTranspose2D_internal(float); break;
17✔
6372
        case GDT_Float64:  CALL_GDALTranspose2D_internal(double); break;
24✔
6373
        case GDT_CInt16:   CALL_GDALTranspose2DComplex_internal(int16_t); break;
16✔
6374
        case GDT_CInt32:   CALL_GDALTranspose2DComplex_internal(int32_t); break;
16✔
6375
        case GDT_CFloat16: CALL_GDALTranspose2DComplex_internal(GFloat16); break;
16✔
6376
        case GDT_CFloat32: CALL_GDALTranspose2DComplex_internal(float); break;
16✔
6377
        case GDT_CFloat64: CALL_GDALTranspose2DComplex_internal(double); break;
16✔
6378
        case GDT_Unknown:
×
6379
        case GDT_TypeCount:
6380
            break;
×
6381
    }
6382
        // clang-format on
6383

6384
#undef CALL_GDALTranspose2D_internal
6385
#undef CALL_GDALTranspose2DComplex_internal
6386
}
280✔
6387

6388
/************************************************************************/
6389
/*                      GDALInterleave2Byte()                           */
6390
/************************************************************************/
6391

6392
#if defined(HAVE_SSE2) &&                                                      \
6393
    (!defined(__GNUC__) || defined(__INTEL_CLANG_COMPILER))
6394

6395
// ICC autovectorizer doesn't do a good job at generating good SSE code,
6396
// at least with icx 2024.0.2.20231213, but it nicely unrolls the below loop.
6397
#if defined(__GNUC__)
6398
__attribute__((noinline))
6399
#endif
6400
static void
6401
GDALInterleave2Byte(const uint8_t *CPL_RESTRICT pSrc,
6402
                    uint8_t *CPL_RESTRICT pDst, size_t nIters)
6403
{
6404
    size_t i = 0;
6405
    constexpr size_t VALS_PER_ITER = 16;
6406
    for (i = 0; i + VALS_PER_ITER <= nIters; i += VALS_PER_ITER)
6407
    {
6408
        __m128i xmm0 =
6409
            _mm_loadu_si128(reinterpret_cast<__m128i const *>(pSrc + i));
6410
        __m128i xmm1 = _mm_loadu_si128(
6411
            reinterpret_cast<__m128i const *>(pSrc + i + nIters));
6412
        _mm_storeu_si128(reinterpret_cast<__m128i *>(pDst + 2 * i),
6413
                         _mm_unpacklo_epi8(xmm0, xmm1));
6414
        _mm_storeu_si128(
6415
            reinterpret_cast<__m128i *>(pDst + 2 * i + VALS_PER_ITER),
6416
            _mm_unpackhi_epi8(xmm0, xmm1));
6417
    }
6418
#if defined(__clang__)
6419
#pragma clang loop vectorize(disable)
6420
#endif
6421
    for (; i < nIters; ++i)
6422
    {
6423
        pDst[2 * i + 0] = pSrc[i + 0 * nIters];
6424
        pDst[2 * i + 1] = pSrc[i + 1 * nIters];
6425
    }
6426
}
6427

6428
#else
6429

6430
#if defined(__GNUC__) && !defined(__clang__)
6431
__attribute__((optimize("tree-vectorize")))
6432
#endif
6433
#if defined(__GNUC__)
6434
__attribute__((noinline))
6435
#endif
6436
#if defined(__clang__) && !defined(__INTEL_CLANG_COMPILER)
6437
// clang++ -O2 -fsanitize=undefined fails to vectorize, ignore that warning
6438
#pragma clang diagnostic push
6439
#pragma clang diagnostic ignored "-Wpass-failed"
6440
#endif
6441
static void
6442
GDALInterleave2Byte(const uint8_t *CPL_RESTRICT pSrc,
4✔
6443
                    uint8_t *CPL_RESTRICT pDst, size_t nIters)
6444
{
6445
#if defined(__clang__) && !defined(__INTEL_CLANG_COMPILER)
6446
#pragma clang loop vectorize(enable)
6447
#endif
6448
    for (size_t i = 0; i < nIters; ++i)
44✔
6449
    {
6450
        pDst[2 * i + 0] = pSrc[i + 0 * nIters];
40✔
6451
        pDst[2 * i + 1] = pSrc[i + 1 * nIters];
40✔
6452
    }
6453
}
4✔
6454
#if defined(__clang__) && !defined(__INTEL_CLANG_COMPILER)
6455
#pragma clang diagnostic pop
6456
#endif
6457

6458
#endif
6459

6460
/************************************************************************/
6461
/*                      GDALInterleave4Byte()                           */
6462
/************************************************************************/
6463

6464
#if defined(HAVE_SSE2) &&                                                      \
6465
    (!defined(__GNUC__) || defined(__INTEL_CLANG_COMPILER))
6466

6467
// ICC autovectorizer doesn't do a good job at generating good SSE code,
6468
// at least with icx 2024.0.2.20231213, but it nicely unrolls the below loop.
6469
#if defined(__GNUC__)
6470
__attribute__((noinline))
6471
#endif
6472
static void
6473
GDALInterleave4Byte(const uint8_t *CPL_RESTRICT pSrc,
6474
                    uint8_t *CPL_RESTRICT pDst, size_t nIters)
6475
{
6476
    size_t i = 0;
6477
    constexpr size_t VALS_PER_ITER = 16;
6478
    for (i = 0; i + VALS_PER_ITER <= nIters; i += VALS_PER_ITER)
6479
    {
6480
        __m128i xmm0 = _mm_loadu_si128(
6481
            reinterpret_cast<__m128i const *>(pSrc + i + 0 * nIters));
6482
        __m128i xmm1 = _mm_loadu_si128(
6483
            reinterpret_cast<__m128i const *>(pSrc + i + 1 * nIters));
6484
        __m128i xmm2 = _mm_loadu_si128(
6485
            reinterpret_cast<__m128i const *>(pSrc + i + 2 * nIters));
6486
        __m128i xmm3 = _mm_loadu_si128(
6487
            reinterpret_cast<__m128i const *>(pSrc + i + 3 * nIters));
6488
        auto tmp0 = _mm_unpacklo_epi8(
6489
            xmm0,
6490
            xmm1);  // (xmm0_0, xmm1_0, xmm0_1, xmm1_1, xmm0_2, xmm1_2, ...)
6491
        auto tmp1 = _mm_unpackhi_epi8(
6492
            xmm0,
6493
            xmm1);  // (xmm0_8, xmm1_8, xmm0_9, xmm1_9, xmm0_10, xmm1_10, ...)
6494
        auto tmp2 = _mm_unpacklo_epi8(
6495
            xmm2,
6496
            xmm3);  // (xmm2_0, xmm3_0, xmm2_1, xmm3_1, xmm2_2, xmm3_2, ...)
6497
        auto tmp3 = _mm_unpackhi_epi8(
6498
            xmm2,
6499
            xmm3);  // (xmm2_8, xmm3_8, xmm2_9, xmm3_9, xmm2_10, xmm3_10, ...)
6500
        auto tmp2_0 = _mm_unpacklo_epi16(
6501
            tmp0,
6502
            tmp2);  // (xmm0_0, xmm1_0, xmm2_0, xmm3_0, xmm0_1, xmm1_1, xmm2_1, xmm3_1, ...)
6503
        auto tmp2_1 = _mm_unpackhi_epi16(tmp0, tmp2);
6504
        auto tmp2_2 = _mm_unpacklo_epi16(tmp1, tmp3);
6505
        auto tmp2_3 = _mm_unpackhi_epi16(tmp1, tmp3);
6506
        _mm_storeu_si128(
6507
            reinterpret_cast<__m128i *>(pDst + 4 * i + 0 * VALS_PER_ITER),
6508
            tmp2_0);
6509
        _mm_storeu_si128(
6510
            reinterpret_cast<__m128i *>(pDst + 4 * i + 1 * VALS_PER_ITER),
6511
            tmp2_1);
6512
        _mm_storeu_si128(
6513
            reinterpret_cast<__m128i *>(pDst + 4 * i + 2 * VALS_PER_ITER),
6514
            tmp2_2);
6515
        _mm_storeu_si128(
6516
            reinterpret_cast<__m128i *>(pDst + 4 * i + 3 * VALS_PER_ITER),
6517
            tmp2_3);
6518
    }
6519
#if defined(__clang__)
6520
#pragma clang loop vectorize(disable)
6521
#endif
6522
    for (; i < nIters; ++i)
6523
    {
6524
        pDst[4 * i + 0] = pSrc[i + 0 * nIters];
6525
        pDst[4 * i + 1] = pSrc[i + 1 * nIters];
6526
        pDst[4 * i + 2] = pSrc[i + 2 * nIters];
6527
        pDst[4 * i + 3] = pSrc[i + 3 * nIters];
6528
    }
6529
}
6530

6531
#else
6532

6533
#if defined(__GNUC__) && !defined(__clang__)
6534
__attribute__((optimize("tree-vectorize")))
6535
#endif
6536
#if defined(__GNUC__)
6537
__attribute__((noinline))
6538
#endif
6539
#if defined(__clang__) && !defined(__INTEL_CLANG_COMPILER)
6540
// clang++ -O2 -fsanitize=undefined fails to vectorize, ignore that warning
6541
#pragma clang diagnostic push
6542
#pragma clang diagnostic ignored "-Wpass-failed"
6543
#endif
6544
static void
6545
GDALInterleave4Byte(const uint8_t *CPL_RESTRICT pSrc,
2✔
6546
                    uint8_t *CPL_RESTRICT pDst, size_t nIters)
6547
{
6548
#if defined(__clang__) && !defined(__INTEL_CLANG_COMPILER)
6549
#pragma clang loop vectorize(enable)
6550
#endif
6551
    for (size_t i = 0; i < nIters; ++i)
36✔
6552
    {
6553
        pDst[4 * i + 0] = pSrc[i + 0 * nIters];
34✔
6554
        pDst[4 * i + 1] = pSrc[i + 1 * nIters];
34✔
6555
        pDst[4 * i + 2] = pSrc[i + 2 * nIters];
34✔
6556
        pDst[4 * i + 3] = pSrc[i + 3 * nIters];
34✔
6557
    }
6558
}
2✔
6559
#if defined(__clang__) && !defined(__INTEL_CLANG_COMPILER)
6560
#pragma clang diagnostic pop
6561
#endif
6562

6563
#endif
6564

6565
/************************************************************************/
6566
/*                        GDALTranspose2D()                             */
6567
/************************************************************************/
6568

6569
/**
6570
 * Transpose a 2D array in a efficient (cache-oblivious) way.
6571
 *
6572
 * @param pSrc Source array of width = nSrcWidth and height = nSrcHeight.
6573
 * @param eSrcType Data type of pSrc.
6574
 * @param pDst Destination transposed array of width = nSrcHeight and height = nSrcWidth.
6575
 * @param eDstType Data type of pDst.
6576
 * @param nSrcWidth Width of pSrc array.
6577
 * @param nSrcHeight Height of pSrc array.
6578
 * @since GDAL 3.11
6579
 */
6580

6581
void GDALTranspose2D(const void *pSrc, GDALDataType eSrcType, void *pDst,
305✔
6582
                     GDALDataType eDstType, size_t nSrcWidth, size_t nSrcHeight)
6583
{
6584
    if (eSrcType == eDstType && (eSrcType == GDT_Byte || eSrcType == GDT_Int8))
305✔
6585
    {
6586
        if (nSrcHeight == 2)
25✔
6587
        {
6588
            GDALInterleave2Byte(static_cast<const uint8_t *>(pSrc),
4✔
6589
                                static_cast<uint8_t *>(pDst), nSrcWidth);
6590
            return;
4✔
6591
        }
6592
        if (nSrcHeight == 4)
21✔
6593
        {
6594
            GDALInterleave4Byte(static_cast<const uint8_t *>(pSrc),
2✔
6595
                                static_cast<uint8_t *>(pDst), nSrcWidth);
6596
            return;
2✔
6597
        }
6598
#if (defined(HAVE_SSSE3_AT_COMPILE_TIME) &&                                    \
6599
     (defined(__x86_64) || defined(_M_X64)))
6600
        if (CPLHaveRuntimeSSSE3())
19✔
6601
        {
6602
            GDALTranspose2D_Byte_SSSE3(static_cast<const uint8_t *>(pSrc),
19✔
6603
                                       static_cast<uint8_t *>(pDst), nSrcWidth,
6604
                                       nSrcHeight);
6605
            return;
19✔
6606
        }
6607
#elif defined(USE_NEON_OPTIMIZATIONS)
6608
        {
6609
            GDALTranspose2D_Byte_SSSE3(static_cast<const uint8_t *>(pSrc),
6610
                                       static_cast<uint8_t *>(pDst), nSrcWidth,
6611
                                       nSrcHeight);
6612
            return;
6613
        }
6614
#endif
6615
    }
6616

6617
#define CALL_GDALTranspose2D_internal(DST_TYPE, DST_IS_COMPLEX)                \
6618
    GDALTranspose2D<DST_TYPE, DST_IS_COMPLEX>(                                 \
6619
        pSrc, eSrcType, static_cast<DST_TYPE *>(pDst), nSrcWidth, nSrcHeight)
6620

6621
    // clang-format off
6622
    switch (eDstType)
280✔
6623
    {
6624
        case GDT_Byte:     CALL_GDALTranspose2D_internal(uint8_t, false); break;
15✔
6625
        case GDT_Int8:     CALL_GDALTranspose2D_internal(int8_t, false); break;
15✔
6626
        case GDT_UInt16:   CALL_GDALTranspose2D_internal(uint16_t, false); break;
24✔
6627
        case GDT_Int16:    CALL_GDALTranspose2D_internal(int16_t, false); break;
16✔
6628
        case GDT_UInt32:   CALL_GDALTranspose2D_internal(uint32_t, false); break;
24✔
6629
        case GDT_Int32:    CALL_GDALTranspose2D_internal(int32_t, false); break;
16✔
6630
        case GDT_UInt64:   CALL_GDALTranspose2D_internal(uint64_t, false); break;
16✔
6631
        case GDT_Int64:    CALL_GDALTranspose2D_internal(int64_t, false); break;
16✔
6632
        case GDT_Float16:  CALL_GDALTranspose2D_internal(GFloat16, false); break;
16✔
6633
        case GDT_Float32:  CALL_GDALTranspose2D_internal(float, false); break;
17✔
6634
        case GDT_Float64:  CALL_GDALTranspose2D_internal(double, false); break;
25✔
6635
        case GDT_CInt16:   CALL_GDALTranspose2D_internal(int16_t, true); break;
16✔
6636
        case GDT_CInt32:   CALL_GDALTranspose2D_internal(int32_t, true); break;
16✔
6637
        case GDT_CFloat16: CALL_GDALTranspose2D_internal(GFloat16, true); break;
16✔
6638
        case GDT_CFloat32: CALL_GDALTranspose2D_internal(float, true); break;
16✔
6639
        case GDT_CFloat64: CALL_GDALTranspose2D_internal(double, true); break;
16✔
6640
        case GDT_Unknown:
×
6641
        case GDT_TypeCount:
6642
            break;
×
6643
    }
6644
        // clang-format on
6645

6646
#undef CALL_GDALTranspose2D_internal
6647
}
6648

6649
/************************************************************************/
6650
/*                     ExtractBitAndConvertTo255()                      */
6651
/************************************************************************/
6652

6653
#if defined(__GNUC__) || defined(_MSC_VER)
6654
// Signedness of char implementation dependent, so be explicit.
6655
// Assumes 2-complement integer types and sign extension of right shifting
6656
// GCC guarantees such:
6657
// https://gcc.gnu.org/onlinedocs/gcc/Integers-implementation.html#Integers-implementation
6658
static inline GByte ExtractBitAndConvertTo255(GByte byVal, int nBit)
157,290✔
6659
{
6660
    return static_cast<GByte>(static_cast<signed char>(byVal << (7 - nBit)) >>
157,290✔
6661
                              7);
157,290✔
6662
}
6663
#else
6664
// Portable way
6665
static inline GByte ExtractBitAndConvertTo255(GByte byVal, int nBit)
6666
{
6667
    return (byVal & (1 << nBit)) ? 255 : 0;
6668
}
6669
#endif
6670

6671
/************************************************************************/
6672
/*                   ExpandEightPackedBitsToByteAt255()                 */
6673
/************************************************************************/
6674

6675
// Expands the 8 bits of byVal, from most significant to least significant,
// into abyOutput[0..7], each output byte being 255 if the bit is set and
// 0 otherwise.
static inline void ExpandEightPackedBitsToByteAt255(GByte byVal,
                                                    GByte abyOutput[8])
{
    for (int iOut = 0; iOut < 8; ++iOut)
    {
        // Output index 0 receives bit 7, index 7 receives bit 0
        abyOutput[iOut] = ExtractBitAndConvertTo255(byVal, 7 - iOut);
    }
}
19,457✔
6687

6688
/************************************************************************/
6689
/*                GDALExpandPackedBitsToByteAt0Or255()                  */
6690
/************************************************************************/
6691

6692
/** Expand packed-bits (ordered from most-significant bit to least one)
6693
  into a byte each, where a bit at 0 is expanded to a byte at 0, and a bit
6694
  at 1 to a byte at 255.
6695

6696
 The function does (in a possibly more optimized way) the following:
6697
 \code{.cpp}
6698
 for (size_t i = 0; i < nInputBits; ++i )
6699
 {
6700
     pabyOutput[i] = (pabyInput[i / 8] & (1 << (7 - (i % 8)))) ? 255 : 0;
6701
 }
6702
 \endcode
6703

6704
 @param pabyInput Input array of (nInputBits + 7) / 8 bytes.
6705
 @param pabyOutput Output array of nInputBits bytes.
6706
 @param nInputBits Number of valid bits in pabyInput.
6707

6708
 @since 3.11
6709
*/
6710

6711
void GDALExpandPackedBitsToByteAt0Or255(const GByte *CPL_RESTRICT pabyInput,
44,445✔
6712
                                        GByte *CPL_RESTRICT pabyOutput,
6713
                                        size_t nInputBits)
6714
{
6715
    const size_t nInputWholeBytes = nInputBits / 8;
44,445✔
6716
    size_t iByte = 0;
44,445✔
6717

6718
#ifdef HAVE_SSE2
6719
    // Mask to isolate each bit
6720
    const __m128i bit_mask = _mm_set_epi8(1, 2, 4, 8, 16, 32, 64, -128, 1, 2, 4,
44,445✔
6721
                                          8, 16, 32, 64, -128);
6722
    const __m128i zero = _mm_setzero_si128();
44,445✔
6723
    const __m128i all_ones = _mm_set1_epi8(-1);
44,445✔
6724
#ifdef __SSSE3__
6725
    const __m128i dispatch_two_bytes =
6726
        _mm_set_epi8(1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0);
6727
#endif
6728
    constexpr size_t SSE_REG_SIZE = sizeof(bit_mask);
44,445✔
6729
    for (; iByte + SSE_REG_SIZE <= nInputWholeBytes; iByte += SSE_REG_SIZE)
132,394✔
6730
    {
6731
        __m128i reg_ori = _mm_loadu_si128(
87,949✔
6732
            reinterpret_cast<const __m128i *>(pabyInput + iByte));
87,949✔
6733

6734
        constexpr int NUM_PROCESSED_BYTES_PER_REG = 2;
87,949✔
6735
        for (size_t k = 0; k < SSE_REG_SIZE / NUM_PROCESSED_BYTES_PER_REG; ++k)
791,541✔
6736
        {
6737
            // Given reg_ori = (A, B, ... 14 other bytes ...),
6738
            // expand to (A, A, A, A, A, A, A, A, B, B, B, B, B, B, B, B)
6739
#ifdef __SSSE3__
6740
            __m128i reg = _mm_shuffle_epi8(reg_ori, dispatch_two_bytes);
6741
#else
6742
            __m128i reg = _mm_unpacklo_epi8(reg_ori, reg_ori);
703,592✔
6743
            reg = _mm_unpacklo_epi16(reg, reg);
703,592✔
6744
            reg = _mm_unpacklo_epi32(reg, reg);
703,592✔
6745
#endif
6746

6747
            // Test if bits of interest are set
6748
            reg = _mm_and_si128(reg, bit_mask);
703,592✔
6749

6750
            // Now test if those bits are set, by comparing to zero. So the
6751
            // result will be that bytes where bits are set will be at 0, and
6752
            // ones where they are cleared will be at 0xFF. So the inverse of
6753
            // the end result we want!
6754
            reg = _mm_cmpeq_epi8(reg, zero);
703,592✔
6755

6756
            // Invert the result
6757
            reg = _mm_andnot_si128(reg, all_ones);
703,592✔
6758

6759
            _mm_storeu_si128(reinterpret_cast<__m128i *>(pabyOutput), reg);
6760

6761
            pabyOutput += SSE_REG_SIZE;
703,592✔
6762

6763
            // Right-shift of 2 bytes
6764
            reg_ori = _mm_bsrli_si128(reg_ori, NUM_PROCESSED_BYTES_PER_REG);
703,592✔
6765
        }
6766
    }
6767

6768
#endif  // HAVE_SSE2
6769

6770
    for (; iByte < nInputWholeBytes; ++iByte)
63,902✔
6771
    {
6772
        ExpandEightPackedBitsToByteAt255(pabyInput[iByte], pabyOutput);
19,457✔
6773
        pabyOutput += 8;
19,457✔
6774
    }
6775
    for (int iBit = 0; iBit < static_cast<int>(nInputBits % 8); ++iBit)
46,079✔
6776
    {
6777
        *pabyOutput = ExtractBitAndConvertTo255(pabyInput[iByte], 7 - iBit);
1,634✔
6778
        ++pabyOutput;
1,634✔
6779
    }
6780
}
44,445✔
6781

6782
/************************************************************************/
6783
/*                   ExpandEightPackedBitsToByteAt1()                   */
6784
/************************************************************************/
6785

6786
// Expands the 8 bits of byVal, from most significant to least significant,
// into abyOutput[0..7], each output byte being 1 if the bit is set and
// 0 otherwise.
static inline void ExpandEightPackedBitsToByteAt1(GByte byVal,
                                                  GByte abyOutput[8])
{
    for (int iOut = 0; iOut < 8; ++iOut)
    {
        // Output index 0 receives bit 7, index 7 receives bit 0
        const int nShift = 7 - iOut;
        abyOutput[iOut] = static_cast<GByte>((byVal >> nShift) & 0x1);
    }
}
136,113✔
6798

6799
/************************************************************************/
6800
/*                GDALExpandPackedBitsToByteAt0Or1()                    */
6801
/************************************************************************/
6802

6803
/** Expand packed-bits (ordered from most-significant bit to least one)
6804
  into a byte each, where a bit at 0 is expanded to a byte at 0, and a bit
6805
  at 1 to a byte at 1.
6806

6807
 The function does (in a possibly more optimized way) the following:
6808
 \code{.cpp}
6809
 for (size_t i = 0; i < nInputBits; ++i )
6810
 {
6811
     pabyOutput[i] = (pabyInput[i / 8] & (1 << (7 - (i % 8)))) ? 1 : 0;
6812
 }
6813
 \endcode
6814

6815
 @param pabyInput Input array of (nInputBits + 7) / 8 bytes.
6816
 @param pabyOutput Output array of nInputBits bytes.
6817
 @param nInputBits Number of valid bits in pabyInput.
6818

6819
 @since 3.11
6820
*/
6821

6822
void GDALExpandPackedBitsToByteAt0Or1(const GByte *CPL_RESTRICT pabyInput,
7,041✔
6823
                                      GByte *CPL_RESTRICT pabyOutput,
6824
                                      size_t nInputBits)
6825
{
6826
    const size_t nInputWholeBytes = nInputBits / 8;
7,041✔
6827
    size_t iByte = 0;
7,041✔
6828
    for (; iByte < nInputWholeBytes; ++iByte)
143,154✔
6829
    {
6830
        ExpandEightPackedBitsToByteAt1(pabyInput[iByte], pabyOutput);
136,113✔
6831
        pabyOutput += 8;
136,113✔
6832
    }
6833
    for (int iBit = 0; iBit < static_cast<int>(nInputBits % 8); ++iBit)
18,902✔
6834
    {
6835
        *pabyOutput = (pabyInput[iByte] >> (7 - iBit)) & 0x1;
11,861✔
6836
        ++pabyOutput;
11,861✔
6837
    }
6838
}
7,041✔
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2025 Coveralls, Inc