• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

OSGeo / gdal / 13872211292

15 Mar 2025 11:00AM UTC coverage: 70.445% (+0.009%) from 70.436%
13872211292

Pull #11951

github

web-flow
Merge 643845942 into bb4e0ed67
Pull Request #11951: Doc: Build docs using CMake

553795 of 786140 relevant lines covered (70.44%)

221892.63 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

91.5
/gcore/rasterio.cpp
1
/******************************************************************************
2
 *
3
 * Project:  GDAL Core
4
 * Purpose:  Contains default implementation of GDALRasterBand::IRasterIO()
5
 *           and supporting functions of broader utility.
6
 * Author:   Frank Warmerdam, warmerdam@pobox.com
7
 *
8
 ******************************************************************************
9
 * Copyright (c) 1998, Frank Warmerdam
10
 * Copyright (c) 2007-2014, Even Rouault <even dot rouault at spatialys.com>
11
 *
12
 * SPDX-License-Identifier: MIT
13
 ****************************************************************************/
14

15
#include "cpl_port.h"
16
#include "gdal.h"
17
#include "gdal_priv.h"
18

19
#include <cassert>
20
#include <climits>
21
#include <cmath>
22
#include <cstddef>
23
#include <cstdio>
24
#include <cstdlib>
25
#include <cstring>
26

27
#include <algorithm>
28
#include <limits>
29
#include <stdexcept>
30
#include <type_traits>
31

32
#include "cpl_conv.h"
33
#include "cpl_cpu_features.h"
34
#include "cpl_error.h"
35
#include "cpl_float.h"
36
#include "cpl_progress.h"
37
#include "cpl_string.h"
38
#include "cpl_vsi.h"
39
#include "gdal_priv_templates.hpp"
40
#include "gdal_vrt.h"
41
#include "gdalwarper.h"
42
#include "memdataset.h"
43
#include "vrtdataset.h"
44

45
#if defined(__x86_64) || defined(_M_X64)
46
#include <emmintrin.h>
47
#define HAVE_SSE2
48
#elif defined(USE_NEON_OPTIMIZATIONS)
49
#include "include_sse2neon.h"
50
#define HAVE_SSE2
51
#endif
52

53
#ifdef HAVE_SSSE3_AT_COMPILE_TIME
54
#include "rasterio_ssse3.h"
55
#ifdef __SSSE3__
56
#include <tmmintrin.h>
57
#endif
58
#endif
59

60
static void GDALFastCopyByte(const GByte *CPL_RESTRICT pSrcData,
61
                             int nSrcPixelStride, GByte *CPL_RESTRICT pDstData,
62
                             int nDstPixelStride, GPtrDiff_t nWordCount);
63

64
/************************************************************************/
65
/*                    DownsamplingIntegerXFactor()                      */
66
/************************************************************************/
67

68
// Copies one row of samples from the cached blocks of poBand into
// pabyDstData, taking one source pixel every nSrcXInc columns.
//
// Template parameters:
//   bSameDataType   when true, each sample is copied verbatim (memcpy /
//                   GDALFastCopyByte) using DATA_TYPE_SIZE bytes per sample;
//                   when false, samples go through GDALCopyWords64() from
//                   eDataType to eBufType.
//   DATA_TYPE_SIZE  byte size of a sample, only used when bSameDataType is
//                   true. A value of 1 selects the GDALFastCopyByte() path.
//
// Parameters:
//   poBand         band whose blocks are fetched with GetLockedBlockRef()
//                  and whose GetXSize() bounds the last sample.
//   iSrcX          source column of the first sample to copy.
//   nSrcXInc       step, in source columns, between consecutive samples.
//   iSrcOffsetCst  constant (in pixels) added to (iSrcX - nStartBlockX) to
//                  locate the sample inside the block buffer
//                  (presumably the row offset inside the block - confirm
//                  against the caller).
//   pabyDstData    destination of the first sample.
//   nPixelSpace    byte stride between consecutive destination samples.
//   nBufXSize      drives the number of samples written: the loop budget is
//                  nBufXSize - 1 plus the initial copy.
//   eDataType      data type of the source block (also gives the sample size
//                  when bSameDataType is false).
//   eBufType       data type of the destination, used by the conversion path.
//   nStartBlockX   [in,out] source column where the block currently held in
//                  poBlock starts; updated each time a block is reloaded.
//   nBlockXSize    block width in pixels.
//   poBlock        [in,out] currently locked block. When another block is
//                  needed, the lock on the previous one is dropped and
//                  poBlock is replaced by the newly locked block.
//   nLBlockY       block row index passed to GetLockedBlockRef().
//
// Returns false as soon as GetLockedBlockRef() returns nullptr (poBlock is
// then nullptr), true otherwise.
//
// Control flow note: the labels reload_block and no_reload_block live inside
// the body of the while loop. They are entered by goto both before the loop
// (for the first sample) and after it (for the last sample), so that the
// "load block / compute source pointer / copy run" code is shared.
template <bool bSameDataType, int DATA_TYPE_SIZE>
69
static bool DownsamplingIntegerXFactor(
413,231✔
70
    GDALRasterBand *poBand, int iSrcX, int nSrcXInc, GPtrDiff_t iSrcOffsetCst,
71
    GByte *CPL_RESTRICT pabyDstData, int nPixelSpace, int nBufXSize,
72
    GDALDataType eDataType, GDALDataType eBufType, int &nStartBlockX,
73
    int nBlockXSize, GDALRasterBlock *&poBlock, int nLBlockY)
74
{
75
    // Bytes per source sample: compile-time constant when types match.
    const int nBandDataSize =
413,231✔
76
        bSameDataType ? DATA_TYPE_SIZE : GDALGetDataTypeSizeBytes(eDataType);
77
    // Remaining sample budget. It is decremented both by the while condition
    // and by the run-copy code below; reaching exactly 0 after the loop
    // triggers the clamped last sample.
    int nOuterLoopIters = nBufXSize - 1;
413,231✔
78
    // Byte distance between two consecutive samples in the source block.
    const int nIncSrcOffset = nSrcXInc * nBandDataSize;
413,231✔
79
    // Left uninitialized on purpose: always assigned at no_reload_block
    // before being read.
    const GByte *CPL_RESTRICT pabySrcData;
80
    // First source column that is past the block currently described by
    // nStartBlockX.
    int nEndBlockX = nBlockXSize + nStartBlockX;
413,231✔
81

82
    // First sample: jump straight into the loop body, skipping the
    // per-iteration increments. Reuse the caller's block if iSrcX is below
    // its end column, otherwise load the right one.
    if (iSrcX < nEndBlockX)
413,231✔
83
    {
84
        CPLAssert(poBlock);
226,130✔
85
        goto no_reload_block;
226,130✔
86
    }
87
    goto reload_block;
187,101✔
88

89
    // Don't do the last iteration in the loop, as iSrcX might go beyond
90
    // nRasterXSize - 1
91
    while (--nOuterLoopIters >= 1)
932,842✔
92
    {
93
        // Step to the next sample in both source and destination.
        iSrcX += nSrcXInc;
189,034✔
94
        pabySrcData += nIncSrcOffset;
189,034✔
95
        pabyDstData += nPixelSpace;
189,034✔
96

97
        /* --------------------------------------------------------------------
98
         */
99
        /*      Ensure we have the appropriate block loaded. */
100
        /* --------------------------------------------------------------------
101
         */
102
        if (iSrcX >= nEndBlockX)
189,034✔
103
        {
104
        // Entry point used by the gotos before and after the loop when the
        // block containing iSrcX is not the one currently held.
        reload_block:
189,034✔
105
        {
106
            // Block column containing iSrcX and its pixel extent.
            const int nLBlockX = iSrcX / nBlockXSize;
388,725✔
107
            nStartBlockX = nLBlockX * nBlockXSize;
388,725✔
108
            nEndBlockX = nStartBlockX + nBlockXSize;
388,725✔
109

110
            // Release the previous block before locking the next one.
            if (poBlock != nullptr)
388,725✔
111
                poBlock->DropLock();
316,739✔
112

113
            poBlock = poBand->GetLockedBlockRef(nLBlockX, nLBlockY, FALSE);
388,725✔
114
            if (poBlock == nullptr)
388,725✔
115
            {
116
                return false;
1✔
117
            }
118
        }
119

120
        // Entry point used when poBlock already holds the block containing
        // iSrcX. Recomputes the source pointer from scratch, so any earlier
        // value of pabySrcData is irrelevant here.
        no_reload_block:
388,724✔
121
            const GByte *pabySrcBlock =
122
                static_cast<const GByte *>(poBlock->GetDataRef());
932,842✔
123
            GPtrDiff_t iSrcOffset =
932,842✔
124
                (iSrcX - nStartBlockX + iSrcOffsetCst) * nBandDataSize;
932,842✔
125
            pabySrcData = pabySrcBlock + iSrcOffset;
932,842✔
126
        }
127

128
        /* --------------------------------------------------------------------
129
         */
130
        /*      Copy the maximum run of pixels. */
131
        /* --------------------------------------------------------------------
132
         */
133

134
        // Number of samples, starting at iSrcX and stepping by nSrcXInc,
        // whose source column is still below nEndBlockX (rounded-up
        // division), capped by the remaining budget. May be <= 0 when the
        // budget is exhausted; one sample is still copied in that case.
        const int nIters = std::min(
932,842✔
135
            (nEndBlockX - iSrcX + (nSrcXInc - 1)) / nSrcXInc, nOuterLoopIters);
932,842✔
136
        if (bSameDataType)
137
        {
138
            // Always copy the sample at the current position.
            memcpy(pabyDstData, pabySrcData, nBandDataSize);
932,437✔
139
            if (nIters > 1)
932,437✔
140
            {
141
                if (DATA_TYPE_SIZE == 1)
142
                {
143
                    // Move to the second sample of the run, let
                    // GDALFastCopyByte() handle the nIters - 1 remaining
                    // samples, then advance by nIters - 2 more strides so the
                    // pointers end on the last sample of the run
                    // (presumably GDALFastCopyByte copies nWordCount strided
                    // bytes - its body is not in view here).
                    pabySrcData += nIncSrcOffset;
276,282✔
144
                    pabyDstData += nPixelSpace;
276,282✔
145
                    GDALFastCopyByte(pabySrcData, nIncSrcOffset, pabyDstData,
276,282✔
146
                                     nPixelSpace, nIters - 1);
276,282✔
147
                    pabySrcData +=
276,282✔
148
                        static_cast<GPtrDiff_t>(nIncSrcOffset) * (nIters - 2);
276,282✔
149
                    pabyDstData +=
276,282✔
150
                        static_cast<GPtrDiff_t>(nPixelSpace) * (nIters - 2);
276,282✔
151
                }
152
                else
153
                {
154
                    // Generic same-type path: one memcpy per remaining
                    // sample, leaving the pointers on the last one copied.
                    for (int i = 0; i < nIters - 1; i++)
4,443,828✔
155
                    {
156
                        pabySrcData += nIncSrcOffset;
4,245,254✔
157
                        pabyDstData += nPixelSpace;
4,245,254✔
158
                        memcpy(pabyDstData, pabySrcData, nBandDataSize);
4,245,254✔
159
                    }
160
                }
161
                // Account for the extra nIters - 1 samples copied in this
                // run; the loop header handles the remaining one.
                iSrcX += nSrcXInc * (nIters - 1);
474,856✔
162
                nOuterLoopIters -= nIters - 1;
474,856✔
163
            }
164
        }
165
        else
166
        {
167
            // Type to type conversion ...
168
            // std::max(1, nIters) guarantees the current sample is converted
            // even when nIters is <= 0.
            GDALCopyWords64(pabySrcData, eDataType, nIncSrcOffset, pabyDstData,
405✔
169
                            eBufType, nPixelSpace, std::max(1, nIters));
405✔
170
            if (nIters > 1)
405✔
171
            {
172
                // Leave pointers and counters on the last sample of the run,
                // mirroring the same-type branch above.
                pabySrcData +=
198✔
173
                    static_cast<GPtrDiff_t>(nIncSrcOffset) * (nIters - 1);
198✔
174
                pabyDstData +=
198✔
175
                    static_cast<GPtrDiff_t>(nPixelSpace) * (nIters - 1);
198✔
176
                iSrcX += nSrcXInc * (nIters - 1);
198✔
177
                nOuterLoopIters -= nIters - 1;
198✔
178
            }
179
        }
180
    }
181

182
    // Deal with last iteration to avoid iSrcX to go beyond nRasterXSize - 1
183
    // The counter is exactly 0 only the first time the loop exits with one
    // sample still owed; after that sample is copied via the goto below, the
    // loop condition drives it negative and the function returns.
    if (nOuterLoopIters == 0)
743,808✔
184
    {
185
        const int nRasterXSize = poBand->GetXSize();
330,578✔
186
        // Advance by one step, computed in 64 bits, and clamp to the last
        // valid column of the raster.
        iSrcX =
330,578✔
187
            static_cast<int>(std::min(static_cast<GInt64>(iSrcX) + nSrcXInc,
661,156✔
188
                                      static_cast<GInt64>(nRasterXSize - 1)));
330,578✔
189
        pabyDstData += nPixelSpace;
330,578✔
190
        // pabySrcData is not advanced here: it is recomputed from iSrcX at
        // no_reload_block.
        if (iSrcX < nEndBlockX)
330,578✔
191
        {
192
            goto no_reload_block;
317,988✔
193
        }
194
        goto reload_block;
12,590✔
195
    }
196
    return true;
413,230✔
197
}
198

199
/************************************************************************/
200
/*                             IRasterIO()                              */
201
/*                                                                      */
202
/*      Default internal implementation of RasterIO() ... utilizes      */
203
/*      the Block access methods to satisfy the request.  This would    */
204
/*      normally only be overridden by formats with overviews.          */
205
/************************************************************************/
206

207
CPLErr GDALRasterBand::IRasterIO(GDALRWFlag eRWFlag, int nXOff, int nYOff,
5,717,340✔
208
                                 int nXSize, int nYSize, void *pData,
209
                                 int nBufXSize, int nBufYSize,
210
                                 GDALDataType eBufType, GSpacing nPixelSpace,
211
                                 GSpacing nLineSpace,
212
                                 GDALRasterIOExtraArg *psExtraArg)
213

214
{
215
    if (eRWFlag == GF_Write && eFlushBlockErr != CE_None)
5,717,340✔
216
    {
217
        CPLError(eFlushBlockErr, CPLE_AppDefined,
×
218
                 "An error occurred while writing a dirty block "
219
                 "from GDALRasterBand::IRasterIO");
220
        CPLErr eErr = eFlushBlockErr;
×
221
        eFlushBlockErr = CE_None;
×
222
        return eErr;
×
223
    }
224
    if (nBlockXSize <= 0 || nBlockYSize <= 0)
5,717,340✔
225
    {
226
        CPLError(CE_Failure, CPLE_AppDefined, "Invalid block size");
139✔
227
        return CE_Failure;
×
228
    }
229

230
    const int nBandDataSize = GDALGetDataTypeSizeBytes(eDataType);
5,717,200✔
231
    const int nBufDataSize = GDALGetDataTypeSizeBytes(eBufType);
5,717,220✔
232
    GByte dummyBlock[2] = {0, 0};
5,717,230✔
233
    GByte *pabySrcBlock =
5,717,230✔
234
        dummyBlock; /* to avoid Coverity warning about nullptr dereference */
235
    GDALRasterBlock *poBlock = nullptr;
5,717,230✔
236
    const bool bUseIntegerRequestCoords =
5,717,230✔
237
        (!psExtraArg->bFloatingPointWindowValidity ||
5,757,060✔
238
         (nXOff == psExtraArg->dfXOff && nYOff == psExtraArg->dfYOff &&
39,829✔
239
          nXSize == psExtraArg->dfXSize && nYSize == psExtraArg->dfYSize));
16,457✔
240

241
    /* ==================================================================== */
242
    /*      A common case is the data requested with the destination        */
243
    /*      is packed, and the block width is the raster width.             */
244
    /* ==================================================================== */
245
    if (nPixelSpace == nBufDataSize && nLineSpace == nPixelSpace * nXSize &&
5,635,340✔
246
        nBlockXSize == GetXSize() && nBufXSize == nXSize &&
2,955,580✔
247
        nBufYSize == nYSize && bUseIntegerRequestCoords)
11,352,600✔
248
    {
249
        CPLErr eErr = CE_None;
2,823,200✔
250
        int nLBlockY = -1;
2,823,200✔
251

252
        for (int iBufYOff = 0; iBufYOff < nBufYSize; iBufYOff++)
8,265,180✔
253
        {
254
            const int iSrcY = iBufYOff + nYOff;
5,442,950✔
255

256
            if (iSrcY < nLBlockY * nBlockYSize ||
5,442,950✔
257
                iSrcY - nBlockYSize >= nLBlockY * nBlockYSize)
5,442,780✔
258
            {
259
                nLBlockY = iSrcY / nBlockYSize;
3,074,120✔
260
                bool bJustInitialize =
3,074,120✔
261
                    eRWFlag == GF_Write && nXOff == 0 &&
99,598✔
262
                    nXSize == nBlockXSize && nYOff <= nLBlockY * nBlockYSize &&
3,227,060✔
263
                    nYOff + nYSize - nBlockYSize >= nLBlockY * nBlockYSize;
53,342✔
264

265
                // Is this a partial tile at right and/or bottom edges of
266
                // the raster, and that is going to be completely written?
267
                // If so, do not load it from storage, but zero it so that
268
                // the content outside of the validity area is initialized.
269
                bool bMemZeroBuffer = false;
3,074,120✔
270
                if (eRWFlag == GF_Write && !bJustInitialize && nXOff == 0 &&
99,598✔
271
                    nXSize == nBlockXSize && nYOff <= nLBlockY * nBlockYSize &&
21,969✔
272
                    nYOff + nYSize == GetYSize() &&
3,173,810✔
273
                    nLBlockY * nBlockYSize > GetYSize() - nBlockYSize)
89✔
274
                {
275
                    bJustInitialize = true;
89✔
276
                    bMemZeroBuffer = true;
89✔
277
                }
278

279
                if (poBlock)
3,074,120✔
280
                    poBlock->DropLock();
250,909✔
281

282
                const GUInt32 nErrorCounter = CPLGetErrorCounter();
3,074,120✔
283
                poBlock = GetLockedBlockRef(0, nLBlockY, bJustInitialize);
3,074,080✔
284
                if (poBlock == nullptr)
3,074,210✔
285
                {
286
                    if (strstr(CPLGetLastErrorMsg(), "IReadBlock failed") ==
1,067✔
287
                        nullptr)
288
                    {
289
                        CPLError(CE_Failure, CPLE_AppDefined,
×
290
                                 "GetBlockRef failed at X block offset %d, "
291
                                 "Y block offset %d%s",
292
                                 0, nLBlockY,
293
                                 (nErrorCounter != CPLGetErrorCounter())
×
294
                                     ? CPLSPrintf(": %s", CPLGetLastErrorMsg())
×
295
                                     : "");
296
                    }
297
                    eErr = CE_Failure;
1,067✔
298
                    break;
1,067✔
299
                }
300

301
                if (eRWFlag == GF_Write)
3,073,140✔
302
                    poBlock->MarkDirty();
99,598✔
303

304
                pabySrcBlock = static_cast<GByte *>(poBlock->GetDataRef());
3,073,140✔
305
                if (bMemZeroBuffer)
3,073,130✔
306
                {
307
                    memset(pabySrcBlock, 0,
89✔
308
                           static_cast<GPtrDiff_t>(nBandDataSize) *
89✔
309
                               nBlockXSize * nBlockYSize);
89✔
310
                }
311
            }
312

313
            const auto nSrcByteOffset =
5,441,960✔
314
                (static_cast<GPtrDiff_t>(iSrcY - nLBlockY * nBlockYSize) *
5,441,960✔
315
                     nBlockXSize +
5,441,960✔
316
                 nXOff) *
5,441,960✔
317
                nBandDataSize;
5,441,960✔
318

319
            if (eDataType == eBufType)
5,441,960✔
320
            {
321
                if (eRWFlag == GF_Read)
1,789,600✔
322
                    memcpy(static_cast<GByte *>(pData) +
1,545,180✔
323
                               static_cast<GPtrDiff_t>(iBufYOff) * nLineSpace,
1,545,180✔
324
                           pabySrcBlock + nSrcByteOffset,
1,545,180✔
325
                           static_cast<size_t>(nLineSpace));
326
                else
327
                    memcpy(pabySrcBlock + nSrcByteOffset,
244,418✔
328
                           static_cast<GByte *>(pData) +
244,418✔
329
                               static_cast<GPtrDiff_t>(iBufYOff) * nLineSpace,
244,418✔
330
                           static_cast<size_t>(nLineSpace));
331
            }
332
            else
333
            {
334
                // Type to type conversion.
335
                if (eRWFlag == GF_Read)
3,652,360✔
336
                    GDALCopyWords64(
3,631,130✔
337
                        pabySrcBlock + nSrcByteOffset, eDataType, nBandDataSize,
3,631,130✔
338
                        static_cast<GByte *>(pData) +
339
                            static_cast<GPtrDiff_t>(iBufYOff) * nLineSpace,
3,631,130✔
340
                        eBufType, static_cast<int>(nPixelSpace), nBufXSize);
341
                else
342
                    GDALCopyWords64(static_cast<GByte *>(pData) +
21,228✔
343
                                        static_cast<GPtrDiff_t>(iBufYOff) *
21,228✔
344
                                            nLineSpace,
345
                                    eBufType, static_cast<int>(nPixelSpace),
346
                                    pabySrcBlock + nSrcByteOffset, eDataType,
21,228✔
347
                                    nBandDataSize, nBufXSize);
348
            }
349

350
            if (psExtraArg->pfnProgress != nullptr &&
5,502,820✔
351
                !psExtraArg->pfnProgress(1.0 * (iBufYOff + 1) / nBufYSize, "",
60,834✔
352
                                         psExtraArg->pProgressData))
353
            {
354
                eErr = CE_Failure;
5✔
355
                break;
5✔
356
            }
357
        }
358

359
        if (poBlock)
2,823,310✔
360
            poBlock->DropLock();
2,822,180✔
361

362
        return eErr;
2,823,250✔
363
    }
364

365
    /* ==================================================================== */
366
    /*      Do we have overviews that would be appropriate to satisfy       */
367
    /*      this request?                                                   */
368
    /* ==================================================================== */
369
    if ((nBufXSize < nXSize || nBufYSize < nYSize) && GetOverviewCount() > 0 &&
2,894,050✔
370
        eRWFlag == GF_Read)
371
    {
372
        GDALRasterIOExtraArg sExtraArg;
373
        GDALCopyRasterIOExtraArg(&sExtraArg, psExtraArg);
2,830✔
374

375
        const int nOverview =
376
            GDALBandGetBestOverviewLevel2(this, nXOff, nYOff, nXSize, nYSize,
2,830✔
377
                                          nBufXSize, nBufYSize, &sExtraArg);
378
        if (nOverview >= 0)
2,830✔
379
        {
380
            GDALRasterBand *poOverviewBand = GetOverview(nOverview);
2,810✔
381
            if (poOverviewBand == nullptr)
2,810✔
382
                return CE_Failure;
2,810✔
383

384
            return poOverviewBand->RasterIO(
2,810✔
385
                eRWFlag, nXOff, nYOff, nXSize, nYSize, pData, nBufXSize,
386
                nBufYSize, eBufType, nPixelSpace, nLineSpace, &sExtraArg);
2,810✔
387
        }
388
    }
389

390
    if (eRWFlag == GF_Read && nBufXSize < nXSize / 100 &&
702,524✔
391
        nBufYSize < nYSize / 100 && nPixelSpace == nBufDataSize &&
×
392
        nLineSpace == nPixelSpace * nBufXSize &&
3,593,770✔
393
        CPLTestBool(CPLGetConfigOption("GDAL_NO_COSTLY_OVERVIEW", "NO")))
×
394
    {
395
        memset(pData, 0, static_cast<size_t>(nLineSpace * nBufYSize));
×
396
        return CE_None;
×
397
    }
398

399
    /* ==================================================================== */
400
    /*      The second case when we don't need subsample data but likely    */
401
    /*      need data type conversion.                                      */
402
    /* ==================================================================== */
403
    if (  // nPixelSpace == nBufDataSize &&
2,891,240✔
404
        nXSize == nBufXSize && nYSize == nBufYSize && bUseIntegerRequestCoords)
2,891,240✔
405
    {
406
#if DEBUG_VERBOSE
407
        printf("IRasterIO(%d,%d,%d,%d) rw=%d case 2\n", /*ok*/
408
               nXOff, nYOff, nXSize, nYSize, static_cast<int>(eRWFlag));
409
#endif
410

411
        /* --------------------------------------------------------------------
412
         */
413
        /*      Loop over buffer computing source locations. */
414
        /* --------------------------------------------------------------------
415
         */
416
        // Calculate starting values out of loop
417
        const int nLBlockXStart = nXOff / nBlockXSize;
2,528,280✔
418
        const int nXSpanEnd = nBufXSize + nXOff;
2,528,280✔
419

420
        int nYInc = 0;
2,528,280✔
421
        for (int iBufYOff = 0, iSrcY = nYOff; iBufYOff < nBufYSize;
5,090,900✔
422
             iBufYOff += nYInc, iSrcY += nYInc)
2,562,630✔
423
        {
424
            GPtrDiff_t iBufOffset = static_cast<GPtrDiff_t>(iBufYOff) *
2,562,500✔
425
                                    static_cast<GPtrDiff_t>(nLineSpace);
426
            int nLBlockY = iSrcY / nBlockYSize;
2,562,500✔
427
            int nLBlockX = nLBlockXStart;
2,562,500✔
428
            int iSrcX = nXOff;
2,562,500✔
429
            while (iSrcX < nXSpanEnd)
5,335,780✔
430
            {
431
                int nXSpan = nLBlockX * nBlockXSize;
2,773,050✔
432
                if (nXSpan < INT_MAX - nBlockXSize)
2,773,050✔
433
                    nXSpan += nBlockXSize;
2,773,050✔
434
                else
435
                    nXSpan = INT_MAX;
×
436
                const int nXRight = nXSpan;
2,773,050✔
437
                nXSpan = (nXSpan < nXSpanEnd ? nXSpan : nXSpanEnd) - iSrcX;
2,773,050✔
438
                const size_t nXSpanSize =
2,773,050✔
439
                    nXSpan * static_cast<size_t>(nPixelSpace);
2,773,050✔
440

441
                bool bJustInitialize =
2,773,050✔
442
                    eRWFlag == GF_Write && nYOff <= nLBlockY * nBlockYSize &&
2,042,180✔
443
                    nYOff + nYSize - nBlockYSize >= nLBlockY * nBlockYSize &&
37,297✔
444
                    nXOff <= nLBlockX * nBlockXSize &&
4,840,840✔
445
                    nXOff + nXSize >= nXRight;
25,613✔
446

447
                // Is this a partial tile at right and/or bottom edges of
448
                // the raster, and that is going to be completely written?
449
                // If so, do not load it from storage, but zero it so that
450
                // the content outside of the validity area is initialized.
451
                bool bMemZeroBuffer = false;
2,773,050✔
452
                if (eRWFlag == GF_Write && !bJustInitialize &&
2,042,180✔
453
                    nXOff <= nLBlockX * nBlockXSize &&
2,017,830✔
454
                    nYOff <= nLBlockY * nBlockYSize &&
2,016,200✔
455
                    (nXOff + nXSize >= nXRight ||
12,181✔
456
                     // cppcheck-suppress knownConditionTrueFalse
457
                     (nXOff + nXSize == GetXSize() && nXRight > GetXSize())) &&
4,817,950✔
458
                    (nYOff + nYSize - nBlockYSize >= nLBlockY * nBlockYSize ||
12,001✔
459
                     (nYOff + nYSize == GetYSize() &&
10,762✔
460
                      nLBlockY * nBlockYSize > GetYSize() - nBlockYSize)))
1,975✔
461
                {
462
                    bJustInitialize = true;
3,214✔
463
                    bMemZeroBuffer = true;
3,214✔
464
                }
465

466
                /* --------------------------------------------------------------------
467
                 */
468
                /*      Ensure we have the appropriate block loaded. */
469
                /* --------------------------------------------------------------------
470
                 */
471
                const GUInt32 nErrorCounter = CPLGetErrorCounter();
2,773,050✔
472
                poBlock =
2,773,340✔
473
                    GetLockedBlockRef(nLBlockX, nLBlockY, bJustInitialize);
2,772,940✔
474
                if (!poBlock)
2,773,340✔
475
                {
476
                    if (strstr(CPLGetLastErrorMsg(), "IReadBlock failed") ==
74✔
477
                        nullptr)
478
                    {
479
                        CPLError(CE_Failure, CPLE_AppDefined,
×
480
                                 "GetBlockRef failed at X block offset %d, "
481
                                 "Y block offset %d%s",
482
                                 nLBlockX, nLBlockY,
483
                                 (nErrorCounter != CPLGetErrorCounter())
×
484
                                     ? CPLSPrintf(": %s", CPLGetLastErrorMsg())
×
485
                                     : "");
486
                    }
487
                    return (CE_Failure);
74✔
488
                }
489

490
                if (eRWFlag == GF_Write)
2,773,260✔
491
                    poBlock->MarkDirty();
2,042,180✔
492

493
                pabySrcBlock = static_cast<GByte *>(poBlock->GetDataRef());
2,773,260✔
494
                if (bMemZeroBuffer)
2,773,250✔
495
                {
496
                    memset(pabySrcBlock, 0,
3,214✔
497
                           static_cast<GPtrDiff_t>(nBandDataSize) *
3,214✔
498
                               nBlockXSize * nBlockYSize);
3,214✔
499
                }
500
                /* --------------------------------------------------------------------
501
                 */
502
                /*      Copy over this chunk of data. */
503
                /* --------------------------------------------------------------------
504
                 */
505
                GPtrDiff_t iSrcOffset =
2,773,250✔
506
                    (static_cast<GPtrDiff_t>(iSrcX) -
2,773,250✔
507
                     static_cast<GPtrDiff_t>(nLBlockX * nBlockXSize) +
2,773,250✔
508
                     (static_cast<GPtrDiff_t>(iSrcY) -
2,773,250✔
509
                      static_cast<GPtrDiff_t>(nLBlockY) * nBlockYSize) *
2,773,250✔
510
                         nBlockXSize) *
2,773,250✔
511
                    nBandDataSize;
2,773,250✔
512
                // Fill up as many rows as possible for the loaded block.
513
                const int kmax = std::min(nBlockYSize - (iSrcY % nBlockYSize),
5,546,480✔
514
                                          nBufYSize - iBufYOff);
2,773,250✔
515
                for (int k = 0; k < kmax; k++)
58,573,000✔
516
                {
517
                    if (eDataType == eBufType && nPixelSpace == nBufDataSize)
55,800,000✔
518
                    {
519
                        if (eRWFlag == GF_Read)
51,842,800✔
520
                            memcpy(static_cast<GByte *>(pData) + iBufOffset +
47,408,000✔
521
                                       static_cast<GPtrDiff_t>(k) * nLineSpace,
47,408,000✔
522
                                   pabySrcBlock + iSrcOffset, nXSpanSize);
47,408,000✔
523
                        else
524
                            memcpy(pabySrcBlock + iSrcOffset,
4,434,820✔
525
                                   static_cast<GByte *>(pData) + iBufOffset +
4,434,820✔
526
                                       static_cast<GPtrDiff_t>(k) * nLineSpace,
4,434,820✔
527
                                   nXSpanSize);
528
                    }
529
                    else
530
                    {
531
                        /* type to type conversion */
532
                        if (eRWFlag == GF_Read)
3,957,170✔
533
                            GDALCopyWords64(
3,886,900✔
534
                                pabySrcBlock + iSrcOffset, eDataType,
3,886,900✔
535
                                nBandDataSize,
536
                                static_cast<GByte *>(pData) + iBufOffset +
3,886,900✔
537
                                    static_cast<GPtrDiff_t>(k) * nLineSpace,
3,886,900✔
538
                                eBufType, static_cast<int>(nPixelSpace),
539
                                nXSpan);
540
                        else
541
                            GDALCopyWords64(
70,269✔
542
                                static_cast<GByte *>(pData) + iBufOffset +
70,269✔
543
                                    static_cast<GPtrDiff_t>(k) * nLineSpace,
70,269✔
544
                                eBufType, static_cast<int>(nPixelSpace),
545
                                pabySrcBlock + iSrcOffset, eDataType,
70,269✔
546
                                nBandDataSize, nXSpan);
547
                    }
548

549
                    iSrcOffset +=
55,799,800✔
550
                        static_cast<GPtrDiff_t>(nBlockXSize) * nBandDataSize;
55,799,800✔
551
                }
552

553
                iBufOffset =
554
                    CPLUnsanitizedAdd<GPtrDiff_t>(iBufOffset, nXSpanSize);
2,773,060✔
555
                nLBlockX++;
2,773,100✔
556
                iSrcX += nXSpan;
2,773,100✔
557

558
                poBlock->DropLock();
2,773,100✔
559
                poBlock = nullptr;
2,773,280✔
560
            }
561

562
            /* Compute the increment to go on a block boundary */
563
            nYInc = nBlockYSize - (iSrcY % nBlockYSize);
2,562,740✔
564

565
            if (psExtraArg->pfnProgress != nullptr &&
2,564,530✔
566
                !psExtraArg->pfnProgress(
1,791✔
567
                    1.0 * std::min(nBufYSize, iBufYOff + nYInc) / nBufYSize, "",
2,564,530✔
568
                    psExtraArg->pProgressData))
569
            {
570
                return CE_Failure;
117✔
571
            }
572
        }
573

574
        return CE_None;
2,528,400✔
575
    }
576

577
    /* ==================================================================== */
578
    /*      Loop reading required source blocks to satisfy output           */
579
    /*      request.  This is the most general implementation.              */
580
    /* ==================================================================== */
581

582
    double dfXOff = nXOff;
362,967✔
583
    double dfYOff = nYOff;
362,967✔
584
    double dfXSize = nXSize;
362,967✔
585
    double dfYSize = nYSize;
362,967✔
586
    if (psExtraArg->bFloatingPointWindowValidity)
362,967✔
587
    {
588
        dfXOff = psExtraArg->dfXOff;
28,187✔
589
        dfYOff = psExtraArg->dfYOff;
28,187✔
590
        dfXSize = psExtraArg->dfXSize;
28,187✔
591
        dfYSize = psExtraArg->dfYSize;
28,187✔
592
    }
593

594
    /* -------------------------------------------------------------------- */
595
    /*      Compute stepping increment.                                     */
596
    /* -------------------------------------------------------------------- */
597
    const double dfSrcXInc = dfXSize / static_cast<double>(nBufXSize);
362,967✔
598
    const double dfSrcYInc = dfYSize / static_cast<double>(nBufYSize);
362,967✔
599
    CPLErr eErr = CE_None;
362,967✔
600

601
    if (eRWFlag == GF_Write)
362,967✔
602
    {
603
        /* --------------------------------------------------------------------
604
         */
605
        /*    Write case */
606
        /*    Loop over raster window computing source locations in the buffer.
607
         */
608
        /* --------------------------------------------------------------------
609
         */
610
        GByte *pabyDstBlock = nullptr;
166,650✔
611
        int nLBlockX = -1;
166,650✔
612
        int nLBlockY = -1;
166,650✔
613

614
        for (int iDstY = nYOff; iDstY < nYOff + nYSize; iDstY++)
1,259,590✔
615
        {
616
            const int iBufYOff = static_cast<int>((iDstY - nYOff) / dfSrcYInc);
1,092,940✔
617

618
            for (int iDstX = nXOff; iDstX < nXOff + nXSize; iDstX++)
12,063,600✔
619
            {
620
                const int iBufXOff =
10,970,600✔
621
                    static_cast<int>((iDstX - nXOff) / dfSrcXInc);
10,970,600✔
622
                GPtrDiff_t iBufOffset =
10,970,600✔
623
                    static_cast<GPtrDiff_t>(iBufYOff) *
10,970,600✔
624
                        static_cast<GPtrDiff_t>(nLineSpace) +
625
                    iBufXOff * static_cast<GPtrDiff_t>(nPixelSpace);
10,970,600✔
626

627
                // FIXME: this code likely doesn't work if the dirty block gets
628
                // flushed to disk before being completely written.
629
                // In the meantime, bJustInitialize should probably be set to
630
                // FALSE even if it is not ideal performance wise, and for
631
                // lossy compression.
632

633
                /* --------------------------------------------------------------------
634
                 */
635
                /*      Ensure we have the appropriate block loaded. */
636
                /* --------------------------------------------------------------------
637
                 */
638
                if (iDstX < nLBlockX * nBlockXSize ||
10,970,600✔
639
                    iDstX - nBlockXSize >= nLBlockX * nBlockXSize ||
10,721,300✔
640
                    iDstY < nLBlockY * nBlockYSize ||
10,264,600✔
641
                    iDstY - nBlockYSize >= nLBlockY * nBlockYSize)
10,264,600✔
642
                {
643
                    nLBlockX = iDstX / nBlockXSize;
738,642✔
644
                    nLBlockY = iDstY / nBlockYSize;
738,642✔
645

646
                    const bool bJustInitialize =
738,642✔
647
                        nYOff <= nLBlockY * nBlockYSize &&
1,065,870✔
648
                        nYOff + nYSize - nBlockYSize >=
327,231✔
649
                            nLBlockY * nBlockYSize &&
327,231✔
650
                        nXOff <= nLBlockX * nBlockXSize &&
1,116,140✔
651
                        nXOff + nXSize - nBlockXSize >= nLBlockX * nBlockXSize;
50,265✔
652
                    /*bool bMemZeroBuffer = FALSE;
653
                    if( !bJustInitialize &&
654
                        nXOff <= nLBlockX * nBlockXSize &&
655
                        nYOff <= nLBlockY * nBlockYSize &&
656
                        (nXOff + nXSize >= (nLBlockX+1) * nBlockXSize ||
657
                         (nXOff + nXSize == GetXSize() &&
658
                         (nLBlockX+1) * nBlockXSize > GetXSize())) &&
659
                        (nYOff + nYSize >= (nLBlockY+1) * nBlockYSize ||
660
                         (nYOff + nYSize == GetYSize() &&
661
                         (nLBlockY+1) * nBlockYSize > GetYSize())) )
662
                    {
663
                        bJustInitialize = TRUE;
664
                        bMemZeroBuffer = TRUE;
665
                    }*/
666
                    if (poBlock != nullptr)
738,642✔
667
                        poBlock->DropLock();
571,992✔
668

669
                    poBlock =
738,642✔
670
                        GetLockedBlockRef(nLBlockX, nLBlockY, bJustInitialize);
738,642✔
671
                    if (poBlock == nullptr)
738,642✔
672
                    {
673
                        return (CE_Failure);
×
674
                    }
675

676
                    poBlock->MarkDirty();
738,642✔
677

678
                    pabyDstBlock = static_cast<GByte *>(poBlock->GetDataRef());
738,642✔
679
                    /*if( bMemZeroBuffer )
680
                    {
681
                        memset(pabyDstBlock, 0,
682
                            static_cast<GPtrDiff_t>(nBandDataSize) * nBlockXSize
683
                    * nBlockYSize);
684
                    }*/
685
                }
686

687
                // To make Coverity happy. Should not happen by design.
688
                if (pabyDstBlock == nullptr)
10,970,600✔
689
                {
690
                    CPLAssert(false);
×
691
                    eErr = CE_Failure;
692
                    break;
693
                }
694

695
                /* --------------------------------------------------------------------
696
                 */
697
                /*      Copy over this pixel of data. */
698
                /* --------------------------------------------------------------------
699
                 */
700
                GPtrDiff_t iDstOffset =
10,970,600✔
701
                    (static_cast<GPtrDiff_t>(iDstX) -
10,970,600✔
702
                     static_cast<GPtrDiff_t>(nLBlockX) * nBlockXSize +
10,970,600✔
703
                     (static_cast<GPtrDiff_t>(iDstY) -
10,970,600✔
704
                      static_cast<GPtrDiff_t>(nLBlockY) * nBlockYSize) *
10,970,600✔
705
                         nBlockXSize) *
10,970,600✔
706
                    nBandDataSize;
10,970,600✔
707

708
                if (eDataType == eBufType)
10,970,600✔
709
                {
710
                    memcpy(pabyDstBlock + iDstOffset,
10,967,500✔
711
                           static_cast<GByte *>(pData) + iBufOffset,
10,967,500✔
712
                           nBandDataSize);
713
                }
714
                else
715
                {
716
                    /* type to type conversion ... ouch, this is expensive way
717
                    of handling single words */
718
                    GDALCopyWords64(static_cast<GByte *>(pData) + iBufOffset,
3,096✔
719
                                    eBufType, 0, pabyDstBlock + iDstOffset,
3,096✔
720
                                    eDataType, 0, 1);
721
                }
722
            }
723

724
            if (psExtraArg->pfnProgress != nullptr &&
1,092,940✔
725
                !psExtraArg->pfnProgress(1.0 * (iDstY - nYOff + 1) / nYSize, "",
×
726
                                         psExtraArg->pProgressData))
727
            {
728
                eErr = CE_Failure;
×
729
                break;
×
730
            }
731
        }
732
    }
733
    else
734
    {
735
        if (psExtraArg->eResampleAlg != GRIORA_NearestNeighbour)
196,317✔
736
        {
737
            if ((psExtraArg->eResampleAlg == GRIORA_Cubic ||
7,640✔
738
                 psExtraArg->eResampleAlg == GRIORA_CubicSpline ||
2,496✔
739
                 psExtraArg->eResampleAlg == GRIORA_Bilinear ||
2,494✔
740
                 psExtraArg->eResampleAlg == GRIORA_Lanczos) &&
5,149✔
741
                GetColorTable() != nullptr)
2,466✔
742
            {
743
                CPLError(CE_Warning, CPLE_NotSupported,
×
744
                         "Resampling method not supported on paletted band. "
745
                         "Falling back to nearest neighbour");
746
            }
747
            else if (psExtraArg->eResampleAlg == GRIORA_Gauss &&
2,575✔
748
                     GDALDataTypeIsComplex(eDataType))
3✔
749
            {
750
                CPLError(CE_Warning, CPLE_NotSupported,
×
751
                         "Resampling method not supported on complex data type "
752
                         "band. Falling back to nearest neighbour");
753
            }
754
            else
755
            {
756
                return RasterIOResampled(eRWFlag, nXOff, nYOff, nXSize, nYSize,
2,572✔
757
                                         pData, nBufXSize, nBufYSize, eBufType,
758
                                         nPixelSpace, nLineSpace, psExtraArg);
2,572✔
759
            }
760
        }
761

762
        int nLimitBlockY = 0;
193,650✔
763
        const bool bByteCopy = eDataType == eBufType && nBandDataSize == 1;
193,650✔
764
        int nStartBlockX = -nBlockXSize;
193,650✔
765
        const double EPS = 1e-10;
193,650✔
766
        int nLBlockY = -1;
193,650✔
767
        const double dfSrcXStart = 0.5 * dfSrcXInc + dfXOff + EPS;
193,650✔
768
        const bool bIntegerXFactor =
193,650✔
769
            bUseIntegerRequestCoords &&
170,979✔
770
            static_cast<int>(dfSrcXInc) == dfSrcXInc &&
265,636✔
771
            static_cast<int>(dfSrcXInc) < INT_MAX / nBandDataSize;
71,986✔
772

773
        /* --------------------------------------------------------------------
774
         */
775
        /*      Read case */
776
        /*      Loop over buffer computing source locations. */
777
        /* --------------------------------------------------------------------
778
         */
779
        for (int iBufYOff = 0; iBufYOff < nBufYSize; iBufYOff++)
1,957,320✔
780
        {
781
            // Add small epsilon to avoid some numeric precision issues.
782
            const double dfSrcY = (iBufYOff + 0.5) * dfSrcYInc + dfYOff + EPS;
1,763,680✔
783
            const int iSrcY = static_cast<int>(std::min(
1,763,680✔
784
                std::max(0.0, dfSrcY), static_cast<double>(nRasterYSize - 1)));
1,763,680✔
785

786
            GPtrDiff_t iBufOffset = static_cast<GPtrDiff_t>(iBufYOff) *
1,763,680✔
787
                                    static_cast<GPtrDiff_t>(nLineSpace);
788

789
            if (iSrcY >= nLimitBlockY)
1,763,680✔
790
            {
791
                nLBlockY = iSrcY / nBlockYSize;
235,011✔
792
                nLimitBlockY = nLBlockY * nBlockYSize;
235,011✔
793
                if (nLimitBlockY < INT_MAX - nBlockYSize)
235,011✔
794
                    nLimitBlockY += nBlockYSize;
235,011✔
795
                else
796
                    nLimitBlockY = INT_MAX;
×
797
                // Make sure a new block is loaded.
798
                nStartBlockX = -nBlockXSize;
235,011✔
799
            }
800
            else if (static_cast<int>(dfSrcXStart) < nStartBlockX)
1,528,670✔
801
            {
802
                // Make sure a new block is loaded.
803
                nStartBlockX = -nBlockXSize;
429,795✔
804
            }
805

806
            GPtrDiff_t iSrcOffsetCst = (iSrcY - nLBlockY * nBlockYSize) *
1,763,680✔
807
                                       static_cast<GPtrDiff_t>(nBlockXSize);
1,763,680✔
808

809
            if (bIntegerXFactor)
1,763,680✔
810
            {
811
                int iSrcX = static_cast<int>(dfSrcXStart);
413,231✔
812
                const int nSrcXInc = static_cast<int>(dfSrcXInc);
413,231✔
813
                GByte *pabyDstData = static_cast<GByte *>(pData) + iBufOffset;
413,231✔
814
                bool bRet = false;
413,231✔
815
                if (bByteCopy)
413,231✔
816
                {
817
                    bRet = DownsamplingIntegerXFactor<true, 1>(
302,844✔
818
                        this, iSrcX, nSrcXInc, iSrcOffsetCst, pabyDstData,
819
                        static_cast<int>(nPixelSpace), nBufXSize, GDT_Byte,
820
                        GDT_Byte, nStartBlockX, nBlockXSize, poBlock, nLBlockY);
821
                }
822
                else if (eDataType == eBufType)
110,387✔
823
                {
824
                    switch (nBandDataSize)
110,182✔
825
                    {
826
                        case 2:
110,102✔
827
                            bRet = DownsamplingIntegerXFactor<true, 2>(
110,102✔
828
                                this, iSrcX, nSrcXInc, iSrcOffsetCst,
829
                                pabyDstData, static_cast<int>(nPixelSpace),
830
                                nBufXSize, eDataType, eDataType, nStartBlockX,
831
                                nBlockXSize, poBlock, nLBlockY);
832
                            break;
110,102✔
833
                        case 4:
22✔
834
                            bRet = DownsamplingIntegerXFactor<true, 4>(
22✔
835
                                this, iSrcX, nSrcXInc, iSrcOffsetCst,
836
                                pabyDstData, static_cast<int>(nPixelSpace),
837
                                nBufXSize, eDataType, eDataType, nStartBlockX,
838
                                nBlockXSize, poBlock, nLBlockY);
839
                            break;
22✔
840
                        case 8:
56✔
841
                            bRet = DownsamplingIntegerXFactor<true, 8>(
56✔
842
                                this, iSrcX, nSrcXInc, iSrcOffsetCst,
843
                                pabyDstData, static_cast<int>(nPixelSpace),
844
                                nBufXSize, eDataType, eDataType, nStartBlockX,
845
                                nBlockXSize, poBlock, nLBlockY);
846
                            break;
56✔
847
                        case 16:
2✔
848
                            bRet = DownsamplingIntegerXFactor<true, 16>(
2✔
849
                                this, iSrcX, nSrcXInc, iSrcOffsetCst,
850
                                pabyDstData, static_cast<int>(nPixelSpace),
851
                                nBufXSize, eDataType, eDataType, nStartBlockX,
852
                                nBlockXSize, poBlock, nLBlockY);
853
                            break;
2✔
854
                        default:
×
855
                            CPLAssert(false);
×
856
                            break;
857
                    }
858
                }
859
                else
860
                {
861
                    bRet = DownsamplingIntegerXFactor<false, 0>(
205✔
862
                        this, iSrcX, nSrcXInc, iSrcOffsetCst, pabyDstData,
863
                        static_cast<int>(nPixelSpace), nBufXSize, eDataType,
864
                        eBufType, nStartBlockX, nBlockXSize, poBlock, nLBlockY);
865
                }
866
                if (!bRet)
413,231✔
867
                    eErr = CE_Failure;
1✔
868
            }
869
            else
870
            {
871
                double dfSrcX = dfSrcXStart;
1,350,450✔
872
                for (int iBufXOff = 0; iBufXOff < nBufXSize;
572,584,000✔
873
                     iBufXOff++, dfSrcX += dfSrcXInc)
571,233,000✔
874
                {
875
                    // TODO?: try to avoid the clamping for most iterations
876
                    const int iSrcX = static_cast<int>(
877
                        std::min(std::max(0.0, dfSrcX),
1,142,470,000✔
878
                                 static_cast<double>(nRasterXSize - 1)));
571,233,000✔
879

880
                    /* --------------------------------------------------------------------
881
                     */
882
                    /*      Ensure we have the appropriate block loaded. */
883
                    /* --------------------------------------------------------------------
884
                     */
885
                    if (iSrcX >= nBlockXSize + nStartBlockX)
571,233,000✔
886
                    {
887
                        const int nLBlockX = iSrcX / nBlockXSize;
1,705,570✔
888
                        nStartBlockX = nLBlockX * nBlockXSize;
1,705,570✔
889

890
                        if (poBlock != nullptr)
1,705,570✔
891
                            poBlock->DropLock();
1,583,910✔
892

893
                        poBlock = GetLockedBlockRef(nLBlockX, nLBlockY, FALSE);
1,705,570✔
894
                        if (poBlock == nullptr)
1,705,570✔
895
                        {
896
                            eErr = CE_Failure;
9✔
897
                            break;
9✔
898
                        }
899

900
                        pabySrcBlock =
901
                            static_cast<GByte *>(poBlock->GetDataRef());
1,705,560✔
902
                    }
903
                    const GPtrDiff_t nDiffX =
571,233,000✔
904
                        static_cast<GPtrDiff_t>(iSrcX - nStartBlockX);
571,233,000✔
905

906
                    /* --------------------------------------------------------------------
907
                     */
908
                    /*      Copy over this pixel of data. */
909
                    /* --------------------------------------------------------------------
910
                     */
911

912
                    if (bByteCopy)
571,233,000✔
913
                    {
914
                        GPtrDiff_t iSrcOffset = nDiffX + iSrcOffsetCst;
517,794,000✔
915
                        static_cast<GByte *>(pData)[iBufOffset] =
517,794,000✔
916
                            pabySrcBlock[iSrcOffset];
517,794,000✔
917
                    }
918
                    else if (eDataType == eBufType)
53,439,200✔
919
                    {
920
                        GPtrDiff_t iSrcOffset =
48,225,500✔
921
                            (nDiffX + iSrcOffsetCst) * nBandDataSize;
48,225,500✔
922
                        memcpy(static_cast<GByte *>(pData) + iBufOffset,
48,225,500✔
923
                               pabySrcBlock + iSrcOffset, nBandDataSize);
48,225,500✔
924
                    }
925
                    else
926
                    {
927
                        // Type to type conversion ...
928
                        GPtrDiff_t iSrcOffset =
5,213,680✔
929
                            (nDiffX + iSrcOffsetCst) * nBandDataSize;
5,213,680✔
930
                        GDALCopyWords64(pabySrcBlock + iSrcOffset, eDataType, 0,
5,213,680✔
931
                                        static_cast<GByte *>(pData) +
932
                                            iBufOffset,
5,213,680✔
933
                                        eBufType, 0, 1);
934
                    }
935

936
                    iBufOffset += static_cast<int>(nPixelSpace);
571,233,000✔
937
                }
938
            }
939
            if (eErr == CE_Failure)
1,763,680✔
940
                break;
11✔
941

942
            if (psExtraArg->pfnProgress != nullptr &&
1,983,390✔
943
                !psExtraArg->pfnProgress(1.0 * (iBufYOff + 1) / nBufYSize, "",
219,720✔
944
                                         psExtraArg->pProgressData))
945
            {
946
                eErr = CE_Failure;
1✔
947
                break;
1✔
948
            }
949
        }
950
    }
951

952
    if (poBlock != nullptr)
360,300✔
953
        poBlock->DropLock();
360,290✔
954

955
    return eErr;
360,300✔
956
}
957

958
/************************************************************************/
959
/*                         GDALRasterIOTransformer()                    */
960
/************************************************************************/
961

962
// Parameters of the axis-aligned scale + shift mapping applied by
// GDALRasterIOTransformer() between destination (buffer) pixel coordinates
// and source (raster) pixel coordinates:
//     src = dst * ratio + offset
// The defaults describe the identity mapping, so a default-constructed
// instance never holds indeterminate values and never causes a division by
// zero in the source-to-destination direction.
struct GDALRasterIOTransformerStruct
{
    // Shift added to X after scaling, when going from destination to source.
    double dfXOff = 0.0;
    // Shift added to Y after scaling, when going from destination to source.
    double dfYOff = 0.0;
    // Multiplier applied to destination X to obtain source X.
    double dfXRatioDstToSrc = 1.0;
    // Multiplier applied to destination Y to obtain source Y.
    double dfYRatioDstToSrc = 1.0;
};
969

970
static int GDALRasterIOTransformer(void *pTransformerArg, int bDstToSrc,
6,748✔
971
                                   int nPointCount, double *x, double *y,
972
                                   double * /* z */, int *panSuccess)
973
{
974
    GDALRasterIOTransformerStruct *psParams =
6,748✔
975
        static_cast<GDALRasterIOTransformerStruct *>(pTransformerArg);
976
    if (bDstToSrc)
6,748✔
977
    {
978
        for (int i = 0; i < nPointCount; i++)
252,996✔
979
        {
980
            x[i] = x[i] * psParams->dfXRatioDstToSrc + psParams->dfXOff;
246,836✔
981
            y[i] = y[i] * psParams->dfYRatioDstToSrc + psParams->dfYOff;
246,836✔
982
            panSuccess[i] = TRUE;
246,836✔
983
        }
984
    }
985
    else
986
    {
987
        for (int i = 0; i < nPointCount; i++)
1,176✔
988
        {
989
            x[i] = (x[i] - psParams->dfXOff) / psParams->dfXRatioDstToSrc;
588✔
990
            y[i] = (y[i] - psParams->dfYOff) / psParams->dfYRatioDstToSrc;
588✔
991
            panSuccess[i] = TRUE;
588✔
992
        }
993
    }
994
    return TRUE;
6,748✔
995
}
996

997
/************************************************************************/
998
/*                          RasterIOResampled()                         */
999
/************************************************************************/
1000

1001
//! @cond Doxygen_Suppress
1002
CPLErr GDALRasterBand::RasterIOResampled(
2,572✔
1003
    GDALRWFlag /* eRWFlag */, int nXOff, int nYOff, int nXSize, int nYSize,
1004
    void *pData, int nBufXSize, int nBufYSize, GDALDataType eBufType,
1005
    GSpacing nPixelSpace, GSpacing nLineSpace, GDALRasterIOExtraArg *psExtraArg)
1006
{
1007
    // Determine if we use warping resampling or overview resampling
1008
    const bool bUseWarp =
1009
        (GDALDataTypeIsComplex(eDataType) &&
2,572✔
1010
         psExtraArg->eResampleAlg != GRIORA_NearestNeighbour &&
2,729✔
1011
         psExtraArg->eResampleAlg != GRIORA_Mode);
157✔
1012

1013
    double dfXOff = nXOff;
2,572✔
1014
    double dfYOff = nYOff;
2,572✔
1015
    double dfXSize = nXSize;
2,572✔
1016
    double dfYSize = nYSize;
2,572✔
1017
    if (psExtraArg->bFloatingPointWindowValidity)
2,572✔
1018
    {
1019
        dfXOff = psExtraArg->dfXOff;
2,115✔
1020
        dfYOff = psExtraArg->dfYOff;
2,115✔
1021
        dfXSize = psExtraArg->dfXSize;
2,115✔
1022
        dfYSize = psExtraArg->dfYSize;
2,115✔
1023
    }
1024

1025
    const double dfXRatioDstToSrc = dfXSize / nBufXSize;
2,572✔
1026
    const double dfYRatioDstToSrc = dfYSize / nBufYSize;
2,572✔
1027

1028
    // Determine the coordinates in the "virtual" output raster to see
1029
    // if there are not integers, in which case we will use them as a shift
1030
    // so that subwindow extracts give the exact same results as entire raster
1031
    // scaling.
1032
    double dfDestXOff = dfXOff / dfXRatioDstToSrc;
2,572✔
1033
    bool bHasXOffVirtual = false;
2,572✔
1034
    int nDestXOffVirtual = 0;
2,572✔
1035
    if (fabs(dfDestXOff - static_cast<int>(dfDestXOff + 0.5)) < 1e-8)
2,572✔
1036
    {
1037
        bHasXOffVirtual = true;
2,246✔
1038
        dfXOff = nXOff;
2,246✔
1039
        nDestXOffVirtual = static_cast<int>(dfDestXOff + 0.5);
2,246✔
1040
    }
1041

1042
    double dfDestYOff = dfYOff / dfYRatioDstToSrc;
2,572✔
1043
    bool bHasYOffVirtual = false;
2,572✔
1044
    int nDestYOffVirtual = 0;
2,572✔
1045
    if (fabs(dfDestYOff - static_cast<int>(dfDestYOff + 0.5)) < 1e-8)
2,572✔
1046
    {
1047
        bHasYOffVirtual = true;
2,240✔
1048
        dfYOff = nYOff;
2,240✔
1049
        nDestYOffVirtual = static_cast<int>(dfDestYOff + 0.5);
2,240✔
1050
    }
1051

1052
    // Create a MEM dataset that wraps the output buffer.
1053
    GDALDataset *poMEMDS;
1054
    void *pTempBuffer = nullptr;
2,572✔
1055
    GSpacing nPSMem = nPixelSpace;
2,572✔
1056
    GSpacing nLSMem = nLineSpace;
2,572✔
1057
    void *pDataMem = pData;
2,572✔
1058
    GDALDataType eDTMem = eBufType;
2,572✔
1059
    if (eBufType != eDataType)
2,572✔
1060
    {
1061
        nPSMem = GDALGetDataTypeSizeBytes(eDataType);
40✔
1062
        nLSMem = nPSMem * nBufXSize;
40✔
1063
        pTempBuffer =
1064
            VSI_MALLOC2_VERBOSE(nBufYSize, static_cast<size_t>(nLSMem));
40✔
1065
        if (pTempBuffer == nullptr)
40✔
1066
            return CE_Failure;
×
1067
        pDataMem = pTempBuffer;
40✔
1068
        eDTMem = eDataType;
40✔
1069
    }
1070

1071
    poMEMDS =
1072
        MEMDataset::Create("", nDestXOffVirtual + nBufXSize,
2,572✔
1073
                           nDestYOffVirtual + nBufYSize, 0, eDTMem, nullptr);
1074
    GByte *pabyData = static_cast<GByte *>(pDataMem) -
2,572✔
1075
                      nPSMem * nDestXOffVirtual - nLSMem * nDestYOffVirtual;
2,572✔
1076
    GDALRasterBandH hMEMBand = MEMCreateRasterBandEx(
2,572✔
1077
        poMEMDS, 1, pabyData, eDTMem, nPSMem, nLSMem, false);
1078
    poMEMDS->SetBand(1, GDALRasterBand::FromHandle(hMEMBand));
2,572✔
1079

1080
    const char *pszNBITS = GetMetadataItem("NBITS", "IMAGE_STRUCTURE");
2,572✔
1081
    const int nNBITS = pszNBITS ? atoi(pszNBITS) : 0;
2,572✔
1082
    if (pszNBITS)
2,572✔
1083
        GDALRasterBand::FromHandle(hMEMBand)->SetMetadataItem(
6✔
1084
            "NBITS", pszNBITS, "IMAGE_STRUCTURE");
6✔
1085

1086
    CPLErr eErr = CE_None;
2,572✔
1087

1088
    // Do the resampling.
1089
    if (bUseWarp)
2,572✔
1090
    {
1091
        int bHasNoData = FALSE;
149✔
1092
        double dfNoDataValue = GetNoDataValue(&bHasNoData);
149✔
1093

1094
        VRTDatasetH hVRTDS = nullptr;
149✔
1095
        GDALRasterBandH hVRTBand = nullptr;
149✔
1096
        if (GetDataset() == nullptr)
149✔
1097
        {
1098
            /* Create VRT dataset that wraps the whole dataset */
1099
            hVRTDS = VRTCreate(nRasterXSize, nRasterYSize);
×
1100
            VRTAddBand(hVRTDS, eDataType, nullptr);
×
1101
            hVRTBand = GDALGetRasterBand(hVRTDS, 1);
×
1102
            VRTAddSimpleSource(hVRTBand, this, 0, 0, nRasterXSize, nRasterYSize,
×
1103
                               0, 0, nRasterXSize, nRasterYSize, nullptr,
1104
                               VRT_NODATA_UNSET);
1105

1106
            /* Add a mask band if needed */
1107
            if (GetMaskFlags() != GMF_ALL_VALID)
×
1108
            {
1109
                GDALDataset::FromHandle(hVRTDS)->CreateMaskBand(0);
×
1110
                VRTSourcedRasterBand *poVRTMaskBand =
1111
                    reinterpret_cast<VRTSourcedRasterBand *>(
1112
                        reinterpret_cast<GDALRasterBand *>(hVRTBand)
1113
                            ->GetMaskBand());
×
1114
                poVRTMaskBand->AddMaskBandSource(this, 0, 0, nRasterXSize,
×
1115
                                                 nRasterYSize, 0, 0,
×
1116
                                                 nRasterXSize, nRasterYSize);
×
1117
            }
1118
        }
1119

1120
        GDALWarpOptions *psWarpOptions = GDALCreateWarpOptions();
149✔
1121
        switch (psExtraArg->eResampleAlg)
149✔
1122
        {
1123
            case GRIORA_NearestNeighbour:
×
1124
                psWarpOptions->eResampleAlg = GRA_NearestNeighbour;
×
1125
                break;
×
1126
            case GRIORA_Bilinear:
147✔
1127
                psWarpOptions->eResampleAlg = GRA_Bilinear;
147✔
1128
                break;
147✔
1129
            case GRIORA_Cubic:
×
1130
                psWarpOptions->eResampleAlg = GRA_Cubic;
×
1131
                break;
×
1132
            case GRIORA_CubicSpline:
×
1133
                psWarpOptions->eResampleAlg = GRA_CubicSpline;
×
1134
                break;
×
1135
            case GRIORA_Lanczos:
×
1136
                psWarpOptions->eResampleAlg = GRA_Lanczos;
×
1137
                break;
×
1138
            case GRIORA_Average:
×
1139
                psWarpOptions->eResampleAlg = GRA_Average;
×
1140
                break;
×
1141
            case GRIORA_RMS:
2✔
1142
                psWarpOptions->eResampleAlg = GRA_RMS;
2✔
1143
                break;
2✔
1144
            case GRIORA_Mode:
×
1145
                psWarpOptions->eResampleAlg = GRA_Mode;
×
1146
                break;
×
1147
            default:
×
1148
                CPLAssert(false);
×
1149
                psWarpOptions->eResampleAlg = GRA_NearestNeighbour;
1150
                break;
1151
        }
1152
        psWarpOptions->hSrcDS = hVRTDS ? hVRTDS : GetDataset();
149✔
1153
        psWarpOptions->hDstDS = poMEMDS;
149✔
1154
        psWarpOptions->nBandCount = 1;
149✔
1155
        int nSrcBandNumber = hVRTDS ? 1 : nBand;
149✔
1156
        int nDstBandNumber = 1;
149✔
1157
        psWarpOptions->panSrcBands = &nSrcBandNumber;
149✔
1158
        psWarpOptions->panDstBands = &nDstBandNumber;
149✔
1159
        psWarpOptions->pfnProgress = psExtraArg->pfnProgress
298✔
1160
                                         ? psExtraArg->pfnProgress
149✔
1161
                                         : GDALDummyProgress;
1162
        psWarpOptions->pProgressArg = psExtraArg->pProgressData;
149✔
1163
        psWarpOptions->pfnTransformer = GDALRasterIOTransformer;
149✔
1164
        if (bHasNoData)
149✔
1165
        {
1166
            psWarpOptions->papszWarpOptions = CSLSetNameValue(
×
1167
                psWarpOptions->papszWarpOptions, "INIT_DEST", "NO_DATA");
1168
            if (psWarpOptions->padfSrcNoDataReal == nullptr)
×
1169
            {
1170
                psWarpOptions->padfSrcNoDataReal =
×
1171
                    static_cast<double *>(CPLMalloc(sizeof(double)));
×
1172
                psWarpOptions->padfSrcNoDataReal[0] = dfNoDataValue;
×
1173
            }
1174

1175
            if (psWarpOptions->padfDstNoDataReal == nullptr)
×
1176
            {
1177
                psWarpOptions->padfDstNoDataReal =
×
1178
                    static_cast<double *>(CPLMalloc(sizeof(double)));
×
1179
                psWarpOptions->padfDstNoDataReal[0] = dfNoDataValue;
×
1180
            }
1181
        }
1182

1183
        GDALRasterIOTransformerStruct sTransformer;
1184
        sTransformer.dfXOff = bHasXOffVirtual ? 0 : dfXOff;
149✔
1185
        sTransformer.dfYOff = bHasYOffVirtual ? 0 : dfYOff;
149✔
1186
        sTransformer.dfXRatioDstToSrc = dfXRatioDstToSrc;
149✔
1187
        sTransformer.dfYRatioDstToSrc = dfYRatioDstToSrc;
149✔
1188
        psWarpOptions->pTransformerArg = &sTransformer;
149✔
1189

1190
        GDALWarpOperationH hWarpOperation =
1191
            GDALCreateWarpOperation(psWarpOptions);
149✔
1192
        eErr = GDALChunkAndWarpImage(hWarpOperation, nDestXOffVirtual,
149✔
1193
                                     nDestYOffVirtual, nBufXSize, nBufYSize);
1194
        GDALDestroyWarpOperation(hWarpOperation);
149✔
1195

1196
        psWarpOptions->panSrcBands = nullptr;
149✔
1197
        psWarpOptions->panDstBands = nullptr;
149✔
1198
        GDALDestroyWarpOptions(psWarpOptions);
149✔
1199

1200
        if (hVRTDS)
149✔
1201
            GDALClose(hVRTDS);
×
1202
    }
1203
    else
1204
    {
1205
        const char *pszResampling =
2,423✔
1206
            (psExtraArg->eResampleAlg == GRIORA_Bilinear)      ? "BILINEAR"
2,610✔
1207
            : (psExtraArg->eResampleAlg == GRIORA_Cubic)       ? "CUBIC"
298✔
1208
            : (psExtraArg->eResampleAlg == GRIORA_CubicSpline) ? "CUBICSPLINE"
220✔
1209
            : (psExtraArg->eResampleAlg == GRIORA_Lanczos)     ? "LANCZOS"
213✔
1210
            : (psExtraArg->eResampleAlg == GRIORA_Average)     ? "AVERAGE"
159✔
1211
            : (psExtraArg->eResampleAlg == GRIORA_RMS)         ? "RMS"
95✔
1212
            : (psExtraArg->eResampleAlg == GRIORA_Mode)        ? "MODE"
43✔
1213
            : (psExtraArg->eResampleAlg == GRIORA_Gauss)       ? "GAUSS"
3✔
1214
                                                               : "UNKNOWN";
1215

1216
        int nKernelRadius = 0;
2,423✔
1217
        GDALResampleFunction pfnResampleFunc =
1218
            GDALGetResampleFunction(pszResampling, &nKernelRadius);
2,423✔
1219
        CPLAssert(pfnResampleFunc);
2,423✔
1220
        GDALDataType eWrkDataType =
1221
            GDALGetOvrWorkDataType(pszResampling, eDataType);
2,423✔
1222
        int nHasNoData = 0;
2,423✔
1223
        double dfNoDataValue = GetNoDataValue(&nHasNoData);
2,423✔
1224
        const bool bHasNoData = CPL_TO_BOOL(nHasNoData);
2,423✔
1225
        if (!bHasNoData)
2,423✔
1226
            dfNoDataValue = 0.0;
2,359✔
1227

1228
        int nDstBlockXSize = nBufXSize;
2,423✔
1229
        int nDstBlockYSize = nBufYSize;
2,423✔
1230
        int nFullResXChunk = 0;
2,423✔
1231
        int nFullResYChunk = 0;
2,423✔
1232
        while (true)
1233
        {
1234
            nFullResXChunk =
2,423✔
1235
                3 + static_cast<int>(nDstBlockXSize * dfXRatioDstToSrc);
2,423✔
1236
            nFullResYChunk =
2,423✔
1237
                3 + static_cast<int>(nDstBlockYSize * dfYRatioDstToSrc);
2,423✔
1238
            if (nFullResXChunk > nRasterXSize)
2,423✔
1239
                nFullResXChunk = nRasterXSize;
2,234✔
1240
            if (nFullResYChunk > nRasterYSize)
2,423✔
1241
                nFullResYChunk = nRasterYSize;
217✔
1242
            if ((nDstBlockXSize == 1 && nDstBlockYSize == 1) ||
2,423✔
1243
                (static_cast<GIntBig>(nFullResXChunk) * nFullResYChunk <=
2,377✔
1244
                 1024 * 1024))
1245
                break;
1246
            // When operating on the full width of a raster whose block width is
1247
            // the raster width, prefer doing chunks in height.
1248
            if (nFullResXChunk >= nXSize && nXSize == nBlockXSize &&
×
1249
                nDstBlockYSize > 1)
1250
                nDstBlockYSize /= 2;
×
1251
            /* Otherwise cut the maximal dimension */
1252
            else if (nDstBlockXSize > 1 &&
×
1253
                     (nFullResXChunk > nFullResYChunk || nDstBlockYSize == 1))
×
1254
                nDstBlockXSize /= 2;
×
1255
            else
1256
                nDstBlockYSize /= 2;
×
1257
        }
1258

1259
        int nOvrXFactor = static_cast<int>(0.5 + dfXRatioDstToSrc);
2,423✔
1260
        int nOvrYFactor = static_cast<int>(0.5 + dfYRatioDstToSrc);
2,423✔
1261
        if (nOvrXFactor == 0)
2,423✔
1262
            nOvrXFactor = 1;
2,024✔
1263
        if (nOvrYFactor == 0)
2,423✔
1264
            nOvrYFactor = 1;
2,023✔
1265
        int nFullResXSizeQueried =
2,423✔
1266
            nFullResXChunk + 2 * nKernelRadius * nOvrXFactor;
2,423✔
1267
        int nFullResYSizeQueried =
2,423✔
1268
            nFullResYChunk + 2 * nKernelRadius * nOvrYFactor;
2,423✔
1269

1270
        if (nFullResXSizeQueried > nRasterXSize)
2,423✔
1271
            nFullResXSizeQueried = nRasterXSize;
2,136✔
1272
        if (nFullResYSizeQueried > nRasterYSize)
2,423✔
1273
            nFullResYSizeQueried = nRasterYSize;
130✔
1274

1275
        void *pChunk =
1276
            VSI_MALLOC3_VERBOSE(GDALGetDataTypeSizeBytes(eWrkDataType),
2,423✔
1277
                                nFullResXSizeQueried, nFullResYSizeQueried);
1278
        GByte *pabyChunkNoDataMask = nullptr;
2,423✔
1279

1280
        GDALRasterBand *poMaskBand = GetMaskBand();
2,423✔
1281
        int l_nMaskFlags = GetMaskFlags();
2,423✔
1282

1283
        bool bUseNoDataMask = ((l_nMaskFlags & GMF_ALL_VALID) == 0);
2,423✔
1284
        if (bUseNoDataMask)
2,423✔
1285
        {
1286
            pabyChunkNoDataMask = static_cast<GByte *>(VSI_MALLOC2_VERBOSE(
126✔
1287
                nFullResXSizeQueried, nFullResYSizeQueried));
1288
        }
1289
        if (pChunk == nullptr ||
2,423✔
1290
            (bUseNoDataMask && pabyChunkNoDataMask == nullptr))
126✔
1291
        {
1292
            GDALClose(poMEMDS);
×
1293
            CPLFree(pChunk);
×
1294
            CPLFree(pabyChunkNoDataMask);
×
1295
            VSIFree(pTempBuffer);
×
1296
            return CE_Failure;
×
1297
        }
1298

1299
        int nTotalBlocks = ((nBufXSize + nDstBlockXSize - 1) / nDstBlockXSize) *
2,423✔
1300
                           ((nBufYSize + nDstBlockYSize - 1) / nDstBlockYSize);
2,423✔
1301
        int nBlocksDone = 0;
2,423✔
1302

1303
        int nDstYOff;
1304
        for (nDstYOff = 0; nDstYOff < nBufYSize && eErr == CE_None;
4,846✔
1305
             nDstYOff += nDstBlockYSize)
2,423✔
1306
        {
1307
            int nDstYCount;
1308
            if (nDstYOff + nDstBlockYSize <= nBufYSize)
2,423✔
1309
                nDstYCount = nDstBlockYSize;
2,423✔
1310
            else
1311
                nDstYCount = nBufYSize - nDstYOff;
×
1312

1313
            int nChunkYOff =
2,423✔
1314
                nYOff + static_cast<int>(nDstYOff * dfYRatioDstToSrc);
2,423✔
1315
            int nChunkYOff2 = nYOff + 1 +
2,423✔
1316
                              static_cast<int>(ceil((nDstYOff + nDstYCount) *
2,423✔
1317
                                                    dfYRatioDstToSrc));
1318
            if (nChunkYOff2 > nRasterYSize)
2,423✔
1319
                nChunkYOff2 = nRasterYSize;
324✔
1320
            int nYCount = nChunkYOff2 - nChunkYOff;
2,423✔
1321
            CPLAssert(nYCount <= nFullResYChunk);
2,423✔
1322

1323
            int nChunkYOffQueried = nChunkYOff - nKernelRadius * nOvrYFactor;
2,423✔
1324
            int nChunkYSizeQueried = nYCount + 2 * nKernelRadius * nOvrYFactor;
2,423✔
1325
            if (nChunkYOffQueried < 0)
2,423✔
1326
            {
1327
                nChunkYSizeQueried += nChunkYOffQueried;
232✔
1328
                nChunkYOffQueried = 0;
232✔
1329
            }
1330
            if (nChunkYSizeQueried + nChunkYOffQueried > nRasterYSize)
2,423✔
1331
                nChunkYSizeQueried = nRasterYSize - nChunkYOffQueried;
332✔
1332
            CPLAssert(nChunkYSizeQueried <= nFullResYSizeQueried);
2,423✔
1333

1334
            int nDstXOff = 0;
2,423✔
1335
            for (nDstXOff = 0; nDstXOff < nBufXSize && eErr == CE_None;
4,846✔
1336
                 nDstXOff += nDstBlockXSize)
2,423✔
1337
            {
1338
                int nDstXCount = 0;
2,423✔
1339
                if (nDstXOff + nDstBlockXSize <= nBufXSize)
2,423✔
1340
                    nDstXCount = nDstBlockXSize;
2,423✔
1341
                else
1342
                    nDstXCount = nBufXSize - nDstXOff;
×
1343

1344
                int nChunkXOff =
2,423✔
1345
                    nXOff + static_cast<int>(nDstXOff * dfXRatioDstToSrc);
2,423✔
1346
                int nChunkXOff2 =
2,423✔
1347
                    nXOff + 1 +
2,423✔
1348
                    static_cast<int>(
2,423✔
1349
                        ceil((nDstXOff + nDstXCount) * dfXRatioDstToSrc));
2,423✔
1350
                if (nChunkXOff2 > nRasterXSize)
2,423✔
1351
                    nChunkXOff2 = nRasterXSize;
2,235✔
1352
                int nXCount = nChunkXOff2 - nChunkXOff;
2,423✔
1353
                CPLAssert(nXCount <= nFullResXChunk);
2,423✔
1354

1355
                int nChunkXOffQueried =
2,423✔
1356
                    nChunkXOff - nKernelRadius * nOvrXFactor;
2,423✔
1357
                int nChunkXSizeQueried =
2,423✔
1358
                    nXCount + 2 * nKernelRadius * nOvrXFactor;
2,423✔
1359
                if (nChunkXOffQueried < 0)
2,423✔
1360
                {
1361
                    nChunkXSizeQueried += nChunkXOffQueried;
2,149✔
1362
                    nChunkXOffQueried = 0;
2,149✔
1363
                }
1364
                if (nChunkXSizeQueried + nChunkXOffQueried > nRasterXSize)
2,423✔
1365
                    nChunkXSizeQueried = nRasterXSize - nChunkXOffQueried;
2,135✔
1366
                CPLAssert(nChunkXSizeQueried <= nFullResXSizeQueried);
2,423✔
1367

1368
                // Read the source buffers.
1369
                eErr = RasterIO(GF_Read, nChunkXOffQueried, nChunkYOffQueried,
2,423✔
1370
                                nChunkXSizeQueried, nChunkYSizeQueried, pChunk,
1371
                                nChunkXSizeQueried, nChunkYSizeQueried,
1372
                                eWrkDataType, 0, 0, nullptr);
1373

1374
                bool bSkipResample = false;
2,423✔
1375
                bool bNoDataMaskFullyOpaque = false;
2,423✔
1376
                if (eErr == CE_None && bUseNoDataMask)
2,423✔
1377
                {
1378
                    eErr = poMaskBand->RasterIO(
126✔
1379
                        GF_Read, nChunkXOffQueried, nChunkYOffQueried,
1380
                        nChunkXSizeQueried, nChunkYSizeQueried,
1381
                        pabyChunkNoDataMask, nChunkXSizeQueried,
1382
                        nChunkYSizeQueried, GDT_Byte, 0, 0, nullptr);
1383

1384
                    /* Optimizations if mask is fully opaque or transparent */
1385
                    int nPixels = nChunkXSizeQueried * nChunkYSizeQueried;
126✔
1386
                    GByte bVal = pabyChunkNoDataMask[0];
126✔
1387
                    int i = 1;
126✔
1388
                    for (; i < nPixels; i++)
241,310✔
1389
                    {
1390
                        if (pabyChunkNoDataMask[i] != bVal)
241,261✔
1391
                            break;
77✔
1392
                    }
1393
                    if (i == nPixels)
126✔
1394
                    {
1395
                        if (bVal == 0)
49✔
1396
                        {
1397
                            for (int j = 0; j < nDstYCount; j++)
712✔
1398
                            {
1399
                                GDALCopyWords64(&dfNoDataValue, GDT_Float64, 0,
686✔
1400
                                                static_cast<GByte *>(pDataMem) +
1401
                                                    nLSMem * (j + nDstYOff) +
686✔
1402
                                                    nDstXOff * nPSMem,
686✔
1403
                                                eDTMem,
1404
                                                static_cast<int>(nPSMem),
1405
                                                nDstXCount);
1406
                            }
1407
                            bSkipResample = true;
26✔
1408
                        }
1409
                        else
1410
                        {
1411
                            bNoDataMaskFullyOpaque = true;
23✔
1412
                        }
1413
                    }
1414
                }
1415

1416
                if (!bSkipResample && eErr == CE_None)
2,423✔
1417
                {
1418
                    const bool bPropagateNoData = false;
2,395✔
1419
                    void *pDstBuffer = nullptr;
2,395✔
1420
                    GDALDataType eDstBufferDataType = GDT_Unknown;
2,395✔
1421
                    GDALRasterBand *poMEMBand =
1422
                        GDALRasterBand::FromHandle(hMEMBand);
2,395✔
1423
                    GDALOverviewResampleArgs args;
2,395✔
1424
                    args.eSrcDataType = eDataType;
2,395✔
1425
                    args.eOvrDataType = poMEMBand->GetRasterDataType();
2,395✔
1426
                    args.nOvrXSize = poMEMBand->GetXSize();
2,395✔
1427
                    args.nOvrYSize = poMEMBand->GetYSize();
2,395✔
1428
                    args.nOvrNBITS = nNBITS;
2,395✔
1429
                    args.dfXRatioDstToSrc = dfXRatioDstToSrc;
2,395✔
1430
                    args.dfYRatioDstToSrc = dfYRatioDstToSrc;
2,395✔
1431
                    args.dfSrcXDelta =
2,395✔
1432
                        dfXOff - nXOff; /* == 0 if bHasXOffVirtual */
2,395✔
1433
                    args.dfSrcYDelta =
2,395✔
1434
                        dfYOff - nYOff; /* == 0 if bHasYOffVirtual */
2,395✔
1435
                    args.eWrkDataType = eWrkDataType;
2,395✔
1436
                    args.pabyChunkNodataMask =
2,395✔
1437
                        bNoDataMaskFullyOpaque ? nullptr : pabyChunkNoDataMask;
2,395✔
1438
                    args.nChunkXOff =
2,395✔
1439
                        nChunkXOffQueried - (bHasXOffVirtual ? 0 : nXOff);
2,395✔
1440
                    args.nChunkXSize = nChunkXSizeQueried;
2,395✔
1441
                    args.nChunkYOff =
2,395✔
1442
                        nChunkYOffQueried - (bHasYOffVirtual ? 0 : nYOff);
2,395✔
1443
                    args.nChunkYSize = nChunkYSizeQueried;
2,395✔
1444
                    args.nDstXOff = nDstXOff + nDestXOffVirtual;
2,395✔
1445
                    args.nDstXOff2 = nDstXOff + nDestXOffVirtual + nDstXCount;
2,395✔
1446
                    args.nDstYOff = nDstYOff + nDestYOffVirtual;
2,395✔
1447
                    args.nDstYOff2 = nDstYOff + nDestYOffVirtual + nDstYCount;
2,395✔
1448
                    args.pszResampling = pszResampling;
2,395✔
1449
                    args.bHasNoData = bHasNoData;
2,395✔
1450
                    args.dfNoDataValue = dfNoDataValue;
2,395✔
1451
                    args.poColorTable = GetColorTable();
2,395✔
1452
                    args.bPropagateNoData = bPropagateNoData;
2,395✔
1453
                    eErr = pfnResampleFunc(args, pChunk, &pDstBuffer,
2,395✔
1454
                                           &eDstBufferDataType);
1455
                    if (eErr == CE_None)
2,395✔
1456
                    {
1457
                        eErr = poMEMBand->RasterIO(
2,395✔
1458
                            GF_Write, nDstXOff + nDestXOffVirtual,
1459
                            nDstYOff + nDestYOffVirtual, nDstXCount, nDstYCount,
1460
                            pDstBuffer, nDstXCount, nDstYCount,
1461
                            eDstBufferDataType, 0, 0, nullptr);
1462
                    }
1463
                    CPLFree(pDstBuffer);
2,395✔
1464
                }
1465

1466
                nBlocksDone++;
2,423✔
1467
                if (eErr == CE_None && psExtraArg->pfnProgress != nullptr &&
2,453✔
1468
                    !psExtraArg->pfnProgress(1.0 * nBlocksDone / nTotalBlocks,
30✔
1469
                                             "", psExtraArg->pProgressData))
1470
                {
1471
                    eErr = CE_Failure;
1✔
1472
                }
1473
            }
1474
        }
1475

1476
        CPLFree(pChunk);
2,423✔
1477
        CPLFree(pabyChunkNoDataMask);
2,423✔
1478
    }
1479

1480
    if (eBufType != eDataType)
2,572✔
1481
    {
1482
        CPL_IGNORE_RET_VAL(poMEMDS->GetRasterBand(1)->RasterIO(
40✔
1483
            GF_Read, nDestXOffVirtual, nDestYOffVirtual, nBufXSize, nBufYSize,
1484
            pData, nBufXSize, nBufYSize, eBufType, nPixelSpace, nLineSpace,
1485
            nullptr));
1486
    }
1487
    GDALClose(poMEMDS);
2,572✔
1488
    VSIFree(pTempBuffer);
2,572✔
1489

1490
    return eErr;
2,572✔
1491
}
1492

1493
/************************************************************************/
1494
/*                          RasterIOResampled()                         */
1495
/************************************************************************/
1496

1497
CPLErr GDALDataset::RasterIOResampled(
297✔
1498
    GDALRWFlag /* eRWFlag */, int nXOff, int nYOff, int nXSize, int nYSize,
1499
    void *pData, int nBufXSize, int nBufYSize, GDALDataType eBufType,
1500
    int nBandCount, const int *panBandMap, GSpacing nPixelSpace,
1501
    GSpacing nLineSpace, GSpacing nBandSpace, GDALRasterIOExtraArg *psExtraArg)
1502

1503
{
1504
#if 0
1505
    // Determine if we use warping resampling or overview resampling
1506
    bool bUseWarp = false;
1507
    if( GDALDataTypeIsComplex( eDataType ) )
1508
        bUseWarp = true;
1509
#endif
1510

1511
    double dfXOff = nXOff;
297✔
1512
    double dfYOff = nYOff;
297✔
1513
    double dfXSize = nXSize;
297✔
1514
    double dfYSize = nYSize;
297✔
1515
    if (psExtraArg->bFloatingPointWindowValidity)
297✔
1516
    {
1517
        dfXOff = psExtraArg->dfXOff;
176✔
1518
        dfYOff = psExtraArg->dfYOff;
176✔
1519
        dfXSize = psExtraArg->dfXSize;
176✔
1520
        dfYSize = psExtraArg->dfYSize;
176✔
1521
    }
1522

1523
    const double dfXRatioDstToSrc = dfXSize / nBufXSize;
297✔
1524
    const double dfYRatioDstToSrc = dfYSize / nBufYSize;
297✔
1525

1526
    // Determine the coordinates in the "virtual" output raster to see
1527
    // if they are integers, in which case we will use them as a shift
1528
    // so that subwindow extracts give the exact same results as entire raster
1529
    // scaling.
1530
    double dfDestXOff = dfXOff / dfXRatioDstToSrc;
297✔
1531
    bool bHasXOffVirtual = false;
297✔
1532
    int nDestXOffVirtual = 0;
297✔
1533
    if (fabs(dfDestXOff - static_cast<int>(dfDestXOff + 0.5)) < 1e-8)
297✔
1534
    {
1535
        bHasXOffVirtual = true;
167✔
1536
        dfXOff = nXOff;
167✔
1537
        nDestXOffVirtual = static_cast<int>(dfDestXOff + 0.5);
167✔
1538
    }
1539

1540
    double dfDestYOff = dfYOff / dfYRatioDstToSrc;
297✔
1541
    bool bHasYOffVirtual = false;
297✔
1542
    int nDestYOffVirtual = 0;
297✔
1543
    if (fabs(dfDestYOff - static_cast<int>(dfDestYOff + 0.5)) < 1e-8)
297✔
1544
    {
1545
        bHasYOffVirtual = true;
129✔
1546
        dfYOff = nYOff;
129✔
1547
        nDestYOffVirtual = static_cast<int>(dfDestYOff + 0.5);
129✔
1548
    }
1549

1550
    // Create a MEM dataset that wraps the output buffer.
1551
    GDALDataset *poMEMDS =
1552
        MEMDataset::Create("", nDestXOffVirtual + nBufXSize,
297✔
1553
                           nDestYOffVirtual + nBufYSize, 0, eBufType, nullptr);
1554
    GDALRasterBand **papoDstBands = static_cast<GDALRasterBand **>(
1555
        CPLMalloc(nBandCount * sizeof(GDALRasterBand *)));
293✔
1556
    int nNBITS = 0;
288✔
1557
    for (int i = 0; i < nBandCount; i++)
1,267✔
1558
    {
1559
        char szBuffer[32] = {'\0'};
982✔
1560
        int nRet = CPLPrintPointer(
1,979✔
1561
            szBuffer,
1562
            static_cast<GByte *>(pData) - nPixelSpace * nDestXOffVirtual -
982✔
1563
                nLineSpace * nDestYOffVirtual + nBandSpace * i,
982✔
1564
            sizeof(szBuffer));
1565
        szBuffer[nRet] = 0;
997✔
1566

1567
        char szBuffer0[64] = {'\0'};
997✔
1568
        snprintf(szBuffer0, sizeof(szBuffer0), "DATAPOINTER=%s", szBuffer);
997✔
1569

1570
        char szBuffer1[64] = {'\0'};
997✔
1571
        snprintf(szBuffer1, sizeof(szBuffer1), "PIXELOFFSET=" CPL_FRMT_GIB,
997✔
1572
                 static_cast<GIntBig>(nPixelSpace));
1573

1574
        char szBuffer2[64] = {'\0'};
997✔
1575
        snprintf(szBuffer2, sizeof(szBuffer2), "LINEOFFSET=" CPL_FRMT_GIB,
997✔
1576
                 static_cast<GIntBig>(nLineSpace));
1577

1578
        char *apszOptions[4] = {szBuffer0, szBuffer1, szBuffer2, nullptr};
997✔
1579

1580
        poMEMDS->AddBand(eBufType, apszOptions);
997✔
1581

1582
        GDALRasterBand *poSrcBand = GetRasterBand(panBandMap[i]);
985✔
1583
        papoDstBands[i] = poMEMDS->GetRasterBand(i + 1);
964✔
1584
        const char *pszNBITS =
1585
            poSrcBand->GetMetadataItem("NBITS", "IMAGE_STRUCTURE");
971✔
1586
        if (pszNBITS)
977✔
1587
        {
1588
            nNBITS = atoi(pszNBITS);
×
1589
            poMEMDS->GetRasterBand(i + 1)->SetMetadataItem("NBITS", pszNBITS,
×
1590
                                                           "IMAGE_STRUCTURE");
×
1591
        }
1592
    }
1593

1594
    CPLErr eErr = CE_None;
285✔
1595

1596
    // TODO(schwehr): Why disabled?  Why not just delete?
1597
    // Looks like this code was initially added as disabled by copying
1598
    // from RasterIO here:
1599
    // https://trac.osgeo.org/gdal/changeset/29572
1600
#if 0
1601
    // Do the resampling.
1602
    if( bUseWarp )
1603
    {
1604
        VRTDatasetH hVRTDS = nullptr;
1605
        GDALRasterBandH hVRTBand = nullptr;
1606
        if( GetDataset() == nullptr )
1607
        {
1608
            /* Create VRT dataset that wraps the whole dataset */
1609
            hVRTDS = VRTCreate(nRasterXSize, nRasterYSize);
1610
            VRTAddBand( hVRTDS, eDataType, nullptr );
1611
            hVRTBand = GDALGetRasterBand(hVRTDS, 1);
1612
            VRTAddSimpleSource( (VRTSourcedRasterBandH)hVRTBand,
1613
                                (GDALRasterBandH)this,
1614
                                0, 0,
1615
                                nRasterXSize, nRasterYSize,
1616
                                0, 0,
1617
                                nRasterXSize, nRasterYSize,
1618
                                nullptr, VRT_NODATA_UNSET );
1619

1620
            /* Add a mask band if needed */
1621
            if( GetMaskFlags() != GMF_ALL_VALID )
1622
            {
1623
                ((GDALDataset*)hVRTDS)->CreateMaskBand(0);
1624
                VRTSourcedRasterBand* poVRTMaskBand =
1625
                    (VRTSourcedRasterBand*)(((GDALRasterBand*)hVRTBand)->GetMaskBand());
1626
                poVRTMaskBand->
1627
                    AddMaskBandSource( this,
1628
                                    0, 0,
1629
                                    nRasterXSize, nRasterYSize,
1630
                                    0, 0,
1631
                                    nRasterXSize, nRasterYSize);
1632
            }
1633
        }
1634

1635
        GDALWarpOptions* psWarpOptions = GDALCreateWarpOptions();
1636
        psWarpOptions->eResampleAlg = (GDALResampleAlg)psExtraArg->eResampleAlg;
1637
        psWarpOptions->hSrcDS = (GDALDatasetH) (hVRTDS ? hVRTDS : GetDataset());
1638
        psWarpOptions->hDstDS = (GDALDatasetH) poMEMDS;
1639
        psWarpOptions->nBandCount = 1;
1640
        int nSrcBandNumber = (hVRTDS ? 1 : nBand);
1641
        int nDstBandNumber = 1;
1642
        psWarpOptions->panSrcBands = &nSrcBandNumber;
1643
        psWarpOptions->panDstBands = &nDstBandNumber;
1644
        psWarpOptions->pfnProgress = psExtraArg->pfnProgress ?
1645
                    psExtraArg->pfnProgress : GDALDummyProgress;
1646
        psWarpOptions->pProgressArg = psExtraArg->pProgressData;
1647
        psWarpOptions->pfnTransformer = GDALRasterIOTransformer;
1648
        GDALRasterIOTransformerStruct sTransformer;
1649
        sTransformer.dfXOff = bHasXOffVirtual ? 0 : dfXOff;
1650
        sTransformer.dfYOff = bHasYOffVirtual ? 0 : dfYOff;
1651
        sTransformer.dfXRatioDstToSrc = dfXRatioDstToSrc;
1652
        sTransformer.dfYRatioDstToSrc = dfYRatioDstToSrc;
1653
        psWarpOptions->pTransformerArg = &sTransformer;
1654

1655
        GDALWarpOperationH hWarpOperation = GDALCreateWarpOperation(psWarpOptions);
1656
        eErr = GDALChunkAndWarpImage( hWarpOperation,
1657
                                      nDestXOffVirtual, nDestYOffVirtual,
1658
                                      nBufXSize, nBufYSize );
1659
        GDALDestroyWarpOperation( hWarpOperation );
1660

1661
        psWarpOptions->panSrcBands = nullptr;
1662
        psWarpOptions->panDstBands = nullptr;
1663
        GDALDestroyWarpOptions( psWarpOptions );
1664

1665
        if( hVRTDS )
1666
            GDALClose(hVRTDS);
1667
    }
1668
    else
1669
#endif
1670
    {
1671
        const char *pszResampling =
285✔
1672
            (psExtraArg->eResampleAlg == GRIORA_Bilinear)      ? "BILINEAR"
455✔
1673
            : (psExtraArg->eResampleAlg == GRIORA_Cubic)       ? "CUBIC"
170✔
1674
            : (psExtraArg->eResampleAlg == GRIORA_CubicSpline) ? "CUBICSPLINE"
×
1675
            : (psExtraArg->eResampleAlg == GRIORA_Lanczos)     ? "LANCZOS"
×
1676
            : (psExtraArg->eResampleAlg == GRIORA_Average)     ? "AVERAGE"
×
1677
            : (psExtraArg->eResampleAlg == GRIORA_RMS)         ? "RMS"
×
1678
            : (psExtraArg->eResampleAlg == GRIORA_Mode)        ? "MODE"
×
1679
            : (psExtraArg->eResampleAlg == GRIORA_Gauss)       ? "GAUSS"
×
1680
                                                               : "UNKNOWN";
1681

1682
        GDALRasterBand *poFirstSrcBand = GetRasterBand(panBandMap[0]);
285✔
1683
        GDALDataType eDataType = poFirstSrcBand->GetRasterDataType();
275✔
1684
        int nBlockXSize, nBlockYSize;
1685
        poFirstSrcBand->GetBlockSize(&nBlockXSize, &nBlockYSize);
278✔
1686

1687
        int nKernelRadius;
1688
        GDALResampleFunction pfnResampleFunc =
1689
            GDALGetResampleFunction(pszResampling, &nKernelRadius);
281✔
1690
        CPLAssert(pfnResampleFunc);
290✔
1691
#ifdef GDAL_ENABLE_RESAMPLING_MULTIBAND
1692
        GDALResampleFunctionMultiBands pfnResampleFuncMultiBands =
1693
            GDALGetResampleFunctionMultiBands(pszResampling, &nKernelRadius);
1694
#endif
1695
        GDALDataType eWrkDataType =
1696
            GDALGetOvrWorkDataType(pszResampling, eDataType);
290✔
1697

1698
        int nDstBlockXSize = nBufXSize;
276✔
1699
        int nDstBlockYSize = nBufYSize;
276✔
1700
        int nFullResXChunk, nFullResYChunk;
1701
        while (true)
1702
        {
1703
            nFullResXChunk =
276✔
1704
                3 + static_cast<int>(nDstBlockXSize * dfXRatioDstToSrc);
276✔
1705
            nFullResYChunk =
276✔
1706
                3 + static_cast<int>(nDstBlockYSize * dfYRatioDstToSrc);
276✔
1707
            if (nFullResXChunk > nRasterXSize)
276✔
1708
                nFullResXChunk = nRasterXSize;
147✔
1709
            if (nFullResYChunk > nRasterYSize)
276✔
1710
                nFullResYChunk = nRasterYSize;
33✔
1711
            if ((nDstBlockXSize == 1 && nDstBlockYSize == 1) ||
276✔
1712
                (static_cast<GIntBig>(nFullResXChunk) * nFullResYChunk <=
274✔
1713
                 1024 * 1024))
1714
                break;
1715
            // When operating on the full width of a raster whose block width is
1716
            // the raster width, prefer doing chunks in height.
1717
            if (nFullResXChunk >= nXSize && nXSize == nBlockXSize &&
×
1718
                nDstBlockYSize > 1)
1719
                nDstBlockYSize /= 2;
×
1720
            /* Otherwise cut the maximal dimension */
1721
            else if (nDstBlockXSize > 1 &&
×
1722
                     (nFullResXChunk > nFullResYChunk || nDstBlockYSize == 1))
×
1723
                nDstBlockXSize /= 2;
×
1724
            else
1725
                nDstBlockYSize /= 2;
×
1726
        }
1727

1728
        int nOvrFactor = std::max(static_cast<int>(0.5 + dfXRatioDstToSrc),
553✔
1729
                                  static_cast<int>(0.5 + dfYRatioDstToSrc));
276✔
1730
        if (nOvrFactor == 0)
277✔
1731
            nOvrFactor = 1;
90✔
1732
        int nFullResXSizeQueried =
277✔
1733
            nFullResXChunk + 2 * nKernelRadius * nOvrFactor;
277✔
1734
        int nFullResYSizeQueried =
277✔
1735
            nFullResYChunk + 2 * nKernelRadius * nOvrFactor;
277✔
1736

1737
        if (nFullResXSizeQueried > nRasterXSize)
277✔
1738
            nFullResXSizeQueried = nRasterXSize;
167✔
1739
        if (nFullResYSizeQueried > nRasterYSize)
277✔
1740
            nFullResYSizeQueried = nRasterYSize;
36✔
1741

1742
        void *pChunk = VSI_MALLOC3_VERBOSE(
277✔
1743
            cpl::fits_on<int>(GDALGetDataTypeSizeBytes(eWrkDataType) *
1744
                              nBandCount),
1745
            nFullResXSizeQueried, nFullResYSizeQueried);
1746
        GByte *pabyChunkNoDataMask = nullptr;
294✔
1747

1748
        GDALRasterBand *poMaskBand = poFirstSrcBand->GetMaskBand();
294✔
1749
        int nMaskFlags = poFirstSrcBand->GetMaskFlags();
296✔
1750

1751
        bool bUseNoDataMask = ((nMaskFlags & GMF_ALL_VALID) == 0);
293✔
1752
        if (bUseNoDataMask)
293✔
1753
        {
1754
            pabyChunkNoDataMask = static_cast<GByte *>(VSI_MALLOC2_VERBOSE(
55✔
1755
                nFullResXSizeQueried, nFullResYSizeQueried));
1756
        }
1757
        if (pChunk == nullptr ||
293✔
1758
            (bUseNoDataMask && pabyChunkNoDataMask == nullptr))
55✔
1759
        {
1760
            GDALClose(poMEMDS);
17✔
1761
            CPLFree(pChunk);
×
1762
            CPLFree(pabyChunkNoDataMask);
×
1763
            CPLFree(papoDstBands);
×
1764
            return CE_Failure;
×
1765
        }
1766

1767
        int nTotalBlocks = ((nBufXSize + nDstBlockXSize - 1) / nDstBlockXSize) *
276✔
1768
                           ((nBufYSize + nDstBlockYSize - 1) / nDstBlockYSize);
276✔
1769
        int nBlocksDone = 0;
276✔
1770

1771
        int nDstYOff;
1772
        for (nDstYOff = 0; nDstYOff < nBufYSize && eErr == CE_None;
581✔
1773
             nDstYOff += nDstBlockYSize)
305✔
1774
        {
1775
            int nDstYCount;
1776
            if (nDstYOff + nDstBlockYSize <= nBufYSize)
280✔
1777
                nDstYCount = nDstBlockYSize;
280✔
1778
            else
1779
                nDstYCount = nBufYSize - nDstYOff;
×
1780

1781
            int nChunkYOff =
280✔
1782
                nYOff + static_cast<int>(nDstYOff * dfYRatioDstToSrc);
280✔
1783
            int nChunkYOff2 = nYOff + 1 +
280✔
1784
                              static_cast<int>(ceil((nDstYOff + nDstYCount) *
280✔
1785
                                                    dfYRatioDstToSrc));
1786
            if (nChunkYOff2 > nRasterYSize)
280✔
1787
                nChunkYOff2 = nRasterYSize;
58✔
1788
            int nYCount = nChunkYOff2 - nChunkYOff;
280✔
1789
            CPLAssert(nYCount <= nFullResYChunk);
280✔
1790

1791
            int nChunkYOffQueried = nChunkYOff - nKernelRadius * nOvrFactor;
280✔
1792
            int nChunkYSizeQueried = nYCount + 2 * nKernelRadius * nOvrFactor;
280✔
1793
            if (nChunkYOffQueried < 0)
280✔
1794
            {
1795
                nChunkYSizeQueried += nChunkYOffQueried;
58✔
1796
                nChunkYOffQueried = 0;
58✔
1797
            }
1798
            if (nChunkYSizeQueried + nChunkYOffQueried > nRasterYSize)
280✔
1799
                nChunkYSizeQueried = nRasterYSize - nChunkYOffQueried;
70✔
1800
            CPLAssert(nChunkYSizeQueried <= nFullResYSizeQueried);
280✔
1801

1802
            int nDstXOff;
1803
            for (nDstXOff = 0; nDstXOff < nBufXSize && eErr == CE_None;
582✔
1804
                 nDstXOff += nDstBlockXSize)
302✔
1805
            {
1806
                int nDstXCount;
1807
                if (nDstXOff + nDstBlockXSize <= nBufXSize)
277✔
1808
                    nDstXCount = nDstBlockXSize;
277✔
1809
                else
1810
                    nDstXCount = nBufXSize - nDstXOff;
×
1811

1812
                int nChunkXOff =
277✔
1813
                    nXOff + static_cast<int>(nDstXOff * dfXRatioDstToSrc);
277✔
1814
                int nChunkXOff2 =
277✔
1815
                    nXOff + 1 +
277✔
1816
                    static_cast<int>(
277✔
1817
                        ceil((nDstXOff + nDstXCount) * dfXRatioDstToSrc));
277✔
1818
                if (nChunkXOff2 > nRasterXSize)
277✔
1819
                    nChunkXOff2 = nRasterXSize;
156✔
1820
                int nXCount = nChunkXOff2 - nChunkXOff;
277✔
1821
                CPLAssert(nXCount <= nFullResXChunk);
277✔
1822

1823
                int nChunkXOffQueried = nChunkXOff - nKernelRadius * nOvrFactor;
277✔
1824
                int nChunkXSizeQueried =
277✔
1825
                    nXCount + 2 * nKernelRadius * nOvrFactor;
277✔
1826
                if (nChunkXOffQueried < 0)
277✔
1827
                {
1828
                    nChunkXSizeQueried += nChunkXOffQueried;
141✔
1829
                    nChunkXOffQueried = 0;
141✔
1830
                }
1831
                if (nChunkXSizeQueried + nChunkXOffQueried > nRasterXSize)
277✔
1832
                    nChunkXSizeQueried = nRasterXSize - nChunkXOffQueried;
151✔
1833
                CPLAssert(nChunkXSizeQueried <= nFullResXSizeQueried);
277✔
1834

1835
                bool bSkipResample = false;
277✔
1836
                bool bNoDataMaskFullyOpaque = false;
277✔
1837
                if (eErr == CE_None && bUseNoDataMask)
277✔
1838
                {
1839
                    eErr = poMaskBand->RasterIO(
55✔
1840
                        GF_Read, nChunkXOffQueried, nChunkYOffQueried,
1841
                        nChunkXSizeQueried, nChunkYSizeQueried,
1842
                        pabyChunkNoDataMask, nChunkXSizeQueried,
1843
                        nChunkYSizeQueried, GDT_Byte, 0, 0, nullptr);
1844

1845
                    /* Optimizations if mask is fully opaque or transparent */
1846
                    const int nPixels = nChunkXSizeQueried * nChunkYSizeQueried;
55✔
1847
                    const GByte bVal = pabyChunkNoDataMask[0];
55✔
1848
                    int i = 1;  // Used after for.
55✔
1849
                    for (; i < nPixels; i++)
123,794✔
1850
                    {
1851
                        if (pabyChunkNoDataMask[i] != bVal)
123,777✔
1852
                            break;
38✔
1853
                    }
1854
                    if (i == nPixels)
55✔
1855
                    {
1856
                        if (bVal == 0)
17✔
1857
                        {
1858
                            GByte abyZero[16] = {0};
16✔
1859
                            for (int iBand = 0; iBand < nBandCount; iBand++)
64✔
1860
                            {
1861
                                for (int j = 0; j < nDstYCount; j++)
2,016✔
1862
                                {
1863
                                    GDALCopyWords64(
1,968✔
1864
                                        abyZero, GDT_Byte, 0,
1865
                                        static_cast<GByte *>(pData) +
1866
                                            iBand * nBandSpace +
1,968✔
1867
                                            nLineSpace * (j + nDstYOff) +
1,968✔
1868
                                            nDstXOff * nPixelSpace,
1,968✔
1869
                                        eBufType, static_cast<int>(nPixelSpace),
1870
                                        nDstXCount);
1871
                                }
1872
                            }
1873
                            bSkipResample = true;
16✔
1874
                        }
1875
                        else
1876
                        {
1877
                            bNoDataMaskFullyOpaque = true;
1✔
1878
                        }
1879
                    }
1880
                }
1881

1882
                if (!bSkipResample && eErr == CE_None)
277✔
1883
                {
1884
                    /* Read the source buffers */
1885
                    eErr = RasterIO(
256✔
1886
                        GF_Read, nChunkXOffQueried, nChunkYOffQueried,
1887
                        nChunkXSizeQueried, nChunkYSizeQueried, pChunk,
1888
                        nChunkXSizeQueried, nChunkYSizeQueried, eWrkDataType,
1889
                        nBandCount, panBandMap, 0, 0, 0, nullptr);
1890
                }
1891

1892
#ifdef GDAL_ENABLE_RESAMPLING_MULTIBAND
1893
                if (pfnResampleFuncMultiBands && !bSkipResample &&
1894
                    eErr == CE_None)
1895
                {
1896
                    eErr = pfnResampleFuncMultiBands(
1897
                        dfXRatioDstToSrc, dfYRatioDstToSrc,
1898
                        dfXOff - nXOff, /* == 0 if bHasXOffVirtual */
1899
                        dfYOff - nYOff, /* == 0 if bHasYOffVirtual */
1900
                        eWrkDataType, (GByte *)pChunk, nBandCount,
1901
                        bNoDataMaskFullyOpaque ? nullptr : pabyChunkNoDataMask,
1902
                        nChunkXOffQueried - (bHasXOffVirtual ? 0 : nXOff),
1903
                        nChunkXSizeQueried,
1904
                        nChunkYOffQueried - (bHasYOffVirtual ? 0 : nYOff),
1905
                        nChunkYSizeQueried, nDstXOff + nDestXOffVirtual,
1906
                        nDstXOff + nDestXOffVirtual + nDstXCount,
1907
                        nDstYOff + nDestYOffVirtual,
1908
                        nDstYOff + nDestYOffVirtual + nDstYCount, papoDstBands,
1909
                        pszResampling, FALSE /*bHasNoData*/,
1910
                        0.0 /* dfNoDataValue */, nullptr /* color table*/,
1911
                        eDataType);
1912
                }
1913
                else
1914
#endif
1915
                {
1916
                    size_t nChunkBandOffset =
1917
                        static_cast<size_t>(nChunkXSizeQueried) *
301✔
1918
                        nChunkYSizeQueried *
301✔
1919
                        GDALGetDataTypeSizeBytes(eWrkDataType);
301✔
1920
                    for (int i = 0;
1,251✔
1921
                         i < nBandCount && !bSkipResample && eErr == CE_None;
1,251✔
1922
                         i++)
1923
                    {
1924
                        const bool bPropagateNoData = false;
949✔
1925
                        void *pDstBuffer = nullptr;
949✔
1926
                        GDALDataType eDstBufferDataType = GDT_Unknown;
949✔
1927
                        GDALRasterBand *poMEMBand =
1928
                            poMEMDS->GetRasterBand(i + 1);
949✔
1929
                        GDALOverviewResampleArgs args;
947✔
1930
                        args.eSrcDataType = eDataType;
947✔
1931
                        args.eOvrDataType = poMEMBand->GetRasterDataType();
947✔
1932
                        args.nOvrXSize = poMEMBand->GetXSize();
949✔
1933
                        args.nOvrYSize = poMEMBand->GetYSize();
948✔
1934
                        args.nOvrNBITS = nNBITS;
948✔
1935
                        args.dfXRatioDstToSrc = dfXRatioDstToSrc;
948✔
1936
                        args.dfYRatioDstToSrc = dfYRatioDstToSrc;
948✔
1937
                        args.dfSrcXDelta =
948✔
1938
                            dfXOff - nXOff; /* == 0 if bHasXOffVirtual */
948✔
1939
                        args.dfSrcYDelta =
948✔
1940
                            dfYOff - nYOff; /* == 0 if bHasYOffVirtual */
948✔
1941
                        args.eWrkDataType = eWrkDataType;
948✔
1942
                        args.pabyChunkNodataMask = bNoDataMaskFullyOpaque
948✔
1943
                                                       ? nullptr
948✔
1944
                                                       : pabyChunkNoDataMask;
1945
                        args.nChunkXOff =
948✔
1946
                            nChunkXOffQueried - (bHasXOffVirtual ? 0 : nXOff);
948✔
1947
                        args.nChunkXSize = nChunkXSizeQueried;
948✔
1948
                        args.nChunkYOff =
948✔
1949
                            nChunkYOffQueried - (bHasYOffVirtual ? 0 : nYOff);
948✔
1950
                        args.nChunkYSize = nChunkYSizeQueried;
948✔
1951
                        args.nDstXOff = nDstXOff + nDestXOffVirtual;
948✔
1952
                        args.nDstXOff2 =
948✔
1953
                            nDstXOff + nDestXOffVirtual + nDstXCount;
948✔
1954
                        args.nDstYOff = nDstYOff + nDestYOffVirtual;
948✔
1955
                        args.nDstYOff2 =
948✔
1956
                            nDstYOff + nDestYOffVirtual + nDstYCount;
948✔
1957
                        args.pszResampling = pszResampling;
948✔
1958
                        args.bHasNoData = false;
948✔
1959
                        args.dfNoDataValue = 0.0;
948✔
1960
                        args.poColorTable = nullptr;
948✔
1961
                        args.bPropagateNoData = bPropagateNoData;
948✔
1962

1963
                        eErr =
1964
                            pfnResampleFunc(args,
1,902✔
1965
                                            reinterpret_cast<GByte *>(pChunk) +
948✔
1966
                                                i * nChunkBandOffset,
948✔
1967
                                            &pDstBuffer, &eDstBufferDataType);
1968
                        if (eErr == CE_None)
954✔
1969
                        {
1970
                            eErr = poMEMBand->RasterIO(
954✔
1971
                                GF_Write, nDstXOff + nDestXOffVirtual,
1972
                                nDstYOff + nDestYOffVirtual, nDstXCount,
1973
                                nDstYCount, pDstBuffer, nDstXCount, nDstYCount,
1974
                                eDstBufferDataType, 0, 0, nullptr);
1975
                        }
1976
                        CPLFree(pDstBuffer);
954✔
1977
                    }
1978
                }
1979

1980
                nBlocksDone++;
302✔
1981
                if (eErr == CE_None && psExtraArg->pfnProgress != nullptr &&
304✔
1982
                    !psExtraArg->pfnProgress(1.0 * nBlocksDone / nTotalBlocks,
2✔
1983
                                             "", psExtraArg->pProgressData))
1984
                {
1985
                    eErr = CE_Failure;
×
1986
                }
1987
            }
1988
        }
1989

1990
        CPLFree(pChunk);
301✔
1991
        CPLFree(pabyChunkNoDataMask);
300✔
1992
    }
1993

1994
    CPLFree(papoDstBands);
300✔
1995
    GDALClose(poMEMDS);
300✔
1996

1997
    return eErr;
300✔
1998
}
1999

2000
//! @endcond
2001

2002
/************************************************************************/
2003
/*                           GDALSwapWords()                            */
2004
/************************************************************************/
2005

2006
/**
2007
 * Byte swap words in-place.
2008
 *
2009
 * This function will byte swap a set of 2, 4 or 8 byte words "in place" in
2010
 * a memory array.  No assumption is made that the words being swapped are
2011
 * word aligned in memory.  Use the CPL_LSB and CPL_MSB macros from cpl_port.h
2012
 * to determine if the current platform is big endian or little endian.  Use
2013
 * The macros like CPL_SWAP32() to byte swap single values without the overhead
2014
 * of a function call.
2015
 *
2016
 * @param pData pointer to start of data buffer.
2017
 * @param nWordSize size of words being swapped in bytes. Normally 2, 4 or 8.
2018
 * @param nWordCount the number of words to be swapped in this call.
2019
 * @param nWordSkip the byte offset from the start of one word to the start of
2020
 * the next. For packed buffers this is the same as nWordSize.
2021
 */
2022

2023
void CPL_STDCALL GDALSwapWords(void *pData, int nWordSize, int nWordCount,
438,921✔
2024
                               int nWordSkip)
2025

2026
{
2027
    if (nWordCount > 0)
438,921✔
2028
        VALIDATE_POINTER0(pData, "GDALSwapWords");
438,921✔
2029

2030
    GByte *pabyData = static_cast<GByte *>(pData);
438,921✔
2031

2032
    switch (nWordSize)
438,921✔
2033
    {
2034
        case 1:
7,234✔
2035
            break;
7,234✔
2036

2037
        case 2:
418,687✔
2038
            CPLAssert(nWordSkip >= 2 || nWordCount == 1);
418,687✔
2039
            for (int i = 0; i < nWordCount; i++)
289,291,000✔
2040
            {
2041
                CPL_SWAP16PTR(pabyData);
288,873,000✔
2042
                pabyData += nWordSkip;
288,873,000✔
2043
            }
2044
            break;
418,687✔
2045

2046
        case 4:
10,514✔
2047
            CPLAssert(nWordSkip >= 4 || nWordCount == 1);
10,514✔
2048
            if (CPL_IS_ALIGNED(pabyData, 4) && (nWordSkip % 4) == 0)
10,514✔
2049
            {
2050
                for (int i = 0; i < nWordCount; i++)
29,139,300✔
2051
                {
2052
                    *reinterpret_cast<GUInt32 *>(pabyData) = CPL_SWAP32(
29,128,800✔
2053
                        *reinterpret_cast<const GUInt32 *>(pabyData));
2054
                    pabyData += nWordSkip;
29,128,800✔
2055
                }
10,511✔
2056
            }
2057
            else
2058
            {
2059
                for (int i = 0; i < nWordCount; i++)
9✔
2060
                {
2061
                    CPL_SWAP32PTR(pabyData);
6✔
2062
                    pabyData += nWordSkip;
6✔
2063
                }
2064
            }
2065
            break;
10,514✔
2066

2067
        case 8:
2,486✔
2068
            CPLAssert(nWordSkip >= 8 || nWordCount == 1);
2,486✔
2069
            if (CPL_IS_ALIGNED(pabyData, 8) && (nWordSkip % 8) == 0)
2,486✔
2070
            {
2071
                for (int i = 0; i < nWordCount; i++)
3,358,160✔
2072
                {
2073
                    *reinterpret_cast<GUInt64 *>(pabyData) = CPL_SWAP64(
3,355,680✔
2074
                        *reinterpret_cast<const GUInt64 *>(pabyData));
2075
                    pabyData += nWordSkip;
3,355,680✔
2076
                }
2,485✔
2077
            }
2078
            else
2079
            {
2080
                for (int i = 0; i < nWordCount; i++)
3✔
2081
                {
2082
                    CPL_SWAP64PTR(pabyData);
2✔
2083
                    pabyData += nWordSkip;
2✔
2084
                }
2085
            }
2086
            break;
2,486✔
2087

2088
        default:
×
2089
            CPLAssert(false);
×
2090
    }
2091
}
2092

2093
/************************************************************************/
2094
/*                           GDALSwapWordsEx()                          */
2095
/************************************************************************/
2096

2097
/**
2098
 * Byte swap words in-place.
2099
 *
2100
 * This function will byte swap a set of 2, 4 or 8 byte words "in place" in
2101
 * a memory array.  No assumption is made that the words being swapped are
2102
 * word aligned in memory.  Use the CPL_LSB and CPL_MSB macros from cpl_port.h
2103
 * to determine if the current platform is big endian or little endian.  Use
2104
 * The macros like CPL_SWAP32() to byte swap single values without the overhead
2105
 * of a function call.
2106
 *
2107
 * @param pData pointer to start of data buffer.
2108
 * @param nWordSize size of words being swapped in bytes. Normally 2, 4 or 8.
2109
 * @param nWordCount the number of words to be swapped in this call.
2110
 * @param nWordSkip the byte offset from the start of one word to the start of
2111
 * the next. For packed buffers this is the same as nWordSize.
2112
 * @since GDAL 2.1
2113
 */
2114
void CPL_STDCALL GDALSwapWordsEx(void *pData, int nWordSize, size_t nWordCount,
6,118✔
2115
                                 int nWordSkip)
2116
{
2117
    GByte *pabyData = static_cast<GByte *>(pData);
6,118✔
2118
    while (nWordCount)
12,236✔
2119
    {
2120
        // Pick-up a multiple of 8 as max chunk size.
2121
        const int nWordCountSmall =
6,118✔
2122
            (nWordCount > (1 << 30)) ? (1 << 30) : static_cast<int>(nWordCount);
6,118✔
2123
        GDALSwapWords(pabyData, nWordSize, nWordCountSmall, nWordSkip);
6,118✔
2124
        pabyData += static_cast<size_t>(nWordSkip) * nWordCountSmall;
6,118✔
2125
        nWordCount -= nWordCountSmall;
6,118✔
2126
    }
2127
}
6,118✔
2128

2129
// Place the new GDALCopyWords helpers in an anonymous namespace
2130
namespace
2131
{
2132

2133
/************************************************************************/
2134
/*                           GDALCopyWordsT()                           */
2135
/************************************************************************/
2136
/**
2137
 * Template function, used to copy data from pSrcData into buffer
2138
 * pDstData, with stride nSrcPixelStride in the source data and
2139
 * stride nDstPixelStride in the destination data. This template can
2140
 * deal with the case where the input data type is real or complex and
2141
 * the output is real.
2142
 *
2143
 * @param pSrcData the source data buffer
2144
 * @param nSrcPixelStride the stride, in the buffer pSrcData for pixels
2145
 *                      of interest.
2146
 * @param pDstData the destination buffer.
2147
 * @param nDstPixelStride the stride in the buffer pDstData for pixels of
2148
 *                      interest.
2149
 * @param nWordCount the total number of pixel words to copy
2150
 *
2151
 * @code
2152
 * // Assume an input buffer of type GUInt16 named pBufferIn
2153
 * GByte *pBufferOut = new GByte[numBytesOut];
2154
 * GDALCopyWordsT<GUInt16, GByte>(pSrcData, 2, pDstData, 1, numBytesOut);
2155
 * @endcode
2156
 * @note
2157
 * This is a private function, and should not be exposed outside of
2158
 * rasterio.cpp. External users should call the GDALCopyWords driver function.
2159
 */
2160

2161
template <class Tin, class Tout>
2162
static void inline GDALCopyWordsGenericT(const Tin *const CPL_RESTRICT pSrcData,
46,673,638✔
2163
                                         int nSrcPixelStride,
2164
                                         Tout *const CPL_RESTRICT pDstData,
2165
                                         int nDstPixelStride,
2166
                                         GPtrDiff_t nWordCount)
2167
{
2168
    decltype(nWordCount) nDstOffset = 0;
46,673,638✔
2169

2170
    const char *const pSrcDataPtr = reinterpret_cast<const char *>(pSrcData);
46,673,638✔
2171
    char *const pDstDataPtr = reinterpret_cast<char *>(pDstData);
46,673,638✔
2172
    for (decltype(nWordCount) n = 0; n < nWordCount; n++)
591,864,967✔
2173
    {
2174
        const Tin tValue =
545,176,998✔
2175
            *reinterpret_cast<const Tin *>(pSrcDataPtr + (n * nSrcPixelStride));
545,176,998✔
2176
        Tout *const pOutPixel =
545,176,998✔
2177
            reinterpret_cast<Tout *>(pDstDataPtr + nDstOffset);
545,176,998✔
2178

2179
        GDALCopyWord(tValue, *pOutPixel);
545,176,998✔
2180

2181
        nDstOffset += nDstPixelStride;
545,191,198✔
2182
    }
2183
}
46,687,835✔
2184

2185
template <class Tin, class Tout>
2186
static void inline GDALCopyWordsT(const Tin *const CPL_RESTRICT pSrcData,
38,246,868✔
2187
                                  int nSrcPixelStride,
2188
                                  Tout *const CPL_RESTRICT pDstData,
2189
                                  int nDstPixelStride, GPtrDiff_t nWordCount)
2190
{
2191
    GDALCopyWordsGenericT(pSrcData, nSrcPixelStride, pDstData, nDstPixelStride,
38,246,868✔
2192
                          nWordCount);
2193
}
38,246,925✔
2194

2195
template <class Tin, class Tout>
2196
static void inline GDALCopyWordsT_8atatime(
199,543✔
2197
    const Tin *const CPL_RESTRICT pSrcData, int nSrcPixelStride,
2198
    Tout *const CPL_RESTRICT pDstData, int nDstPixelStride,
2199
    GPtrDiff_t nWordCount)
2200
{
2201
    decltype(nWordCount) nDstOffset = 0;
199,543✔
2202

2203
    const char *const pSrcDataPtr = reinterpret_cast<const char *>(pSrcData);
199,543✔
2204
    char *const pDstDataPtr = reinterpret_cast<char *>(pDstData);
199,543✔
2205
    decltype(nWordCount) n = 0;
199,543✔
2206
    if (nSrcPixelStride == static_cast<int>(sizeof(Tin)) &&
199,543✔
2207
        nDstPixelStride == static_cast<int>(sizeof(Tout)))
2208
    {
2209
        for (; n < nWordCount - 7; n += 8)
22,981,310✔
2210
        {
2211
            const Tin *pInValues = reinterpret_cast<const Tin *>(
22,776,448✔
2212
                pSrcDataPtr + (n * nSrcPixelStride));
22,776,448✔
2213
            Tout *const pOutPixels =
22,776,448✔
2214
                reinterpret_cast<Tout *>(pDstDataPtr + nDstOffset);
22,776,448✔
2215

2216
            GDALCopy8Words(pInValues, pOutPixels);
22,776,448✔
2217

2218
            nDstOffset += 8 * nDstPixelStride;
22,782,768✔
2219
        }
2220
    }
2221
    for (; n < nWordCount; n++)
694,146✔
2222
    {
2223
        const Tin tValue =
494,598✔
2224
            *reinterpret_cast<const Tin *>(pSrcDataPtr + (n * nSrcPixelStride));
494,598✔
2225
        Tout *const pOutPixel =
494,598✔
2226
            reinterpret_cast<Tout *>(pDstDataPtr + nDstOffset);
494,598✔
2227

2228
        GDALCopyWord(tValue, *pOutPixel);
494,598✔
2229

2230
        nDstOffset += nDstPixelStride;
488,275✔
2231
    }
2232
}
199,548✔
2233

2234
#ifdef HAVE_SSE2
2235

2236
template <class Tout>
2237
void GDALCopyWordsByteTo16Bit(const GByte *const CPL_RESTRICT pSrcData,
38,070✔
2238
                              int nSrcPixelStride,
2239
                              Tout *const CPL_RESTRICT pDstData,
2240
                              int nDstPixelStride, GPtrDiff_t nWordCount)
2241
{
2242
    static_assert(std::is_integral<Tout>::value &&
2243
                      sizeof(Tout) == sizeof(uint16_t),
2244
                  "Bad Tout");
2245
    if (nSrcPixelStride == static_cast<int>(sizeof(*pSrcData)) &&
38,070✔
2246
        nDstPixelStride == static_cast<int>(sizeof(*pDstData)))
2247
    {
2248
        decltype(nWordCount) n = 0;
32,019✔
2249
        const __m128i xmm_zero = _mm_setzero_si128();
32,019✔
2250
        GByte *CPL_RESTRICT pabyDstDataPtr =
32,019✔
2251
            reinterpret_cast<GByte *>(pDstData);
2252
        for (; n < nWordCount - 15; n += 16)
1,410,371✔
2253
        {
2254
            __m128i xmm = _mm_loadu_si128(
1,378,352✔
2255
                reinterpret_cast<const __m128i *>(pSrcData + n));
1,378,352✔
2256
            __m128i xmm0 = _mm_unpacklo_epi8(xmm, xmm_zero);
1,378,352✔
2257
            __m128i xmm1 = _mm_unpackhi_epi8(xmm, xmm_zero);
1,378,352✔
2258
            _mm_storeu_si128(
2259
                reinterpret_cast<__m128i *>(pabyDstDataPtr + n * 2), xmm0);
1,378,352✔
2260
            _mm_storeu_si128(
2261
                reinterpret_cast<__m128i *>(pabyDstDataPtr + n * 2 + 16), xmm1);
1,378,352✔
2262
        }
2263
        for (; n < nWordCount; n++)
105,177✔
2264
        {
2265
            pDstData[n] = pSrcData[n];
73,158✔
2266
        }
32,019✔
2267
    }
2268
    else
2269
    {
2270
        GDALCopyWordsGenericT(pSrcData, nSrcPixelStride, pDstData,
6,051✔
2271
                              nDstPixelStride, nWordCount);
2272
    }
2273
}
38,070✔
2274

2275
template <>
2276
void GDALCopyWordsT(const GByte *const CPL_RESTRICT pSrcData,
25,764✔
2277
                    int nSrcPixelStride, GUInt16 *const CPL_RESTRICT pDstData,
2278
                    int nDstPixelStride, GPtrDiff_t nWordCount)
2279
{
2280
    GDALCopyWordsByteTo16Bit(pSrcData, nSrcPixelStride, pDstData,
25,764✔
2281
                             nDstPixelStride, nWordCount);
2282
}
25,764✔
2283

2284
template <>
2285
void GDALCopyWordsT(const GByte *const CPL_RESTRICT pSrcData,
12,306✔
2286
                    int nSrcPixelStride, GInt16 *const CPL_RESTRICT pDstData,
2287
                    int nDstPixelStride, GPtrDiff_t nWordCount)
2288
{
2289
    GDALCopyWordsByteTo16Bit(pSrcData, nSrcPixelStride, pDstData,
12,306✔
2290
                             nDstPixelStride, nWordCount);
2291
}
12,306✔
2292

2293
template <class Tout>
2294
void GDALCopyWordsByteTo32Bit(const GByte *const CPL_RESTRICT pSrcData,
12,281,938✔
2295
                              int nSrcPixelStride,
2296
                              Tout *const CPL_RESTRICT pDstData,
2297
                              int nDstPixelStride, GPtrDiff_t nWordCount)
2298
{
2299
    static_assert(std::is_integral<Tout>::value &&
2300
                      sizeof(Tout) == sizeof(uint32_t),
2301
                  "Bad Tout");
2302
    if (nSrcPixelStride == static_cast<int>(sizeof(*pSrcData)) &&
12,281,938✔
2303
        nDstPixelStride == static_cast<int>(sizeof(*pDstData)))
2304
    {
2305
        decltype(nWordCount) n = 0;
6,202,938✔
2306
        const __m128i xmm_zero = _mm_setzero_si128();
6,202,938✔
2307
        GByte *CPL_RESTRICT pabyDstDataPtr =
6,202,938✔
2308
            reinterpret_cast<GByte *>(pDstData);
2309
        for (; n < nWordCount - 15; n += 16)
69,310,852✔
2310
        {
2311
            __m128i xmm = _mm_loadu_si128(
63,251,424✔
2312
                reinterpret_cast<const __m128i *>(pSrcData + n));
63,251,424✔
2313
            __m128i xmm_low = _mm_unpacklo_epi8(xmm, xmm_zero);
63,193,724✔
2314
            __m128i xmm_high = _mm_unpackhi_epi8(xmm, xmm_zero);
63,165,024✔
2315
            __m128i xmm0 = _mm_unpacklo_epi16(xmm_low, xmm_zero);
63,039,824✔
2316
            __m128i xmm1 = _mm_unpackhi_epi16(xmm_low, xmm_zero);
62,969,424✔
2317
            __m128i xmm2 = _mm_unpacklo_epi16(xmm_high, xmm_zero);
63,011,824✔
2318
            __m128i xmm3 = _mm_unpackhi_epi16(xmm_high, xmm_zero);
63,107,924✔
2319
            _mm_storeu_si128(
2320
                reinterpret_cast<__m128i *>(pabyDstDataPtr + n * 4), xmm0);
63,107,924✔
2321
            _mm_storeu_si128(
2322
                reinterpret_cast<__m128i *>(pabyDstDataPtr + n * 4 + 16), xmm1);
63,107,924✔
2323
            _mm_storeu_si128(
2324
                reinterpret_cast<__m128i *>(pabyDstDataPtr + n * 4 + 32), xmm2);
63,107,924✔
2325
            _mm_storeu_si128(
2326
                reinterpret_cast<__m128i *>(pabyDstDataPtr + n * 4 + 48), xmm3);
63,107,924✔
2327
        }
2328
        for (; n < nWordCount; n++)
14,139,439✔
2329
        {
2330
            pDstData[n] = pSrcData[n];
8,080,011✔
2331
        }
6,059,438✔
2332
    }
2333
    else
2334
    {
2335
        GDALCopyWordsGenericT(pSrcData, nSrcPixelStride, pDstData,
6,078,960✔
2336
                              nDstPixelStride, nWordCount);
2337
    }
2338
}
12,110,938✔
2339

2340
template <>
2341
void GDALCopyWordsT(const GByte *const CPL_RESTRICT pSrcData,
438✔
2342
                    int nSrcPixelStride, GUInt32 *const CPL_RESTRICT pDstData,
2343
                    int nDstPixelStride, GPtrDiff_t nWordCount)
2344
{
2345
    GDALCopyWordsByteTo32Bit(pSrcData, nSrcPixelStride, pDstData,
438✔
2346
                             nDstPixelStride, nWordCount);
2347
}
438✔
2348

2349
template <>
2350
void GDALCopyWordsT(const GByte *const CPL_RESTRICT pSrcData,
12,293,500✔
2351
                    int nSrcPixelStride, GInt32 *const CPL_RESTRICT pDstData,
2352
                    int nDstPixelStride, GPtrDiff_t nWordCount)
2353
{
2354
    GDALCopyWordsByteTo32Bit(pSrcData, nSrcPixelStride, pDstData,
12,293,500✔
2355
                             nDstPixelStride, nWordCount);
2356
}
12,297,200✔
2357

2358
template <>
2359
void GDALCopyWordsT(const GByte *const CPL_RESTRICT pSrcData,
2,470,600✔
2360
                    int nSrcPixelStride, float *const CPL_RESTRICT pDstData,
2361
                    int nDstPixelStride, GPtrDiff_t nWordCount)
2362
{
2363
    if (nSrcPixelStride == static_cast<int>(sizeof(*pSrcData)) &&
2,470,600✔
2364
        nDstPixelStride == static_cast<int>(sizeof(*pDstData)))
2365
    {
2366
        decltype(nWordCount) n = 0;
111,152✔
2367
        const __m128i xmm_zero = _mm_setzero_si128();
111,152✔
2368
        GByte *CPL_RESTRICT pabyDstDataPtr =
111,152✔
2369
            reinterpret_cast<GByte *>(pDstData);
2370
        for (; n < nWordCount - 15; n += 16)
3,272,970✔
2371
        {
2372
            __m128i xmm = _mm_loadu_si128(
3,161,820✔
2373
                reinterpret_cast<const __m128i *>(pSrcData + n));
3,161,820✔
2374
            __m128i xmm_low = _mm_unpacklo_epi8(xmm, xmm_zero);
3,161,820✔
2375
            __m128i xmm_high = _mm_unpackhi_epi8(xmm, xmm_zero);
3,161,820✔
2376
            __m128i xmm0 = _mm_unpacklo_epi16(xmm_low, xmm_zero);
3,161,820✔
2377
            __m128i xmm1 = _mm_unpackhi_epi16(xmm_low, xmm_zero);
3,161,820✔
2378
            __m128i xmm2 = _mm_unpacklo_epi16(xmm_high, xmm_zero);
3,161,820✔
2379
            __m128i xmm3 = _mm_unpackhi_epi16(xmm_high, xmm_zero);
3,161,820✔
2380
            __m128 xmm0_f = _mm_cvtepi32_ps(xmm0);
3,161,820✔
2381
            __m128 xmm1_f = _mm_cvtepi32_ps(xmm1);
3,161,820✔
2382
            __m128 xmm2_f = _mm_cvtepi32_ps(xmm2);
3,161,820✔
2383
            __m128 xmm3_f = _mm_cvtepi32_ps(xmm3);
3,161,820✔
2384
            _mm_storeu_ps(reinterpret_cast<float *>(pabyDstDataPtr + n * 4),
3,161,820✔
2385
                          xmm0_f);
2386
            _mm_storeu_ps(
2387
                reinterpret_cast<float *>(pabyDstDataPtr + n * 4 + 16), xmm1_f);
3,161,820✔
2388
            _mm_storeu_ps(
2389
                reinterpret_cast<float *>(pabyDstDataPtr + n * 4 + 32), xmm2_f);
3,161,820✔
2390
            _mm_storeu_ps(
2391
                reinterpret_cast<float *>(pabyDstDataPtr + n * 4 + 48), xmm3_f);
3,161,820✔
2392
        }
2393
        for (; n < nWordCount; n++)
472,130✔
2394
        {
2395
            pDstData[n] = pSrcData[n];
360,978✔
2396
        }
111,152✔
2397
    }
2398
    else
2399
    {
2400
        GDALCopyWordsGenericT(pSrcData, nSrcPixelStride, pDstData,
2,359,440✔
2401
                              nDstPixelStride, nWordCount);
2402
    }
2403
}
2,470,600✔
2404

2405
template <>
2406
void GDALCopyWordsT(const GByte *const CPL_RESTRICT pSrcData,
147,658✔
2407
                    int nSrcPixelStride, double *const CPL_RESTRICT pDstData,
2408
                    int nDstPixelStride, GPtrDiff_t nWordCount)
2409
{
2410
    if (nSrcPixelStride == static_cast<int>(sizeof(*pSrcData)) &&
147,658✔
2411
        nDstPixelStride == static_cast<int>(sizeof(*pDstData)))
2412
    {
2413
        decltype(nWordCount) n = 0;
124,604✔
2414
        const __m128i xmm_zero = _mm_setzero_si128();
124,604✔
2415
        GByte *CPL_RESTRICT pabyDstDataPtr =
124,604✔
2416
            reinterpret_cast<GByte *>(pDstData);
2417
        for (; n < nWordCount - 15; n += 16)
1,425,580✔
2418
        {
2419
            __m128i xmm = _mm_loadu_si128(
1,300,980✔
2420
                reinterpret_cast<const __m128i *>(pSrcData + n));
1,300,980✔
2421
            __m128i xmm_low = _mm_unpacklo_epi8(xmm, xmm_zero);
1,300,980✔
2422
            __m128i xmm_high = _mm_unpackhi_epi8(xmm, xmm_zero);
1,300,980✔
2423
            __m128i xmm0 = _mm_unpacklo_epi16(xmm_low, xmm_zero);
1,300,980✔
2424
            __m128i xmm1 = _mm_unpackhi_epi16(xmm_low, xmm_zero);
1,300,980✔
2425
            __m128i xmm2 = _mm_unpacklo_epi16(xmm_high, xmm_zero);
1,300,980✔
2426
            __m128i xmm3 = _mm_unpackhi_epi16(xmm_high, xmm_zero);
1,300,980✔
2427

2428
            __m128d xmm0_low_d = _mm_cvtepi32_pd(xmm0);
1,300,980✔
2429
            __m128d xmm1_low_d = _mm_cvtepi32_pd(xmm1);
1,300,980✔
2430
            __m128d xmm2_low_d = _mm_cvtepi32_pd(xmm2);
1,300,980✔
2431
            __m128d xmm3_low_d = _mm_cvtepi32_pd(xmm3);
1,300,980✔
2432
            xmm0 = _mm_srli_si128(xmm0, 8);
1,300,980✔
2433
            xmm1 = _mm_srli_si128(xmm1, 8);
1,300,980✔
2434
            xmm2 = _mm_srli_si128(xmm2, 8);
1,300,980✔
2435
            xmm3 = _mm_srli_si128(xmm3, 8);
1,300,980✔
2436
            __m128d xmm0_high_d = _mm_cvtepi32_pd(xmm0);
1,300,980✔
2437
            __m128d xmm1_high_d = _mm_cvtepi32_pd(xmm1);
1,300,980✔
2438
            __m128d xmm2_high_d = _mm_cvtepi32_pd(xmm2);
1,300,980✔
2439
            __m128d xmm3_high_d = _mm_cvtepi32_pd(xmm3);
1,300,980✔
2440

2441
            _mm_storeu_pd(reinterpret_cast<double *>(pabyDstDataPtr + n * 8),
1,300,980✔
2442
                          xmm0_low_d);
2443
            _mm_storeu_pd(
2444
                reinterpret_cast<double *>(pabyDstDataPtr + n * 8 + 16),
1,300,980✔
2445
                xmm0_high_d);
2446
            _mm_storeu_pd(
2447
                reinterpret_cast<double *>(pabyDstDataPtr + n * 8 + 32),
1,300,980✔
2448
                xmm1_low_d);
2449
            _mm_storeu_pd(
2450
                reinterpret_cast<double *>(pabyDstDataPtr + n * 8 + 48),
1,300,980✔
2451
                xmm1_high_d);
2452
            _mm_storeu_pd(
2453
                reinterpret_cast<double *>(pabyDstDataPtr + n * 8 + 64),
1,300,980✔
2454
                xmm2_low_d);
2455
            _mm_storeu_pd(
2456
                reinterpret_cast<double *>(pabyDstDataPtr + n * 8 + 80),
1,300,980✔
2457
                xmm2_high_d);
2458
            _mm_storeu_pd(
2459
                reinterpret_cast<double *>(pabyDstDataPtr + n * 8 + 96),
1,300,980✔
2460
                xmm3_low_d);
2461
            _mm_storeu_pd(
2462
                reinterpret_cast<double *>(pabyDstDataPtr + n * 8 + 112),
1,300,980✔
2463
                xmm3_high_d);
2464
        }
2465
        for (; n < nWordCount; n++)
236,108✔
2466
        {
2467
            pDstData[n] = pSrcData[n];
111,504✔
2468
        }
124,604✔
2469
    }
2470
    else
2471
    {
2472
        GDALCopyWordsGenericT(pSrcData, nSrcPixelStride, pDstData,
23,054✔
2473
                              nDstPixelStride, nWordCount);
2474
    }
2475
}
147,658✔
2476

2477
template <>
2478
void GDALCopyWordsT(const GUInt16 *const CPL_RESTRICT pSrcData,
6,008✔
2479
                    int nSrcPixelStride, GByte *const CPL_RESTRICT pDstData,
2480
                    int nDstPixelStride, GPtrDiff_t nWordCount)
2481
{
2482
    if (nSrcPixelStride == static_cast<int>(sizeof(*pSrcData)) &&
6,008✔
2483
        nDstPixelStride == static_cast<int>(sizeof(*pDstData)))
2484
    {
2485
        decltype(nWordCount) n = 0;
5,033✔
2486
        // In SSE2, min_epu16 does not exist, so shift from
2487
        // UInt16 to SInt16 to be able to use min_epi16
2488
        const __m128i xmm_UINT16_to_INT16 = _mm_set1_epi16(-32768);
5,033✔
2489
        const __m128i xmm_m255_shifted = _mm_set1_epi16(255 - 32768);
5,033✔
2490
        for (; n < nWordCount - 7; n += 8)
138,473✔
2491
        {
2492
            __m128i xmm = _mm_loadu_si128(
133,440✔
2493
                reinterpret_cast<const __m128i *>(pSrcData + n));
133,440✔
2494
            xmm = _mm_add_epi16(xmm, xmm_UINT16_to_INT16);
133,440✔
2495
            xmm = _mm_min_epi16(xmm, xmm_m255_shifted);
133,440✔
2496
            xmm = _mm_sub_epi16(xmm, xmm_UINT16_to_INT16);
133,440✔
2497
            xmm = _mm_packus_epi16(xmm, xmm);
133,440✔
2498
            GDALCopyXMMToInt64(xmm,
133,440✔
2499
                               reinterpret_cast<GPtrDiff_t *>(pDstData + n));
133,440✔
2500
        }
2501
        for (; n < nWordCount; n++)
16,019✔
2502
        {
2503
            pDstData[n] =
10,986✔
2504
                pSrcData[n] >= 255 ? 255 : static_cast<GByte>(pSrcData[n]);
10,986✔
2505
        }
5,033✔
2506
    }
2507
    else
2508
    {
2509
        GDALCopyWordsGenericT(pSrcData, nSrcPixelStride, pDstData,
975✔
2510
                              nDstPixelStride, nWordCount);
2511
    }
2512
}
6,008✔
2513

2514
template <>
2515
void GDALCopyWordsT(const GUInt16 *const CPL_RESTRICT pSrcData,
21✔
2516
                    int nSrcPixelStride, GInt16 *const CPL_RESTRICT pDstData,
2517
                    int nDstPixelStride, GPtrDiff_t nWordCount)
2518
{
2519
    if (nSrcPixelStride == static_cast<int>(sizeof(*pSrcData)) &&
21✔
2520
        nDstPixelStride == static_cast<int>(sizeof(*pDstData)))
2521
    {
2522
        decltype(nWordCount) n = 0;
15✔
2523
        // In SSE2, min_epu16 does not exist, so shift from
2524
        // UInt16 to SInt16 to be able to use min_epi16
2525
        const __m128i xmm_UINT16_to_INT16 = _mm_set1_epi16(-32768);
15✔
2526
        const __m128i xmm_32767_shifted = _mm_set1_epi16(32767 - 32768);
15✔
2527
        for (; n < nWordCount - 7; n += 8)
31✔
2528
        {
2529
            __m128i xmm = _mm_loadu_si128(
16✔
2530
                reinterpret_cast<const __m128i *>(pSrcData + n));
16✔
2531
            xmm = _mm_add_epi16(xmm, xmm_UINT16_to_INT16);
16✔
2532
            xmm = _mm_min_epi16(xmm, xmm_32767_shifted);
16✔
2533
            xmm = _mm_sub_epi16(xmm, xmm_UINT16_to_INT16);
16✔
2534
            _mm_storeu_si128(reinterpret_cast<__m128i *>(pDstData + n), xmm);
16✔
2535
        }
2536
        for (; n < nWordCount; n++)
55✔
2537
        {
2538
            pDstData[n] =
40✔
2539
                pSrcData[n] >= 32767 ? 32767 : static_cast<GInt16>(pSrcData[n]);
40✔
2540
        }
15✔
2541
    }
2542
    else
2543
    {
2544
        GDALCopyWordsGenericT(pSrcData, nSrcPixelStride, pDstData,
6✔
2545
                              nDstPixelStride, nWordCount);
2546
    }
2547
}
21✔
2548

2549
template <>
2550
void GDALCopyWordsT(const GUInt16 *const CPL_RESTRICT pSrcData,
402✔
2551
                    int nSrcPixelStride, float *const CPL_RESTRICT pDstData,
2552
                    int nDstPixelStride, GPtrDiff_t nWordCount)
2553
{
2554
    if (nSrcPixelStride == static_cast<int>(sizeof(*pSrcData)) &&
402✔
2555
        nDstPixelStride == static_cast<int>(sizeof(*pDstData)))
2556
    {
2557
        decltype(nWordCount) n = 0;
396✔
2558
        const __m128i xmm_zero = _mm_setzero_si128();
396✔
2559
        GByte *CPL_RESTRICT pabyDstDataPtr =
396✔
2560
            reinterpret_cast<GByte *>(pDstData);
2561
        for (; n < nWordCount - 7; n += 8)
1,480✔
2562
        {
2563
            __m128i xmm = _mm_loadu_si128(
1,084✔
2564
                reinterpret_cast<const __m128i *>(pSrcData + n));
1,084✔
2565
            __m128i xmm0 = _mm_unpacklo_epi16(xmm, xmm_zero);
1,084✔
2566
            __m128i xmm1 = _mm_unpackhi_epi16(xmm, xmm_zero);
1,084✔
2567
            __m128 xmm0_f = _mm_cvtepi32_ps(xmm0);
1,084✔
2568
            __m128 xmm1_f = _mm_cvtepi32_ps(xmm1);
1,084✔
2569
            _mm_storeu_ps(reinterpret_cast<float *>(pabyDstDataPtr + n * 4),
1,084✔
2570
                          xmm0_f);
2571
            _mm_storeu_ps(
2572
                reinterpret_cast<float *>(pabyDstDataPtr + n * 4 + 16), xmm1_f);
1,084✔
2573
        }
2574
        for (; n < nWordCount; n++)
1,453✔
2575
        {
2576
            pDstData[n] = pSrcData[n];
1,057✔
2577
        }
396✔
2578
    }
2579
    else
2580
    {
2581
        GDALCopyWordsGenericT(pSrcData, nSrcPixelStride, pDstData,
6✔
2582
                              nDstPixelStride, nWordCount);
2583
    }
2584
}
402✔
2585

2586
template <>
2587
void GDALCopyWordsT(const GUInt16 *const CPL_RESTRICT pSrcData,
262✔
2588
                    int nSrcPixelStride, double *const CPL_RESTRICT pDstData,
2589
                    int nDstPixelStride, GPtrDiff_t nWordCount)
2590
{
2591
    if (nSrcPixelStride == static_cast<int>(sizeof(*pSrcData)) &&
262✔
2592
        nDstPixelStride == static_cast<int>(sizeof(*pDstData)))
2593
    {
2594
        decltype(nWordCount) n = 0;
153✔
2595
        const __m128i xmm_zero = _mm_setzero_si128();
153✔
2596
        GByte *CPL_RESTRICT pabyDstDataPtr =
153✔
2597
            reinterpret_cast<GByte *>(pDstData);
2598
        for (; n < nWordCount - 7; n += 8)
181✔
2599
        {
2600
            __m128i xmm = _mm_loadu_si128(
28✔
2601
                reinterpret_cast<const __m128i *>(pSrcData + n));
28✔
2602
            __m128i xmm0 = _mm_unpacklo_epi16(xmm, xmm_zero);
28✔
2603
            __m128i xmm1 = _mm_unpackhi_epi16(xmm, xmm_zero);
28✔
2604

2605
            __m128d xmm0_low_d = _mm_cvtepi32_pd(xmm0);
28✔
2606
            __m128d xmm1_low_d = _mm_cvtepi32_pd(xmm1);
28✔
2607
            xmm0 = _mm_srli_si128(xmm0, 8);
28✔
2608
            xmm1 = _mm_srli_si128(xmm1, 8);
28✔
2609
            __m128d xmm0_high_d = _mm_cvtepi32_pd(xmm0);
28✔
2610
            __m128d xmm1_high_d = _mm_cvtepi32_pd(xmm1);
28✔
2611

2612
            _mm_storeu_pd(reinterpret_cast<double *>(pabyDstDataPtr + n * 8),
28✔
2613
                          xmm0_low_d);
2614
            _mm_storeu_pd(
2615
                reinterpret_cast<double *>(pabyDstDataPtr + n * 8 + 16),
28✔
2616
                xmm0_high_d);
2617
            _mm_storeu_pd(
2618
                reinterpret_cast<double *>(pabyDstDataPtr + n * 8 + 32),
28✔
2619
                xmm1_low_d);
2620
            _mm_storeu_pd(
2621
                reinterpret_cast<double *>(pabyDstDataPtr + n * 8 + 48),
28✔
2622
                xmm1_high_d);
2623
        }
2624
        for (; n < nWordCount; n++)
373✔
2625
        {
2626
            pDstData[n] = pSrcData[n];
220✔
2627
        }
153✔
2628
    }
2629
    else
2630
    {
2631
        GDALCopyWordsGenericT(pSrcData, nSrcPixelStride, pDstData,
109✔
2632
                              nDstPixelStride, nWordCount);
2633
    }
2634
}
262✔
2635

2636
template <>
2637
void GDALCopyWordsT(const double *const CPL_RESTRICT pSrcData,
814✔
2638
                    int nSrcPixelStride, GUInt16 *const CPL_RESTRICT pDstData,
2639
                    int nDstPixelStride, GPtrDiff_t nWordCount)
2640
{
2641
    GDALCopyWordsT_8atatime(pSrcData, nSrcPixelStride, pDstData,
814✔
2642
                            nDstPixelStride, nWordCount);
2643
}
814✔
2644

2645
#endif  // HAVE_SSE2
2646

2647
template <>
2648
void GDALCopyWordsT(const float *const CPL_RESTRICT pSrcData,
121,967✔
2649
                    int nSrcPixelStride, GByte *const CPL_RESTRICT pDstData,
2650
                    int nDstPixelStride, GPtrDiff_t nWordCount)
2651
{
2652
    GDALCopyWordsT_8atatime(pSrcData, nSrcPixelStride, pDstData,
121,967✔
2653
                            nDstPixelStride, nWordCount);
2654
}
121,970✔
2655

2656
template <>
2657
void GDALCopyWordsT(const float *const CPL_RESTRICT pSrcData,
15,126✔
2658
                    int nSrcPixelStride, GInt16 *const CPL_RESTRICT pDstData,
2659
                    int nDstPixelStride, GPtrDiff_t nWordCount)
2660
{
2661
    GDALCopyWordsT_8atatime(pSrcData, nSrcPixelStride, pDstData,
15,126✔
2662
                            nDstPixelStride, nWordCount);
2663
}
15,126✔
2664

2665
template <>
2666
void GDALCopyWordsT(const float *const CPL_RESTRICT pSrcData,
61,643✔
2667
                    int nSrcPixelStride, GUInt16 *const CPL_RESTRICT pDstData,
2668
                    int nDstPixelStride, GPtrDiff_t nWordCount)
2669
{
2670
    GDALCopyWordsT_8atatime(pSrcData, nSrcPixelStride, pDstData,
61,643✔
2671
                            nDstPixelStride, nWordCount);
2672
}
61,644✔
2673

2674
/************************************************************************/
2675
/*                   GDALCopyWordsComplexT()                            */
2676
/************************************************************************/
2677
/**
2678
 * Template function, used to copy data from pSrcData into buffer
2679
 * pDstData, with stride nSrcPixelStride in the source data and
2680
 * stride nDstPixelStride in the destination data. Deals with the
2681
 * complex case, where input is complex and output is complex.
2682
 *
2683
 * @param pSrcData the source data buffer
2684
 * @param nSrcPixelStride the stride, in the buffer pSrcData for pixels
2685
 *                      of interest.
2686
 * @param pDstData the destination buffer.
2687
 * @param nDstPixelStride the stride in the buffer pDstData for pixels of
2688
 *                      interest.
2689
 * @param nWordCount the total number of pixel words to copy
2690
 *
2691
 */
2692
template <class Tin, class Tout>
2693
inline void GDALCopyWordsComplexT(const Tin *const CPL_RESTRICT pSrcData,
125,220✔
2694
                                  int nSrcPixelStride,
2695
                                  Tout *const CPL_RESTRICT pDstData,
2696
                                  int nDstPixelStride, GPtrDiff_t nWordCount)
2697
{
2698
    decltype(nWordCount) nDstOffset = 0;
125,220✔
2699
    const char *const pSrcDataPtr = reinterpret_cast<const char *>(pSrcData);
125,220✔
2700
    char *const pDstDataPtr = reinterpret_cast<char *>(pDstData);
125,220✔
2701

2702
    for (decltype(nWordCount) n = 0; n < nWordCount; n++)
7,338,017✔
2703
    {
2704
        const Tin *const pPixelIn =
7,212,792✔
2705
            reinterpret_cast<const Tin *>(pSrcDataPtr + n * nSrcPixelStride);
7,212,792✔
2706
        Tout *const pPixelOut =
7,212,792✔
2707
            reinterpret_cast<Tout *>(pDstDataPtr + nDstOffset);
7,212,792✔
2708

2709
        GDALCopyWord(pPixelIn[0], pPixelOut[0]);
7,212,792✔
2710
        GDALCopyWord(pPixelIn[1], pPixelOut[1]);
7,212,792✔
2711

2712
        nDstOffset += nDstPixelStride;
7,212,792✔
2713
    }
2714
}
125,220✔
2715

2716
/************************************************************************/
2717
/*                   GDALCopyWordsComplexOutT()                         */
2718
/************************************************************************/
2719
/**
2720
 * Template function, used to copy data from pSrcData into buffer
2721
 * pDstData, with stride nSrcPixelStride in the source data and
2722
 * stride nDstPixelStride in the destination data. Deals with the
2723
 * case where the value is real coming in, but complex going out.
2724
 *
2725
 * @param pSrcData the source data buffer
2726
 * @param nSrcPixelStride the stride, in the buffer pSrcData for pixels
2727
 *                      of interest, in bytes.
2728
 * @param pDstData the destination buffer.
2729
 * @param nDstPixelStride the stride in the buffer pDstData for pixels of
2730
 *                      interest, in bytes.
2731
 * @param nWordCount the total number of pixel words to copy
2732
 *
2733
 */
2734
template <class Tin, class Tout>
2735
inline void GDALCopyWordsComplexOutT(const Tin *const CPL_RESTRICT pSrcData,
3,274✔
2736
                                     int nSrcPixelStride,
2737
                                     Tout *const CPL_RESTRICT pDstData,
2738
                                     int nDstPixelStride, GPtrDiff_t nWordCount)
2739
{
2740
    decltype(nWordCount) nDstOffset = 0;
3,274✔
2741

2742
    const Tout tOutZero = static_cast<Tout>(0);
3,274✔
2743

2744
    const char *const pSrcDataPtr = reinterpret_cast<const char *>(pSrcData);
3,274✔
2745
    char *const pDstDataPtr = reinterpret_cast<char *>(pDstData);
3,274✔
2746

2747
    for (decltype(nWordCount) n = 0; n < nWordCount; n++)
1,112,544✔
2748
    {
2749
        const Tin tValue =
1,109,270✔
2750
            *reinterpret_cast<const Tin *>(pSrcDataPtr + n * nSrcPixelStride);
1,109,270✔
2751
        Tout *const pPixelOut =
1,109,270✔
2752
            reinterpret_cast<Tout *>(pDstDataPtr + nDstOffset);
1,109,270✔
2753
        GDALCopyWord(tValue, *pPixelOut);
1,109,270✔
2754

2755
        pPixelOut[1] = tOutZero;
1,109,270✔
2756

2757
        nDstOffset += nDstPixelStride;
1,109,270✔
2758
    }
2759
}
3,274✔
2760

2761
/************************************************************************/
2762
/*                           GDALCopyWordsFromT()                       */
2763
/************************************************************************/
2764
/**
2765
 * Template driver function. Given the input type T, call the appropriate
2766
 * GDALCopyWordsT function template for the desired output type. You should
2767
 * never call this function directly (call GDALCopyWords instead).
2768
 *
2769
 * @param pSrcData source data buffer
2770
 * @param nSrcPixelStride pixel stride in input buffer, in pixel words
2771
 * @param bInComplex input is complex
2772
 * @param pDstData destination data buffer
2773
 * @param eDstType destination data type
2774
 * @param nDstPixelStride pixel stride in output buffer, in pixel words
2775
 * @param nWordCount number of pixel words to be copied
2776
 */
2777
template <class T>
2778
inline void GDALCopyWordsFromT(const T *const CPL_RESTRICT pSrcData,
53,535,822✔
2779
                               int nSrcPixelStride, bool bInComplex,
2780
                               void *CPL_RESTRICT pDstData,
2781
                               GDALDataType eDstType, int nDstPixelStride,
2782
                               GPtrDiff_t nWordCount)
2783
{
2784
    switch (eDstType)
53,535,822✔
2785
    {
2786
        case GDT_Byte:
4,563,385✔
2787
            GDALCopyWordsT(pSrcData, nSrcPixelStride,
4,563,385✔
2788
                           static_cast<unsigned char *>(pDstData),
2789
                           nDstPixelStride, nWordCount);
2790
            break;
4,563,506✔
2791
        case GDT_Int8:
529✔
2792
            GDALCopyWordsT(pSrcData, nSrcPixelStride,
529✔
2793
                           static_cast<signed char *>(pDstData),
2794
                           nDstPixelStride, nWordCount);
2795
            break;
529✔
2796
        case GDT_UInt16:
101,204✔
2797
            GDALCopyWordsT(pSrcData, nSrcPixelStride,
101,204✔
2798
                           static_cast<unsigned short *>(pDstData),
2799
                           nDstPixelStride, nWordCount);
2800
            break;
101,200✔
2801
        case GDT_Int16:
4,125,060✔
2802
            GDALCopyWordsT(pSrcData, nSrcPixelStride,
4,125,060✔
2803
                           static_cast<short *>(pDstData), nDstPixelStride,
2804
                           nWordCount);
2805
            break;
4,125,060✔
2806
        case GDT_UInt32:
4,229✔
2807
            GDALCopyWordsT(pSrcData, nSrcPixelStride,
4,229✔
2808
                           static_cast<unsigned int *>(pDstData),
2809
                           nDstPixelStride, nWordCount);
2810
            break;
4,229✔
2811
        case GDT_Int32:
25,468,596✔
2812
            GDALCopyWordsT(pSrcData, nSrcPixelStride,
25,468,596✔
2813
                           static_cast<int *>(pDstData), nDstPixelStride,
2814
                           nWordCount);
2815
            break;
25,493,399✔
2816
        case GDT_UInt64:
631✔
2817
            GDALCopyWordsT(pSrcData, nSrcPixelStride,
631✔
2818
                           static_cast<std::uint64_t *>(pDstData),
2819
                           nDstPixelStride, nWordCount);
2820
            break;
631✔
2821
        case GDT_Int64:
4,204✔
2822
            GDALCopyWordsT(pSrcData, nSrcPixelStride,
4,204✔
2823
                           static_cast<std::int64_t *>(pDstData),
2824
                           nDstPixelStride, nWordCount);
2825
            break;
4,204✔
2826
        case GDT_Float16:
118✔
2827
            GDALCopyWordsT(pSrcData, nSrcPixelStride,
118✔
2828
                           static_cast<GFloat16 *>(pDstData), nDstPixelStride,
2829
                           nWordCount);
2830
            break;
118✔
2831
        case GDT_Float32:
3,869,182✔
2832
            GDALCopyWordsT(pSrcData, nSrcPixelStride,
3,869,182✔
2833
                           static_cast<float *>(pDstData), nDstPixelStride,
2834
                           nWordCount);
2835
            break;
3,869,182✔
2836
        case GDT_Float64:
15,244,221✔
2837
            GDALCopyWordsT(pSrcData, nSrcPixelStride,
15,244,221✔
2838
                           static_cast<double *>(pDstData), nDstPixelStride,
2839
                           nWordCount);
2840
            break;
15,244,281✔
2841
        case GDT_CInt16:
122,429✔
2842
            if (bInComplex)
122,429✔
2843
            {
2844
                GDALCopyWordsComplexT(pSrcData, nSrcPixelStride,
121,400✔
2845
                                      static_cast<short *>(pDstData),
2846
                                      nDstPixelStride, nWordCount);
2847
            }
2848
            else  // input is not complex, so we need to promote to a complex
2849
                  // buffer
2850
            {
2851
                GDALCopyWordsComplexOutT(pSrcData, nSrcPixelStride,
1,029✔
2852
                                         static_cast<short *>(pDstData),
2853
                                         nDstPixelStride, nWordCount);
2854
            }
2855
            break;
122,429✔
2856
        case GDT_CInt32:
828✔
2857
            if (bInComplex)
828✔
2858
            {
2859
                GDALCopyWordsComplexT(pSrcData, nSrcPixelStride,
421✔
2860
                                      static_cast<int *>(pDstData),
2861
                                      nDstPixelStride, nWordCount);
2862
            }
2863
            else  // input is not complex, so we need to promote to a complex
2864
                  // buffer
2865
            {
2866
                GDALCopyWordsComplexOutT(pSrcData, nSrcPixelStride,
407✔
2867
                                         static_cast<int *>(pDstData),
2868
                                         nDstPixelStride, nWordCount);
2869
            }
2870
            break;
828✔
2871
        case GDT_CFloat16:
57✔
2872
            if (bInComplex)
57✔
2873
            {
2874
                GDALCopyWordsComplexT(pSrcData, nSrcPixelStride,
16✔
2875
                                      static_cast<GFloat16 *>(pDstData),
2876
                                      nDstPixelStride, nWordCount);
2877
            }
2878
            else  // input is not complex, so we need to promote to a complex
2879
                  // buffer
2880
            {
2881
                GDALCopyWordsComplexOutT(pSrcData, nSrcPixelStride,
41✔
2882
                                         static_cast<GFloat16 *>(pDstData),
2883
                                         nDstPixelStride, nWordCount);
2884
            }
2885
            break;
57✔
2886
        case GDT_CFloat32:
3,176✔
2887
            if (bInComplex)
3,176✔
2888
            {
2889
                GDALCopyWordsComplexT(pSrcData, nSrcPixelStride,
2,595✔
2890
                                      static_cast<float *>(pDstData),
2891
                                      nDstPixelStride, nWordCount);
2892
            }
2893
            else  // input is not complex, so we need to promote to a complex
2894
                  // buffer
2895
            {
2896
                GDALCopyWordsComplexOutT(pSrcData, nSrcPixelStride,
581✔
2897
                                         static_cast<float *>(pDstData),
2898
                                         nDstPixelStride, nWordCount);
2899
            }
2900
            break;
3,176✔
2901
        case GDT_CFloat64:
2,004✔
2902
            if (bInComplex)
2,004✔
2903
            {
2904
                GDALCopyWordsComplexT(pSrcData, nSrcPixelStride,
788✔
2905
                                      static_cast<double *>(pDstData),
2906
                                      nDstPixelStride, nWordCount);
2907
            }
2908
            else  // input is not complex, so we need to promote to a complex
2909
                  // buffer
2910
            {
2911
                GDALCopyWordsComplexOutT(pSrcData, nSrcPixelStride,
1,216✔
2912
                                         static_cast<double *>(pDstData),
2913
                                         nDstPixelStride, nWordCount);
2914
            }
2915
            break;
2,004✔
2916
        case GDT_Unknown:
×
2917
        case GDT_TypeCount:
2918
            CPLAssert(false);
×
2919
    }
2920
}
53,560,756✔
2921

2922
}  // end anonymous namespace
2923

2924
/************************************************************************/
2925
/*                          GDALReplicateWord()                         */
2926
/************************************************************************/
2927

2928
template <class T>
2929
inline void GDALReplicateWordT(void *pDstData, int nDstPixelStride,
525,024✔
2930
                               GPtrDiff_t nWordCount)
2931
{
2932
    const T valSet = *static_cast<const T *>(pDstData);
525,024✔
2933
    if (nDstPixelStride == static_cast<int>(sizeof(T)))
525,024✔
2934
    {
2935
        T *pDstPtr = static_cast<T *>(pDstData) + 1;
496,631✔
2936
        while (nWordCount >= 4)
20,192,617✔
2937
        {
2938
            nWordCount -= 4;
19,695,964✔
2939
            pDstPtr[0] = valSet;
19,695,964✔
2940
            pDstPtr[1] = valSet;
19,695,964✔
2941
            pDstPtr[2] = valSet;
19,695,964✔
2942
            pDstPtr[3] = valSet;
19,695,964✔
2943
            pDstPtr += 4;
19,695,964✔
2944
        }
2945
        while (nWordCount > 0)
1,262,900✔
2946
        {
2947
            --nWordCount;
766,269✔
2948
            *pDstPtr = valSet;
766,269✔
2949
            pDstPtr++;
766,269✔
2950
        }
2951
    }
2952
    else
2953
    {
2954
        GByte *pabyDstPtr = static_cast<GByte *>(pDstData) + nDstPixelStride;
28,407✔
2955
        while (nWordCount > 0)
954,322✔
2956
        {
2957
            --nWordCount;
925,915✔
2958
            *reinterpret_cast<T *>(pabyDstPtr) = valSet;
925,915✔
2959
            pabyDstPtr += nDstPixelStride;
925,915✔
2960
        }
2961
    }
2962
}
525,024✔
2963

2964
static void GDALReplicateWord(const void *CPL_RESTRICT pSrcData,
905,368✔
2965
                              GDALDataType eSrcType,
2966
                              void *CPL_RESTRICT pDstData,
2967
                              GDALDataType eDstType, int nDstPixelStride,
2968
                              GPtrDiff_t nWordCount)
2969
{
2970
    /* -----------------------------------------------------------------------
2971
     */
2972
    /* Special case when the source data is always the same value */
2973
    /* (for VRTSourcedRasterBand::IRasterIO and
2974
     * VRTDerivedRasterBand::IRasterIO*/
2975
    /*  for example) */
2976
    /* -----------------------------------------------------------------------
2977
     */
2978
    // Let the general translation case do the necessary conversions
2979
    // on the first destination element.
2980
    GDALCopyWords64(pSrcData, eSrcType, 0, pDstData, eDstType, 0, 1);
905,368✔
2981

2982
    // Now copy the first element to the nWordCount - 1 following destination
2983
    // elements.
2984
    nWordCount--;
903,598✔
2985
    GByte *pabyDstWord = reinterpret_cast<GByte *>(pDstData) + nDstPixelStride;
903,598✔
2986

2987
    switch (eDstType)
903,598✔
2988
    {
2989
        case GDT_Byte:
378,248✔
2990
        case GDT_Int8:
2991
        {
2992
            if (nDstPixelStride == 1)
378,248✔
2993
            {
2994
                if (nWordCount > 0)
344,496✔
2995
                    memset(pabyDstWord,
344,496✔
2996
                           *reinterpret_cast<const GByte *>(pDstData),
344,496✔
2997
                           nWordCount);
2998
            }
2999
            else
3000
            {
3001
                GByte valSet = *reinterpret_cast<const GByte *>(pDstData);
33,752✔
3002
                while (nWordCount > 0)
5,438,530✔
3003
                {
3004
                    --nWordCount;
5,404,780✔
3005
                    *pabyDstWord = valSet;
5,404,780✔
3006
                    pabyDstWord += nDstPixelStride;
5,404,780✔
3007
                }
3008
            }
3009
            break;
378,248✔
3010
        }
3011

3012
#define CASE_DUPLICATE_SIMPLE(enum_type, c_type)                               \
3013
    case enum_type:                                                            \
3014
    {                                                                          \
3015
        GDALReplicateWordT<c_type>(pDstData, nDstPixelStride, nWordCount);     \
3016
        break;                                                                 \
3017
    }
3018

3019
            CASE_DUPLICATE_SIMPLE(GDT_UInt16, GUInt16)
354✔
3020
            CASE_DUPLICATE_SIMPLE(GDT_Int16, GInt16)
169,648✔
3021
            CASE_DUPLICATE_SIMPLE(GDT_UInt32, GUInt32)
56✔
3022
            CASE_DUPLICATE_SIMPLE(GDT_Int32, GInt32)
296,994✔
3023
            CASE_DUPLICATE_SIMPLE(GDT_UInt64, std::uint64_t)
21✔
3024
            CASE_DUPLICATE_SIMPLE(GDT_Int64, std::int64_t)
662✔
3025
            CASE_DUPLICATE_SIMPLE(GDT_Float16, GFloat16)
×
3026
            CASE_DUPLICATE_SIMPLE(GDT_Float32, float)
52,218✔
3027
            CASE_DUPLICATE_SIMPLE(GDT_Float64, double)
5,089✔
3028

3029
#define CASE_DUPLICATE_COMPLEX(enum_type, c_type)                              \
3030
    case enum_type:                                                            \
3031
    {                                                                          \
3032
        c_type valSet1 = reinterpret_cast<const c_type *>(pDstData)[0];        \
3033
        c_type valSet2 = reinterpret_cast<const c_type *>(pDstData)[1];        \
3034
        while (nWordCount > 0)                                                 \
3035
        {                                                                      \
3036
            --nWordCount;                                                      \
3037
            reinterpret_cast<c_type *>(pabyDstWord)[0] = valSet1;              \
3038
            reinterpret_cast<c_type *>(pabyDstWord)[1] = valSet2;              \
3039
            pabyDstWord += nDstPixelStride;                                    \
3040
        }                                                                      \
3041
        break;                                                                 \
3042
    }
3043

3044
            CASE_DUPLICATE_COMPLEX(GDT_CInt16, GInt16)
784✔
3045
            CASE_DUPLICATE_COMPLEX(GDT_CInt32, GInt32)
784✔
3046
            CASE_DUPLICATE_COMPLEX(GDT_CFloat16, GFloat16)
×
3047
            CASE_DUPLICATE_COMPLEX(GDT_CFloat32, float)
784✔
3048
            CASE_DUPLICATE_COMPLEX(GDT_CFloat64, double)
784✔
3049

3050
        case GDT_Unknown:
×
3051
        case GDT_TypeCount:
3052
            CPLAssert(false);
×
3053
    }
3054
}
905,705✔
3055

3056
/************************************************************************/
3057
/*                        GDALUnrolledCopy()                            */
3058
/************************************************************************/
3059

3060
template <class T, int srcStride, int dstStride>
3061
static inline void GDALUnrolledCopyGeneric(T *CPL_RESTRICT pDest,
5,329,985✔
3062
                                           const T *CPL_RESTRICT pSrc,
3063
                                           GPtrDiff_t nIters)
3064
{
3065
    if (nIters >= 16)
5,329,985✔
3066
    {
3067
        for (GPtrDiff_t i = nIters / 16; i != 0; i--)
138,941,404✔
3068
        {
3069
            pDest[0 * dstStride] = pSrc[0 * srcStride];
133,741,652✔
3070
            pDest[1 * dstStride] = pSrc[1 * srcStride];
133,741,652✔
3071
            pDest[2 * dstStride] = pSrc[2 * srcStride];
133,741,652✔
3072
            pDest[3 * dstStride] = pSrc[3 * srcStride];
133,741,652✔
3073
            pDest[4 * dstStride] = pSrc[4 * srcStride];
133,741,652✔
3074
            pDest[5 * dstStride] = pSrc[5 * srcStride];
133,741,652✔
3075
            pDest[6 * dstStride] = pSrc[6 * srcStride];
133,741,652✔
3076
            pDest[7 * dstStride] = pSrc[7 * srcStride];
133,741,652✔
3077
            pDest[8 * dstStride] = pSrc[8 * srcStride];
133,741,652✔
3078
            pDest[9 * dstStride] = pSrc[9 * srcStride];
133,741,652✔
3079
            pDest[10 * dstStride] = pSrc[10 * srcStride];
133,741,652✔
3080
            pDest[11 * dstStride] = pSrc[11 * srcStride];
133,741,652✔
3081
            pDest[12 * dstStride] = pSrc[12 * srcStride];
133,741,652✔
3082
            pDest[13 * dstStride] = pSrc[13 * srcStride];
133,741,652✔
3083
            pDest[14 * dstStride] = pSrc[14 * srcStride];
133,741,652✔
3084
            pDest[15 * dstStride] = pSrc[15 * srcStride];
133,741,652✔
3085
            pDest += 16 * dstStride;
133,741,652✔
3086
            pSrc += 16 * srcStride;
133,741,652✔
3087
        }
3088
        nIters = nIters % 16;
5,199,676✔
3089
    }
3090
    for (GPtrDiff_t i = 0; i < nIters; i++)
7,581,052✔
3091
    {
3092
        pDest[i * dstStride] = *pSrc;
2,251,072✔
3093
        pSrc += srcStride;
2,251,072✔
3094
    }
3095
}
5,329,985✔
3096

3097
template <class T, int srcStride, int dstStride>
3098
static inline void GDALUnrolledCopy(T *CPL_RESTRICT pDest,
5,323,885✔
3099
                                    const T *CPL_RESTRICT pSrc,
3100
                                    GPtrDiff_t nIters)
3101
{
3102
    GDALUnrolledCopyGeneric<T, srcStride, dstStride>(pDest, pSrc, nIters);
5,323,885✔
3103
}
5,323,855✔
3104

3105
#ifdef HAVE_SSE2
3106

3107
template <>
3108
void GDALUnrolledCopy<GByte, 2, 1>(GByte *CPL_RESTRICT pDest,
304,624✔
3109
                                   const GByte *CPL_RESTRICT pSrc,
3110
                                   GPtrDiff_t nIters)
3111
{
3112
    decltype(nIters) i = 0;
304,624✔
3113
    if (nIters > 16)
304,624✔
3114
    {
3115
        const __m128i xmm_mask = _mm_set1_epi16(0xff);
146,453✔
3116
        // If we were sure that there would always be 1 trailing byte, we could
3117
        // check against nIters - 15
3118
        for (; i < nIters - 16; i += 16)
2,583,810✔
3119
        {
3120
            __m128i xmm0 =
3121
                _mm_loadu_si128(reinterpret_cast<__m128i const *>(pSrc + 0));
2,437,360✔
3122
            __m128i xmm1 =
3123
                _mm_loadu_si128(reinterpret_cast<__m128i const *>(pSrc + 16));
4,874,710✔
3124
            // Set higher 8bit of each int16 packed word to 0
3125
            xmm0 = _mm_and_si128(xmm0, xmm_mask);
2,437,360✔
3126
            xmm1 = _mm_and_si128(xmm1, xmm_mask);
2,437,360✔
3127
            // Pack int16 to uint8 and merge back both vector
3128
            xmm0 = _mm_packus_epi16(xmm0, xmm1);
2,437,360✔
3129

3130
            // Store result
3131
            _mm_storeu_si128(reinterpret_cast<__m128i *>(pDest + i), xmm0);
2,437,360✔
3132

3133
            pSrc += 2 * 16;
2,437,360✔
3134
        }
3135
    }
3136
    for (; i < nIters; i++)
3,886,020✔
3137
    {
3138
        pDest[i] = *pSrc;
3,581,400✔
3139
        pSrc += 2;
3,581,400✔
3140
    }
3141
}
304,624✔
3142

3143
#ifdef HAVE_SSSE3_AT_COMPILE_TIME
3144

3145
template <>
3146
void GDALUnrolledCopy<GByte, 3, 1>(GByte *CPL_RESTRICT pDest,
190,438✔
3147
                                   const GByte *CPL_RESTRICT pSrc,
3148
                                   GPtrDiff_t nIters)
3149
{
3150
    if (nIters > 16 && CPLHaveRuntimeSSSE3())
190,438✔
3151
    {
3152
        GDALUnrolledCopy_GByte_3_1_SSSE3(pDest, pSrc, nIters);
184,338✔
3153
    }
3154
    else
3155
    {
3156
        GDALUnrolledCopyGeneric<GByte, 3, 1>(pDest, pSrc, nIters);
6,100✔
3157
    }
3158
}
190,438✔
3159

3160
#endif
3161

3162
template <>
3163
void GDALUnrolledCopy<GByte, 4, 1>(GByte *CPL_RESTRICT pDest,
105,199✔
3164
                                   const GByte *CPL_RESTRICT pSrc,
3165
                                   GPtrDiff_t nIters)
3166
{
3167
    decltype(nIters) i = 0;
105,199✔
3168
    if (nIters > 16)
105,199✔
3169
    {
3170
        const __m128i xmm_mask = _mm_set1_epi32(0xff);
99,906✔
3171
        // If we were sure that there would always be 3 trailing bytes, we could
3172
        // check against nIters - 15
3173
        for (; i < nIters - 16; i += 16)
8,826,290✔
3174
        {
3175
            __m128i xmm0 =
3176
                _mm_loadu_si128(reinterpret_cast<__m128i const *>(pSrc + 0));
8,724,920✔
3177
            __m128i xmm1 =
3178
                _mm_loadu_si128(reinterpret_cast<__m128i const *>(pSrc + 16));
8,724,920✔
3179
            __m128i xmm2 =
3180
                _mm_loadu_si128(reinterpret_cast<__m128i const *>(pSrc + 32));
8,724,920✔
3181
            __m128i xmm3 =
3182
                _mm_loadu_si128(reinterpret_cast<__m128i const *>(pSrc + 48));
17,449,800✔
3183
            // Set higher 24bit of each int32 packed word to 0
3184
            xmm0 = _mm_and_si128(xmm0, xmm_mask);
8,724,920✔
3185
            xmm1 = _mm_and_si128(xmm1, xmm_mask);
8,724,920✔
3186
            xmm2 = _mm_and_si128(xmm2, xmm_mask);
8,724,920✔
3187
            xmm3 = _mm_and_si128(xmm3, xmm_mask);
8,724,920✔
3188
            // Pack int32 to int16
3189
            xmm0 = _mm_packs_epi32(xmm0, xmm1);
8,725,640✔
3190
            xmm2 = _mm_packs_epi32(xmm2, xmm3);
8,725,800✔
3191
            // Pack int16 to uint8
3192
            xmm0 = _mm_packus_epi16(xmm0, xmm2);
8,726,380✔
3193

3194
            // Store result
3195
            _mm_storeu_si128(reinterpret_cast<__m128i *>(pDest + i), xmm0);
8,726,380✔
3196

3197
            pSrc += 4 * 16;
8,726,380✔
3198
        }
3199
    }
3200
    for (; i < nIters; i++)
1,119,910✔
3201
    {
3202
        pDest[i] = *pSrc;
1,013,250✔
3203
        pSrc += 4;
1,013,250✔
3204
    }
3205
}
106,663✔
3206
#endif  // HAVE_SSE2
3207

3208
/************************************************************************/
3209
/*                         GDALFastCopy()                               */
3210
/************************************************************************/
3211

3212
template <class T>
3213
static inline void GDALFastCopy(T *CPL_RESTRICT pDest, int nDestStride,
39,811,100✔
3214
                                const T *CPL_RESTRICT pSrc, int nSrcStride,
3215
                                GPtrDiff_t nIters)
3216
{
3217
    constexpr int sizeofT = static_cast<int>(sizeof(T));
39,811,100✔
3218
    if (nIters == 1)
39,811,100✔
3219
    {
3220
        *pDest = *pSrc;
22,302,360✔
3221
    }
3222
    else if (nDestStride == sizeofT)
17,508,732✔
3223
    {
3224
        if (nSrcStride == sizeofT)
12,251,620✔
3225
        {
3226
            memcpy(pDest, pSrc, nIters * sizeof(T));
11,511,995✔
3227
        }
3228
        else if (nSrcStride == 2 * sizeofT)
739,577✔
3229
        {
3230
            GDALUnrolledCopy<T, 2, 1>(pDest, pSrc, nIters);
307,577✔
3231
        }
3232
        else if (nSrcStride == 3 * sizeofT)
432,000✔
3233
        {
3234
            GDALUnrolledCopy<T, 3, 1>(pDest, pSrc, nIters);
295,746✔
3235
        }
3236
        else if (nSrcStride == 4 * sizeofT)
136,254✔
3237
        {
3238
            GDALUnrolledCopy<T, 4, 1>(pDest, pSrc, nIters);
134,067✔
3239
        }
3240
        else
3241
        {
3242
            while (nIters-- > 0)
12,963,030✔
3243
            {
3244
                *pDest = *pSrc;
12,960,930✔
3245
                pSrc += nSrcStride / sizeofT;
12,960,930✔
3246
                pDest++;
12,960,930✔
3247
            }
3248
        }
3249
    }
3250
    else if (nSrcStride == sizeofT)
5,257,162✔
3251
    {
3252
        if (nDestStride == 2 * sizeofT)
5,246,027✔
3253
        {
3254
            GDALUnrolledCopy<T, 1, 2>(pDest, pSrc, nIters);
131,271✔
3255
        }
3256
        else if (nDestStride == 3 * sizeofT)
5,114,762✔
3257
        {
3258
            GDALUnrolledCopy<T, 1, 3>(pDest, pSrc, nIters);
4,412,233✔
3259
        }
3260
        else if (nDestStride == 4 * sizeofT)
702,525✔
3261
        {
3262
            GDALUnrolledCopy<T, 1, 4>(pDest, pSrc, nIters);
643,252✔
3263
        }
3264
        else
3265
        {
3266
            while (nIters-- > 0)
12,628,400✔
3267
            {
3268
                *pDest = *pSrc;
12,569,180✔
3269
                pSrc++;
12,569,180✔
3270
                pDest += nDestStride / sizeofT;
12,569,180✔
3271
            }
3272
        }
3273
    }
3274
    else
3275
    {
3276
        while (nIters-- > 0)
1,110,028✔
3277
        {
3278
            *pDest = *pSrc;
1,098,896✔
3279
            pSrc += nSrcStride / sizeofT;
1,098,896✔
3280
            pDest += nDestStride / sizeofT;
1,098,896✔
3281
        }
3282
    }
3283
}
39,811,100✔
3284

3285
/************************************************************************/
3286
/*                         GDALFastCopyByte()                           */
3287
/************************************************************************/
3288

3289
static void GDALFastCopyByte(const GByte *CPL_RESTRICT pSrcData,
276,282✔
3290
                             int nSrcPixelStride, GByte *CPL_RESTRICT pDstData,
3291
                             int nDstPixelStride, GPtrDiff_t nWordCount)
3292
{
3293
    GDALFastCopy(pDstData, nDstPixelStride, pSrcData, nSrcPixelStride,
276,282✔
3294
                 nWordCount);
3295
}
276,282✔
3296

3297
/************************************************************************/
3298
/*                           GDALCopyWords()                            */
3299
/************************************************************************/
3300

3301
/**
3302
 * Copy pixel words from buffer to buffer.
3303
 *
3304
 * @see GDALCopyWords64()
3305
 */
3306
void CPL_STDCALL GDALCopyWords(const void *CPL_RESTRICT pSrcData,
87,089,200✔
3307
                               GDALDataType eSrcType, int nSrcPixelStride,
3308
                               void *CPL_RESTRICT pDstData,
3309
                               GDALDataType eDstType, int nDstPixelStride,
3310
                               int nWordCount)
3311
{
3312
    GDALCopyWords64(pSrcData, eSrcType, nSrcPixelStride, pDstData, eDstType,
87,089,200✔
3313
                    nDstPixelStride, nWordCount);
3314
}
87,097,700✔
3315

3316
/************************************************************************/
3317
/*                          GDALCopyWords64()                           */
3318
/************************************************************************/
3319

3320
/**
3321
 * Copy pixel words from buffer to buffer.
3322
 *
3323
 * This function is used to copy pixel word values from one memory buffer
3324
 * to another, with support for conversion between data types, and differing
3325
 * step factors. The data type conversion is done using the following
3326
 * rules:
3327
 * <ul>
3328
 * <li>Values assigned to a lower range integer type are clipped. For
3329
 * instance assigning GDT_Int16 values to a GDT_Byte buffer will cause values
3330
 * less the 0 to be set to 0, and values larger than 255 to be set to 255.
3331
 * </li>
3332
 * <li>
3333
 * Assignment from floating point to integer rounds to closest integer.
3334
 * +Infinity is mapped to the largest integer. -Infinity is mapped to the
3335
 * smallest integer. NaN is mapped to 0.
3336
 * </li>
3337
 * <li>
3338
 * Assignment from non-complex to complex will result in the imaginary part
3339
 * being set to zero on output.
3340
 * </li>
3341
 * <li> Assignment from complex to
3342
 * non-complex will result in the complex portion being lost and the real
3343
 * component being preserved (<i>not magnitude!</i>).
3344
 * </li>
3345
 * </ul>
3346
 *
3347
 * No assumptions are made about the source or destination words occurring
3348
 * on word boundaries.  It is assumed that all values are in native machine
3349
 * byte order.
3350
 *
3351
 * @param pSrcData Pointer to source data to be converted.
3352
 * @param eSrcType the source data type (see GDALDataType enum)
3353
 * @param nSrcPixelStride Source pixel stride (i.e. distance between 2 words),
3354
 * in bytes
3355
 * @param pDstData Pointer to buffer where destination data should go
3356
 * @param eDstType the destination data type (see GDALDataType enum)
3357
 * @param nDstPixelStride Destination pixel stride (i.e. distance between 2
3358
 * words), in bytes
3359
 * @param nWordCount number of words to be copied
3360
 *
3361
 * @note
3362
 * When adding a new data type to GDAL, you must do the following to
3363
 * support it properly within the GDALCopyWords function:
3364
 * 1. Add the data type to the switch on eSrcType in GDALCopyWords.
3365
 *    This should invoke the appropriate GDALCopyWordsFromT wrapper.
3366
 * 2. Add the data type to the switch on eDstType in GDALCopyWordsFromT.
3367
 *    This should call the appropriate GDALCopyWordsT template.
3368
 * 3. If appropriate, overload the appropriate CopyWord template in the
3369
 *    above namespace. This will ensure that any conversion issues are
3370
 *    handled (cases like the float -> int32 case, where the min/max)
3371
 *    values are subject to roundoff error.
3372
 */
3373

3374
void CPL_STDCALL GDALCopyWords64(const void *CPL_RESTRICT pSrcData,
108,220,000✔
3375
                                 GDALDataType eSrcType, int nSrcPixelStride,
3376
                                 void *CPL_RESTRICT pDstData,
3377
                                 GDALDataType eDstType, int nDstPixelStride,
3378
                                 GPtrDiff_t nWordCount)
3379

3380
{
3381
    // On platforms where alignment matters, be careful
3382
    const int nSrcDataTypeSize = GDALGetDataTypeSizeBytes(eSrcType);
108,220,000✔
3383
    const int nDstDataTypeSize = GDALGetDataTypeSizeBytes(eDstType);
108,213,000✔
3384
    if (CPL_UNLIKELY(nSrcDataTypeSize == 0 || nDstDataTypeSize == 0))
108,232,000✔
3385
    {
3386
        CPLError(CE_Failure, CPLE_NotSupported,
2✔
3387
                 "GDALCopyWords64(): unsupported GDT_Unknown/GDT_TypeCount "
3388
                 "argument");
3389
        return;
2✔
3390
    }
3391
    if (!(eSrcType == eDstType && nSrcPixelStride == nDstPixelStride) &&
108,232,000✔
3392
        ((reinterpret_cast<uintptr_t>(pSrcData) % nSrcDataTypeSize) != 0 ||
60,188,000✔
3393
         (reinterpret_cast<uintptr_t>(pDstData) % nDstDataTypeSize) != 0 ||
60,191,600✔
3394
         (nSrcPixelStride % nSrcDataTypeSize) != 0 ||
60,181,900✔
3395
         (nDstPixelStride % nDstDataTypeSize) != 0))
60,176,900✔
3396
    {
3397
        if (eSrcType == eDstType)
905✔
3398
        {
3399
            for (decltype(nWordCount) i = 0; i < nWordCount; i++)
34,800✔
3400
            {
3401
                memcpy(static_cast<GByte *>(pDstData) + nDstPixelStride * i,
34,000✔
3402
                       static_cast<const GByte *>(pSrcData) +
3403
                           nSrcPixelStride * i,
34,000✔
3404
                       nDstDataTypeSize);
3405
            }
3406
        }
3407
        else
3408
        {
3409
            const auto getAlignedPtr = [](GByte *ptr, int align)
210✔
3410
            {
3411
                return ptr +
3412
                       ((align - (reinterpret_cast<uintptr_t>(ptr) % align)) %
210✔
3413
                        align);
210✔
3414
            };
3415

3416
            // The largest we need is for CFloat64 (16 bytes), so 32 bytes to
3417
            // be sure to get correctly aligned pointer.
3418
            constexpr size_t SIZEOF_CFLOAT64 = 2 * sizeof(double);
105✔
3419
            GByte abySrcBuffer[2 * SIZEOF_CFLOAT64];
3420
            GByte abyDstBuffer[2 * SIZEOF_CFLOAT64];
3421
            GByte *pabySrcBuffer =
3422
                getAlignedPtr(abySrcBuffer, nSrcDataTypeSize);
105✔
3423
            GByte *pabyDstBuffer =
3424
                getAlignedPtr(abyDstBuffer, nDstDataTypeSize);
105✔
3425
            for (decltype(nWordCount) i = 0; i < nWordCount; i++)
3,360✔
3426
            {
3427
                memcpy(pabySrcBuffer,
3,255✔
3428
                       static_cast<const GByte *>(pSrcData) +
3429
                           nSrcPixelStride * i,
3,255✔
3430
                       nSrcDataTypeSize);
3431
                GDALCopyWords64(pabySrcBuffer, eSrcType, 0, pabyDstBuffer,
3,255✔
3432
                                eDstType, 0, 1);
3433
                memcpy(static_cast<GByte *>(pDstData) + nDstPixelStride * i,
3,255✔
3434
                       pabyDstBuffer, nDstDataTypeSize);
3435
            }
3436
        }
3437
        return;
905✔
3438
    }
3439

3440
    // Deal with the case where we're replicating a single word into the
3441
    // provided buffer
3442
    if (nSrcPixelStride == 0 && nWordCount > 1)
108,231,000✔
3443
    {
3444
        GDALReplicateWord(pSrcData, eSrcType, pDstData, eDstType,
905,472✔
3445
                          nDstPixelStride, nWordCount);
3446
        return;
905,555✔
3447
    }
3448

3449
    if (eSrcType == eDstType)
107,326,000✔
3450
    {
3451
        if (eSrcType == GDT_Byte || eSrcType == GDT_Int8)
54,011,500✔
3452
        {
3453
            GDALFastCopy(static_cast<GByte *>(pDstData), nDstPixelStride,
18,583,300✔
3454
                         static_cast<const GByte *>(pSrcData), nSrcPixelStride,
3455
                         nWordCount);
3456
            return;
18,582,100✔
3457
        }
3458

3459
        if (nSrcDataTypeSize == 2 && (nSrcPixelStride % 2) == 0 &&
35,428,200✔
3460
            (nDstPixelStride % 2) == 0)
20,957,700✔
3461
        {
3462
            GDALFastCopy(static_cast<short *>(pDstData), nDstPixelStride,
20,957,700✔
3463
                         static_cast<const short *>(pSrcData), nSrcPixelStride,
3464
                         nWordCount);
3465
            return;
20,957,500✔
3466
        }
3467

3468
        if (nWordCount == 1)
14,470,500✔
3469
        {
3470
#if defined(CSA_BUILD) || defined(__COVERITY__)
3471
            // Avoid false positives...
3472
            memcpy(pDstData, pSrcData, nSrcDataTypeSize);
3473
#else
3474
            if (nSrcDataTypeSize == 2)
14,056,700✔
3475
                memcpy(pDstData, pSrcData, 2);
×
3476
            else if (nSrcDataTypeSize == 4)
14,056,700✔
3477
                memcpy(pDstData, pSrcData, 4);
14,014,000✔
3478
            else if (nSrcDataTypeSize == 8)
42,665✔
3479
                memcpy(pDstData, pSrcData, 8);
26,148✔
3480
            else /* if( eSrcType == GDT_CFloat64 ) */
3481
                memcpy(pDstData, pSrcData, 16);
16,517✔
3482
#endif
3483
            return;
14,056,700✔
3484
        }
3485

3486
        // Let memcpy() handle the case where we're copying a packed buffer
3487
        // of pixels.
3488
        if (nSrcPixelStride == nDstPixelStride)
413,854✔
3489
        {
3490
            if (nSrcPixelStride == nSrcDataTypeSize)
259,608✔
3491
            {
3492
                memcpy(pDstData, pSrcData, nWordCount * nSrcDataTypeSize);
257,434✔
3493
                return;
257,434✔
3494
            }
3495
        }
3496
    }
3497

3498
    // Handle the more general case -- deals with conversion of data types
3499
    // directly.
3500
    switch (eSrcType)
53,470,400✔
3501
    {
3502
        case GDT_Byte:
14,961,600✔
3503
            GDALCopyWordsFromT<unsigned char>(
14,961,600✔
3504
                static_cast<const unsigned char *>(pSrcData), nSrcPixelStride,
3505
                false, pDstData, eDstType, nDstPixelStride, nWordCount);
3506
            break;
14,951,400✔
3507
        case GDT_Int8:
1,015✔
3508
            GDALCopyWordsFromT<signed char>(
1,015✔
3509
                static_cast<const signed char *>(pSrcData), nSrcPixelStride,
3510
                false, pDstData, eDstType, nDstPixelStride, nWordCount);
3511
            break;
1,015✔
3512
        case GDT_UInt16:
52,883✔
3513
            GDALCopyWordsFromT<unsigned short>(
52,883✔
3514
                static_cast<const unsigned short *>(pSrcData), nSrcPixelStride,
3515
                false, pDstData, eDstType, nDstPixelStride, nWordCount);
3516
            break;
52,883✔
3517
        case GDT_Int16:
4,512,270✔
3518
            GDALCopyWordsFromT<short>(static_cast<const short *>(pSrcData),
4,512,270✔
3519
                                      nSrcPixelStride, false, pDstData,
3520
                                      eDstType, nDstPixelStride, nWordCount);
3521
            break;
4,512,260✔
3522
        case GDT_UInt32:
6,512✔
3523
            GDALCopyWordsFromT<unsigned int>(
6,512✔
3524
                static_cast<const unsigned int *>(pSrcData), nSrcPixelStride,
3525
                false, pDstData, eDstType, nDstPixelStride, nWordCount);
3526
            break;
6,512✔
3527
        case GDT_Int32:
12,254,600✔
3528
            GDALCopyWordsFromT<int>(static_cast<const int *>(pSrcData),
12,254,600✔
3529
                                    nSrcPixelStride, false, pDstData, eDstType,
3530
                                    nDstPixelStride, nWordCount);
3531
            break;
12,254,600✔
3532
        case GDT_UInt64:
1,465✔
3533
            GDALCopyWordsFromT<std::uint64_t>(
1,465✔
3534
                static_cast<const std::uint64_t *>(pSrcData), nSrcPixelStride,
3535
                false, pDstData, eDstType, nDstPixelStride, nWordCount);
3536
            break;
1,465✔
3537
        case GDT_Int64:
7,299✔
3538
            GDALCopyWordsFromT<std::int64_t>(
7,299✔
3539
                static_cast<const std::int64_t *>(pSrcData), nSrcPixelStride,
3540
                false, pDstData, eDstType, nDstPixelStride, nWordCount);
3541
            break;
7,299✔
3542
        case GDT_Float16:
505✔
3543
            GDALCopyWordsFromT<GFloat16>(
505✔
3544
                static_cast<const GFloat16 *>(pSrcData), nSrcPixelStride, false,
3545
                pDstData, eDstType, nDstPixelStride, nWordCount);
3546
            break;
505✔
3547
        case GDT_Float32:
322,009✔
3548
            GDALCopyWordsFromT<float>(static_cast<const float *>(pSrcData),
322,009✔
3549
                                      nSrcPixelStride, false, pDstData,
3550
                                      eDstType, nDstPixelStride, nWordCount);
3551
            break;
321,982✔
3552
        case GDT_Float64:
20,678,400✔
3553
            GDALCopyWordsFromT<double>(static_cast<const double *>(pSrcData),
20,678,400✔
3554
                                       nSrcPixelStride, false, pDstData,
3555
                                       eDstType, nDstPixelStride, nWordCount);
3556
            break;
20,678,500✔
3557
        case GDT_CInt16:
566,929✔
3558
            GDALCopyWordsFromT<short>(static_cast<const short *>(pSrcData),
566,929✔
3559
                                      nSrcPixelStride, true, pDstData, eDstType,
3560
                                      nDstPixelStride, nWordCount);
3561
            break;
566,929✔
3562
        case GDT_CInt32:
365✔
3563
            GDALCopyWordsFromT<int>(static_cast<const int *>(pSrcData),
365✔
3564
                                    nSrcPixelStride, true, pDstData, eDstType,
3565
                                    nDstPixelStride, nWordCount);
3566
            break;
365✔
3567
        case GDT_CFloat16:
212✔
3568
            GDALCopyWordsFromT<GFloat16>(
212✔
3569
                static_cast<const GFloat16 *>(pSrcData), nSrcPixelStride, true,
3570
                pDstData, eDstType, nDstPixelStride, nWordCount);
3571
            break;
212✔
3572
        case GDT_CFloat32:
1,331✔
3573
            GDALCopyWordsFromT<float>(static_cast<const float *>(pSrcData),
1,331✔
3574
                                      nSrcPixelStride, true, pDstData, eDstType,
3575
                                      nDstPixelStride, nWordCount);
3576
            break;
1,331✔
3577
        case GDT_CFloat64:
172,542✔
3578
            GDALCopyWordsFromT<double>(static_cast<const double *>(pSrcData),
172,542✔
3579
                                       nSrcPixelStride, true, pDstData,
3580
                                       eDstType, nDstPixelStride, nWordCount);
3581
            break;
172,542✔
3582
        case GDT_Unknown:
×
3583
        case GDT_TypeCount:
3584
            CPLAssert(false);
×
3585
    }
3586
}
3587

3588
/************************************************************************/
3589
/*                            GDALCopyBits()                            */
3590
/************************************************************************/
3591

3592
/**
3593
 * Bitwise word copying.
3594
 *
3595
 * A function for moving sets of partial bytes around.  Loosely
3596
 * speaking this is a bitwise analog to GDALCopyWords().
3597
 *
3598
 * It copies nStepCount "words" where each word is nBitCount bits long.
3599
 * The nSrcStep and nDstStep are the number of bits from the start of one
3600
 * word to the next (same as nBitCount if they are packed).  The nSrcOffset
3601
 * and nDstOffset are the offset into the source and destination buffers
3602
 * to start at, also measured in bits.
3603
 *
3604
 * All bit offsets are assumed to start from the high order bit in a byte
3605
 * (i.e. most significant bit first).  Currently this function is not very
3606
 * optimized, but it may be improved for some common cases in the future
3607
 * as needed.
3608
 *
3609
 * @param pabySrcData the source data buffer.
3610
 * @param nSrcOffset the offset (in bits) in pabySrcData to the start of the
3611
 * first word to copy.
3612
 * @param nSrcStep the offset in bits from the start one source word to the
3613
 * start of the next.
3614
 * @param pabyDstData the destination data buffer.
3615
 * @param nDstOffset the offset (in bits) in pabyDstData to the start of the
3616
 * first word to copy over.
3617
 * @param nDstStep the offset in bits from the start one word to the
3618
 * start of the next.
3619
 * @param nBitCount the number of bits in a word to be copied.
3620
 * @param nStepCount the number of words to copy.
3621
 */
3622

3623
void GDALCopyBits(const GByte *pabySrcData, int nSrcOffset, int nSrcStep,
×
3624
                  GByte *pabyDstData, int nDstOffset, int nDstStep,
3625
                  int nBitCount, int nStepCount)
3626

3627
{
3628
    VALIDATE_POINTER0(pabySrcData, "GDALCopyBits");
×
3629

3630
    for (int iStep = 0; iStep < nStepCount; iStep++)
×
3631
    {
3632
        for (int iBit = 0; iBit < nBitCount; iBit++)
×
3633
        {
3634
            if (pabySrcData[nSrcOffset >> 3] & (0x80 >> (nSrcOffset & 7)))
×
3635
                pabyDstData[nDstOffset >> 3] |= (0x80 >> (nDstOffset & 7));
×
3636
            else
3637
                pabyDstData[nDstOffset >> 3] &= ~(0x80 >> (nDstOffset & 7));
×
3638

3639
            nSrcOffset++;
×
3640
            nDstOffset++;
×
3641
        }
3642

3643
        nSrcOffset += (nSrcStep - nBitCount);
×
3644
        nDstOffset += (nDstStep - nBitCount);
×
3645
    }
3646
}
3647

3648
/************************************************************************/
3649
/*                  GDALBandGetBestOverviewLevel()                      */
3650
/*                                                                      */
3651
/* Returns the best overview level to satisfy the query or -1 if none   */
3652
/* Also updates nXOff, nYOff, nXSize, nYSize and psExtraArg when        */
3653
/* returning a valid overview level                                     */
3654
/************************************************************************/
3655

3656
// Same as GDALBandGetBestOverviewLevel2(), called without any
// GDALRasterIOExtraArg.
int GDALBandGetBestOverviewLevel(GDALRasterBand *poBand, int &nXOff, int &nYOff,
                                 int &nXSize, int &nYSize, int nBufXSize,
                                 int nBufYSize)
{
    GDALRasterIOExtraArg *const psNoExtraArg = nullptr;
    return GDALBandGetBestOverviewLevel2(poBand, nXOff, nYOff, nXSize, nYSize,
                                         nBufXSize, nBufYSize, psNoExtraArg);
}
3663

3664
int GDALBandGetBestOverviewLevel2(GDALRasterBand *poBand, int &nXOff,
322,828✔
3665
                                  int &nYOff, int &nXSize, int &nYSize,
3666
                                  int nBufXSize, int nBufYSize,
3667
                                  GDALRasterIOExtraArg *psExtraArg)
3668
{
3669
    /* -------------------------------------------------------------------- */
3670
    /*      Compute the desired downsampling factor.  It is                 */
3671
    /*      based on the least reduced axis, and represents the number      */
3672
    /*      of source pixels to one destination pixel.                      */
3673
    /* -------------------------------------------------------------------- */
3674
    const double dfDesiredDownsamplingFactor =
322,828✔
3675
        ((nXSize / static_cast<double>(nBufXSize)) <
322,828✔
3676
             (nYSize / static_cast<double>(nBufYSize)) ||
160,491✔
3677
         nBufYSize == 1)
3678
            ? nXSize / static_cast<double>(nBufXSize)
354,204✔
3679
            : nYSize / static_cast<double>(nBufYSize);
129,115✔
3680

3681
    /* -------------------------------------------------------------------- */
3682
    /*      Find the overview level that largest downsampling factor (most  */
3683
    /*      downsampled) that is still less than (or only a little more)    */
3684
    /*      downsampled than the request.                                   */
3685
    /* -------------------------------------------------------------------- */
3686
    const int nOverviewCount = poBand->GetOverviewCount();
322,828✔
3687
    GDALRasterBand *poBestOverview = nullptr;
322,828✔
3688
    double dfBestDownsamplingFactor = 0;
322,828✔
3689
    int nBestOverviewLevel = -1;
322,828✔
3690

3691
    const char *pszOversampligThreshold =
3692
        CPLGetConfigOption("GDAL_OVERVIEW_OVERSAMPLING_THRESHOLD", nullptr);
322,828✔
3693

3694
    // Note: keep this logic for overview selection in sync between
3695
    // gdalwarp_lib.cpp and rasterio.cpp
3696
    // Cf https://github.com/OSGeo/gdal/pull/9040#issuecomment-1898524693
3697
    const double dfOversamplingThreshold =
3698
        pszOversampligThreshold ? CPLAtof(pszOversampligThreshold)
645,647✔
3699
        : psExtraArg && psExtraArg->eResampleAlg != GRIORA_NearestNeighbour
322,819✔
3700
            ? 1.0
645,638✔
3701
            : 1.2;
322,828✔
3702
    for (int iOverview = 0; iOverview < nOverviewCount; iOverview++)
325,518✔
3703
    {
3704
        GDALRasterBand *poOverview = poBand->GetOverview(iOverview);
5,527✔
3705
        if (poOverview == nullptr ||
11,054✔
3706
            poOverview->GetXSize() > poBand->GetXSize() ||
11,053✔
3707
            poOverview->GetYSize() > poBand->GetYSize())
5,526✔
3708
        {
3709
            continue;
1✔
3710
        }
3711

3712
        // Compute downsampling factor of this overview
3713
        const double dfDownsamplingFactor = std::min(
3714
            poBand->GetXSize() / static_cast<double>(poOverview->GetXSize()),
5,526✔
3715
            poBand->GetYSize() / static_cast<double>(poOverview->GetYSize()));
11,052✔
3716

3717
        // Is it nearly the requested factor and better (lower) than
3718
        // the current best factor?
3719
        // Use an epsilon because of numerical instability.
3720
        constexpr double EPSILON = 1e-1;
5,526✔
3721
        if (dfDownsamplingFactor >=
5,634✔
3722
                dfDesiredDownsamplingFactor * dfOversamplingThreshold +
5,526✔
3723
                    EPSILON ||
5,418✔
3724
            dfDownsamplingFactor <= dfBestDownsamplingFactor)
3725
        {
3726
            continue;
108✔
3727
        }
3728

3729
        // Ignore AVERAGE_BIT2GRAYSCALE overviews for RasterIO purposes.
3730
        const char *pszResampling = poOverview->GetMetadataItem("RESAMPLING");
5,418✔
3731

3732
        if (pszResampling != nullptr &&
5,418✔
3733
            STARTS_WITH_CI(pszResampling, "AVERAGE_BIT2"))
71✔
3734
            continue;
16✔
3735

3736
        // OK, this is our new best overview.
3737
        poBestOverview = poOverview;
5,402✔
3738
        nBestOverviewLevel = iOverview;
5,402✔
3739
        dfBestDownsamplingFactor = dfDownsamplingFactor;
5,402✔
3740

3741
        if (std::abs(dfDesiredDownsamplingFactor - dfDownsamplingFactor) <
5,402✔
3742
            EPSILON)
3743
        {
3744
            break;
2,837✔
3745
        }
3746
    }
3747

3748
    /* -------------------------------------------------------------------- */
3749
    /*      If we didn't find an overview that helps us, just return        */
3750
    /*      indicating failure and the full resolution image will be used.  */
3751
    /* -------------------------------------------------------------------- */
3752
    if (nBestOverviewLevel < 0)
322,828✔
3753
        return -1;
319,924✔
3754

3755
    /* -------------------------------------------------------------------- */
3756
    /*      Recompute the source window in terms of the selected            */
3757
    /*      overview.                                                       */
3758
    /* -------------------------------------------------------------------- */
3759
    const double dfXFactor =
3760
        poBand->GetXSize() / static_cast<double>(poBestOverview->GetXSize());
2,904✔
3761
    const double dfYFactor =
3762
        poBand->GetYSize() / static_cast<double>(poBestOverview->GetYSize());
2,904✔
3763
    CPLDebug("GDAL", "Selecting overview %d x %d", poBestOverview->GetXSize(),
2,904✔
3764
             poBestOverview->GetYSize());
3765

3766
    const int nOXOff = std::min(poBestOverview->GetXSize() - 1,
8,712✔
3767
                                static_cast<int>(nXOff / dfXFactor + 0.5));
2,904✔
3768
    const int nOYOff = std::min(poBestOverview->GetYSize() - 1,
8,712✔
3769
                                static_cast<int>(nYOff / dfYFactor + 0.5));
2,904✔
3770
    int nOXSize = std::max(1, static_cast<int>(nXSize / dfXFactor + 0.5));
2,904✔
3771
    int nOYSize = std::max(1, static_cast<int>(nYSize / dfYFactor + 0.5));
2,904✔
3772
    if (nOXOff + nOXSize > poBestOverview->GetXSize())
2,904✔
3773
        nOXSize = poBestOverview->GetXSize() - nOXOff;
×
3774
    if (nOYOff + nOYSize > poBestOverview->GetYSize())
2,904✔
3775
        nOYSize = poBestOverview->GetYSize() - nOYOff;
2✔
3776

3777
    if (psExtraArg)
2,904✔
3778
    {
3779
        if (psExtraArg->bFloatingPointWindowValidity)
2,904✔
3780
        {
3781
            psExtraArg->dfXOff /= dfXFactor;
45✔
3782
            psExtraArg->dfXSize /= dfXFactor;
45✔
3783
            psExtraArg->dfYOff /= dfYFactor;
45✔
3784
            psExtraArg->dfYSize /= dfYFactor;
45✔
3785
        }
3786
        else if (psExtraArg->eResampleAlg != GRIORA_NearestNeighbour)
2,859✔
3787
        {
3788
            psExtraArg->bFloatingPointWindowValidity = true;
16✔
3789
            psExtraArg->dfXOff = nXOff / dfXFactor;
16✔
3790
            psExtraArg->dfXSize = nXSize / dfXFactor;
16✔
3791
            psExtraArg->dfYOff = nYOff / dfYFactor;
16✔
3792
            psExtraArg->dfYSize = nYSize / dfYFactor;
16✔
3793
        }
3794
    }
3795

3796
    nXOff = nOXOff;
2,904✔
3797
    nYOff = nOYOff;
2,904✔
3798
    nXSize = nOXSize;
2,904✔
3799
    nYSize = nOYSize;
2,904✔
3800

3801
    return nBestOverviewLevel;
2,904✔
3802
}
3803

3804
/************************************************************************/
3805
/*                          OverviewRasterIO()                          */
3806
/*                                                                      */
3807
/*      Special work function to utilize available overviews to         */
3808
/*      more efficiently satisfy downsampled requests.  It will         */
3809
/*      return CE_Failure if there are no appropriate overviews         */
3810
/*      available but it doesn't emit any error messages.               */
3811
/************************************************************************/
3812

3813
//! @cond Doxygen_Suppress
3814
CPLErr GDALRasterBand::OverviewRasterIO(
    GDALRWFlag eRWFlag, int nXOff, int nYOff, int nXSize, int nYSize,
    void *pData, int nBufXSize, int nBufYSize, GDALDataType eBufType,
    GSpacing nPixelSpace, GSpacing nLineSpace, GDALRasterIOExtraArg *psExtraArg)
{
    // Operate on a private copy of the extra arguments: the overview
    // selection below rescales the floating-point window it is handed, and
    // the caller's structure must stay untouched.
    GDALRasterIOExtraArg sOvrExtraArg;
    GDALCopyRasterIOExtraArg(&sOvrExtraArg, psExtraArg);

    // On success the window (nXOff, nYOff, nXSize, nYSize) is re-expressed
    // in the pixel space of the selected overview.
    const int iLevel = GDALBandGetBestOverviewLevel2(
        this, nXOff, nYOff, nXSize, nYSize, nBufXSize, nBufYSize,
        &sOvrExtraArg);

    // Fail silently both when no level is suitable and when the selected
    // level cannot be fetched.
    GDALRasterBand *const poOvrBand =
        (iLevel >= 0) ? GetOverview(iLevel) : nullptr;
    if (!poOvrBand)
        return CE_Failure;

    // Recast the request in terms of the overview band.
    return poOvrBand->RasterIO(eRWFlag, nXOff, nYOff, nXSize, nYSize, pData,
                               nBufXSize, nBufYSize, eBufType, nPixelSpace,
                               nLineSpace, &sOvrExtraArg);
}
3839

3840
/************************************************************************/
3841
/*                      TryOverviewRasterIO()                           */
3842
/************************************************************************/
3843

3844
CPLErr GDALRasterBand::TryOverviewRasterIO(
    GDALRWFlag eRWFlag, int nXOff, int nYOff, int nXSize, int nYSize,
    void *pData, int nBufXSize, int nBufYSize, GDALDataType eBufType,
    GSpacing nPixelSpace, GSpacing nLineSpace, GDALRasterIOExtraArg *psExtraArg,
    int *pbTried)
{
    // Private copy of the extra arguments, so that the rescaling done by
    // the overview selection never leaks back to the caller.
    GDALRasterIOExtraArg sOvrExtraArg;
    GDALCopyRasterIOExtraArg(&sOvrExtraArg, psExtraArg);

    // Window expressed in overview pixels once a level has been selected.
    int nOvrXOff = nXOff;
    int nOvrYOff = nYOff;
    int nOvrXSize = nXSize;
    int nOvrYSize = nYSize;

    const int iLevel = GDALBandGetBestOverviewLevel2(
        this, nOvrXOff, nOvrYOff, nOvrXSize, nOvrYSize, nBufXSize, nBufYSize,
        &sOvrExtraArg);

    GDALRasterBand *const poOvrBand =
        (iLevel >= 0) ? GetOverview(iLevel) : nullptr;
    if (poOvrBand == nullptr)
    {
        // Nothing usable: report "not tried" (which is not an error) so the
        // caller can go on with its regular code path.
        *pbTried = FALSE;
        return CE_None;
    }

    // An overview was used: the returned status is the one of the I/O on it.
    *pbTried = TRUE;
    return poOvrBand->RasterIO(eRWFlag, nOvrXOff, nOvrYOff, nOvrXSize,
                               nOvrYSize, pData, nBufXSize, nBufYSize,
                               eBufType, nPixelSpace, nLineSpace,
                               &sOvrExtraArg);
}
3878

3879
/************************************************************************/
3880
/*                      TryOverviewRasterIO()                           */
3881
/************************************************************************/
3882

3883
CPLErr GDALDataset::TryOverviewRasterIO(
    GDALRWFlag eRWFlag, int nXOff, int nYOff, int nXSize, int nYSize,
    void *pData, int nBufXSize, int nBufYSize, GDALDataType eBufType,
    int nBandCount, const int *panBandMap, GSpacing nPixelSpace,
    GSpacing nLineSpace, GSpacing nBandSpace, GDALRasterIOExtraArg *psExtraArg,
    int *pbTried)
{
    // Window re-expressed in overview pixels once a level has been selected.
    int nXOffMod = nXOff;
    int nYOffMod = nYOff;
    int nXSizeMod = nXSize;
    int nYSizeMod = nYSize;

    // Private copy: the overview selection rescales the floating-point
    // window of the structure it receives.
    GDALRasterIOExtraArg sExtraArg;
    GDALCopyRasterIOExtraArg(&sExtraArg, psExtraArg);

    // The first band of the dataset drives the choice of the overview level.
    GDALRasterBand *const poFirstBand = papoBands[0];
    const int iOvrLevel = GDALBandGetBestOverviewLevel2(
        poFirstBand, nXOffMod, nYOffMod, nXSizeMod, nYSizeMod, nBufXSize,
        nBufYSize, &sExtraArg);

    // Resolve the overview band and its owning dataset exactly once, so
    // that the pointers that are null-checked are the very ones that get
    // dereferenced afterwards (and to avoid repeated virtual lookups).
    GDALRasterBand *const poOvrBand =
        (iOvrLevel >= 0) ? poFirstBand->GetOverview(iOvrLevel) : nullptr;
    GDALDataset *const poOvrDS =
        (poOvrBand != nullptr) ? poOvrBand->GetDataset() : nullptr;

    if (poOvrDS == nullptr)
    {
        // No usable overview dataset: not an error, the caller is expected
        // to proceed with its regular code path.
        *pbTried = FALSE;
        return CE_None;
    }

    *pbTried = TRUE;
    return poOvrDS->RasterIO(eRWFlag, nXOffMod, nYOffMod, nXSizeMod,
                             nYSizeMod, pData, nBufXSize, nBufYSize, eBufType,
                             nBandCount, panBandMap, nPixelSpace, nLineSpace,
                             nBandSpace, &sExtraArg);
}
3916

3917
/************************************************************************/
3918
/*                        GetBestOverviewLevel()                        */
3919
/*                                                                      */
3920
/* Returns the best overview level to satisfy the query or -1 if none   */
3921
/* Also updates nXOff, nYOff, nXSize, nYSize when returning a valid     */
3922
/* overview level                                                       */
3923
/************************************************************************/
3924

3925
static int GDALDatasetGetBestOverviewLevel(GDALDataset *poDS, int &nXOff,
4✔
3926
                                           int &nYOff, int &nXSize, int &nYSize,
3927
                                           int nBufXSize, int nBufYSize,
3928
                                           int nBandCount,
3929
                                           const int *panBandMap,
3930
                                           GDALRasterIOExtraArg *psExtraArg)
3931
{
3932
    int nOverviewCount = 0;
4✔
3933
    GDALRasterBand *poFirstBand = nullptr;
4✔
3934

3935
    /* -------------------------------------------------------------------- */
3936
    /* Check that all bands have the same number of overviews and           */
3937
    /* that they have all the same size and block dimensions                */
3938
    /* -------------------------------------------------------------------- */
3939
    for (int iBand = 0; iBand < nBandCount; iBand++)
12✔
3940
    {
3941
        GDALRasterBand *poBand = poDS->GetRasterBand(panBandMap[iBand]);
8✔
3942
        if (poBand == nullptr)
8✔
3943
            return -1;
×
3944
        if (iBand == 0)
8✔
3945
        {
3946
            poFirstBand = poBand;
4✔
3947
            nOverviewCount = poBand->GetOverviewCount();
4✔
3948
        }
3949
        else if (nOverviewCount != poBand->GetOverviewCount())
4✔
3950
        {
3951
            CPLDebug("GDAL", "GDALDataset::GetBestOverviewLevel() ... "
×
3952
                             "mismatched overview count, use std method.");
3953
            return -1;
×
3954
        }
3955
        else
3956
        {
3957
            for (int iOverview = 0; iOverview < nOverviewCount; iOverview++)
4✔
3958
            {
3959
                GDALRasterBand *poOvrBand = poBand->GetOverview(iOverview);
×
3960
                GDALRasterBand *poOvrFirstBand =
3961
                    poFirstBand->GetOverview(iOverview);
×
3962
                if (poOvrBand == nullptr || poOvrFirstBand == nullptr)
×
3963
                    continue;
×
3964

3965
                if (poOvrFirstBand->GetXSize() != poOvrBand->GetXSize() ||
×
3966
                    poOvrFirstBand->GetYSize() != poOvrBand->GetYSize())
×
3967
                {
3968
                    CPLDebug("GDAL",
×
3969
                             "GDALDataset::GetBestOverviewLevel() ... "
3970
                             "mismatched overview sizes, use std method.");
3971
                    return -1;
×
3972
                }
3973
                int nBlockXSizeFirst = 0;
×
3974
                int nBlockYSizeFirst = 0;
×
3975
                poOvrFirstBand->GetBlockSize(&nBlockXSizeFirst,
×
3976
                                             &nBlockYSizeFirst);
3977

3978
                int nBlockXSizeCurrent = 0;
×
3979
                int nBlockYSizeCurrent = 0;
×
3980
                poOvrBand->GetBlockSize(&nBlockXSizeCurrent,
×
3981
                                        &nBlockYSizeCurrent);
3982

3983
                if (nBlockXSizeFirst != nBlockXSizeCurrent ||
×
3984
                    nBlockYSizeFirst != nBlockYSizeCurrent)
×
3985
                {
3986
                    CPLDebug("GDAL", "GDALDataset::GetBestOverviewLevel() ... "
×
3987
                                     "mismatched block sizes, use std method.");
3988
                    return -1;
×
3989
                }
3990
            }
3991
        }
3992
    }
3993
    if (poFirstBand == nullptr)
4✔
3994
        return -1;
×
3995

3996
    return GDALBandGetBestOverviewLevel2(poFirstBand, nXOff, nYOff, nXSize,
4✔
3997
                                         nYSize, nBufXSize, nBufYSize,
3998
                                         psExtraArg);
4✔
3999
}
4000

4001
/************************************************************************/
4002
/*                         BlockBasedRasterIO()                         */
4003
/*                                                                      */
4004
/*      This convenience function implements a dataset level            */
4005
/*      RasterIO() interface based on calling down to fetch blocks,     */
4006
/*      much like the GDALRasterBand::IRasterIO(), but it handles       */
4007
/*      all bands at once, so that a format driver that handles a       */
4008
/*      request for different bands of the same block efficiently       */
4009
/*      (i.e. without re-reading interleaved data) will perform well.   */
4010
/*                                                                      */
4011
/*      This method is intended to be called by an overridden           */
4012
/*      IRasterIO() method in the driver specific GDALDataset           */
4013
/*      derived class.                                                  */
4014
/*                                                                      */
4015
/*      Default internal implementation of RasterIO() ... utilizes      */
4016
/*      the Block access methods to satisfy the request.  This would    */
4017
/*      normally only be overridden by formats with overviews.          */
4018
/*                                                                      */
4019
/*      To keep things relatively simple, this method does not          */
4020
/*      currently take advantage of some special cases addressed in     */
4021
/*      GDALRasterBand::IRasterIO(), so it is likely best to only       */
4022
/*      call it when you know it will help.  That is in cases where     */
4023
/*      data is at 1:1 to the buffer, and you know the driver is        */
4024
/*      implementing interleaved IO efficiently on a block by block     */
4025
/*      basis. Overviews will be used when possible.                    */
4026
/************************************************************************/
4027

4028
CPLErr GDALDataset::BlockBasedRasterIO(
63,630✔
4029
    GDALRWFlag eRWFlag, int nXOff, int nYOff, int nXSize, int nYSize,
4030
    void *pData, int nBufXSize, int nBufYSize, GDALDataType eBufType,
4031
    int nBandCount, const int *panBandMap, GSpacing nPixelSpace,
4032
    GSpacing nLineSpace, GSpacing nBandSpace, GDALRasterIOExtraArg *psExtraArg)
4033

4034
{
4035
    CPLAssert(nullptr != pData);
63,630✔
4036

4037
    GByte **papabySrcBlock = nullptr;
63,630✔
4038
    GDALRasterBlock *poBlock = nullptr;
63,630✔
4039
    GDALRasterBlock **papoBlocks = nullptr;
63,630✔
4040
    int nLBlockX = -1;
63,630✔
4041
    int nLBlockY = -1;
63,630✔
4042
    int iBufYOff;
4043
    int iBufXOff;
4044
    int nBlockXSize = 1;
63,630✔
4045
    int nBlockYSize = 1;
63,630✔
4046
    CPLErr eErr = CE_None;
63,630✔
4047
    GDALDataType eDataType = GDT_Byte;
63,630✔
4048

4049
    const bool bUseIntegerRequestCoords =
63,630✔
4050
        (!psExtraArg->bFloatingPointWindowValidity ||
64,066✔
4051
         (nXOff == psExtraArg->dfXOff && nYOff == psExtraArg->dfYOff &&
436✔
4052
          nXSize == psExtraArg->dfXSize && nYSize == psExtraArg->dfYSize));
433✔
4053

4054
    /* -------------------------------------------------------------------- */
4055
    /*      Ensure that all bands share a common block size and data type.  */
4056
    /* -------------------------------------------------------------------- */
4057
    for (int iBand = 0; iBand < nBandCount; iBand++)
301,203✔
4058
    {
4059
        GDALRasterBand *poBand = GetRasterBand(panBandMap[iBand]);
237,573✔
4060

4061
        if (iBand == 0)
237,569✔
4062
        {
4063
            poBand->GetBlockSize(&nBlockXSize, &nBlockYSize);
63,630✔
4064
            eDataType = poBand->GetRasterDataType();
63,629✔
4065
        }
4066
        else
4067
        {
4068
            int nThisBlockXSize = 0;
173,939✔
4069
            int nThisBlockYSize = 0;
173,939✔
4070
            poBand->GetBlockSize(&nThisBlockXSize, &nThisBlockYSize);
173,939✔
4071
            if (nThisBlockXSize != nBlockXSize ||
173,941✔
4072
                nThisBlockYSize != nBlockYSize)
173,941✔
4073
            {
4074
                CPLDebug("GDAL", "GDALDataset::BlockBasedRasterIO() ... "
×
4075
                                 "mismatched block sizes, use std method.");
4076
                return BandBasedRasterIO(eRWFlag, nXOff, nYOff, nXSize, nYSize,
×
4077
                                         pData, nBufXSize, nBufYSize, eBufType,
4078
                                         nBandCount, panBandMap, nPixelSpace,
4079
                                         nLineSpace, nBandSpace, psExtraArg);
×
4080
            }
4081

4082
            if (eDataType != poBand->GetRasterDataType() &&
173,941✔
4083
                (nXSize != nBufXSize || nYSize != nBufYSize))
×
4084
            {
4085
                CPLDebug("GDAL", "GDALDataset::BlockBasedRasterIO() ... "
×
4086
                                 "mismatched band data types, use std method.");
4087
                return BandBasedRasterIO(eRWFlag, nXOff, nYOff, nXSize, nYSize,
×
4088
                                         pData, nBufXSize, nBufYSize, eBufType,
4089
                                         nBandCount, panBandMap, nPixelSpace,
4090
                                         nLineSpace, nBandSpace, psExtraArg);
×
4091
            }
4092
        }
4093
    }
4094

4095
    /* ==================================================================== */
4096
    /*      In this special case at full resolution we step through in      */
4097
    /*      blocks, turning the request over to the per-band                */
4098
    /*      IRasterIO(), but ensuring that all bands of one block are       */
4099
    /*      called before proceeding to the next.                           */
4100
    /* ==================================================================== */
4101

4102
    if (nXSize == nBufXSize && nYSize == nBufYSize && bUseIntegerRequestCoords)
63,630✔
4103
    {
4104
        GDALRasterIOExtraArg sDummyExtraArg;
4105
        INIT_RASTERIO_EXTRA_ARG(sDummyExtraArg);
63,625✔
4106

4107
        int nChunkYSize = 0;
63,625✔
4108
        int nChunkXSize = 0;
63,625✔
4109

4110
        for (iBufYOff = 0; iBufYOff < nBufYSize; iBufYOff += nChunkYSize)
223,111✔
4111
        {
4112
            const int nChunkYOff = iBufYOff + nYOff;
160,514✔
4113
            nChunkYSize = nBlockYSize - (nChunkYOff % nBlockYSize);
160,514✔
4114
            if (nChunkYOff + nChunkYSize > nYOff + nYSize)
160,514✔
4115
                nChunkYSize = (nYOff + nYSize) - nChunkYOff;
58,867✔
4116

4117
            for (iBufXOff = 0; iBufXOff < nBufXSize; iBufXOff += nChunkXSize)
844,889✔
4118
            {
4119
                const int nChunkXOff = iBufXOff + nXOff;
685,389✔
4120
                nChunkXSize = nBlockXSize - (nChunkXOff % nBlockXSize);
685,389✔
4121
                if (nChunkXOff + nChunkXSize > nXOff + nXSize)
685,389✔
4122
                    nChunkXSize = (nXOff + nXSize) - nChunkXOff;
75,875✔
4123

4124
                GByte *pabyChunkData =
685,389✔
4125
                    static_cast<GByte *>(pData) + iBufXOff * nPixelSpace +
685,389✔
4126
                    static_cast<GPtrDiff_t>(iBufYOff) * nLineSpace;
685,389✔
4127

4128
                for (int iBand = 0; iBand < nBandCount; iBand++)
3,327,810✔
4129
                {
4130
                    GDALRasterBand *poBand = GetRasterBand(panBandMap[iBand]);
2,643,440✔
4131

4132
                    eErr = poBand->IRasterIO(
5,286,860✔
4133
                        eRWFlag, nChunkXOff, nChunkYOff, nChunkXSize,
4134
                        nChunkYSize,
4135
                        pabyChunkData +
2,643,410✔
4136
                            static_cast<GPtrDiff_t>(iBand) * nBandSpace,
2,643,410✔
4137
                        nChunkXSize, nChunkYSize, eBufType, nPixelSpace,
4138
                        nLineSpace, &sDummyExtraArg);
2,643,410✔
4139
                    if (eErr != CE_None)
2,643,450✔
4140
                        return eErr;
1,028✔
4141
                }
4142
            }
4143

4144
            if (psExtraArg->pfnProgress != nullptr &&
179,769✔
4145
                !psExtraArg->pfnProgress(
20,269✔
4146
                    1.0 * std::min(nBufYSize, iBufYOff + nChunkYSize) /
179,769✔
4147
                        nBufYSize,
4148
                    "", psExtraArg->pProgressData))
4149
            {
4150
                return CE_Failure;
18✔
4151
            }
4152
        }
4153

4154
        return CE_None;
62,597✔
4155
    }
4156

4157
    /* Below code is not compatible with that case. It would need a complete */
4158
    /* separate code like done in GDALRasterBand::IRasterIO. */
4159
    if (eRWFlag == GF_Write && (nBufXSize < nXSize || nBufYSize < nYSize))
5✔
4160
    {
4161
        return BandBasedRasterIO(eRWFlag, nXOff, nYOff, nXSize, nYSize, pData,
×
4162
                                 nBufXSize, nBufYSize, eBufType, nBandCount,
4163
                                 panBandMap, nPixelSpace, nLineSpace,
4164
                                 nBandSpace, psExtraArg);
×
4165
    }
4166

4167
    /* We could have a smarter implementation, but that will do for now */
4168
    if (psExtraArg->eResampleAlg != GRIORA_NearestNeighbour &&
5✔
4169
        (nBufXSize != nXSize || nBufYSize != nYSize))
×
4170
    {
4171
        return BandBasedRasterIO(eRWFlag, nXOff, nYOff, nXSize, nYSize, pData,
×
4172
                                 nBufXSize, nBufYSize, eBufType, nBandCount,
4173
                                 panBandMap, nPixelSpace, nLineSpace,
4174
                                 nBandSpace, psExtraArg);
×
4175
    }
4176

4177
    /* ==================================================================== */
4178
    /*      Loop reading required source blocks to satisfy output           */
4179
    /*      request.  This is the most general implementation.              */
4180
    /* ==================================================================== */
4181

4182
    const int nBandDataSize = GDALGetDataTypeSizeBytes(eDataType);
5✔
4183

4184
    papabySrcBlock =
4185
        static_cast<GByte **>(CPLCalloc(sizeof(GByte *), nBandCount));
4✔
4186
    papoBlocks =
4187
        static_cast<GDALRasterBlock **>(CPLCalloc(sizeof(void *), nBandCount));
4✔
4188

4189
    /* -------------------------------------------------------------------- */
4190
    /*      Select an overview level if appropriate.                        */
4191
    /* -------------------------------------------------------------------- */
4192

4193
    GDALRasterIOExtraArg sExtraArg;
4194
    GDALCopyRasterIOExtraArg(&sExtraArg, psExtraArg);
4✔
4195
    const int nOverviewLevel = GDALDatasetGetBestOverviewLevel(
4✔
4196
        this, nXOff, nYOff, nXSize, nYSize, nBufXSize, nBufYSize, nBandCount,
4197
        panBandMap, &sExtraArg);
4198
    if (nOverviewLevel >= 0)
4✔
4199
    {
4200
        GetRasterBand(panBandMap[0])
2✔
4201
            ->GetOverview(nOverviewLevel)
2✔
4202
            ->GetBlockSize(&nBlockXSize, &nBlockYSize);
2✔
4203
    }
4204

4205
    double dfXOff = nXOff;
4✔
4206
    double dfYOff = nYOff;
4✔
4207
    double dfXSize = nXSize;
4✔
4208
    double dfYSize = nYSize;
4✔
4209
    if (sExtraArg.bFloatingPointWindowValidity)
4✔
4210
    {
4211
        dfXOff = sExtraArg.dfXOff;
2✔
4212
        dfYOff = sExtraArg.dfYOff;
2✔
4213
        dfXSize = sExtraArg.dfXSize;
2✔
4214
        dfYSize = sExtraArg.dfYSize;
2✔
4215
    }
4216

4217
    /* -------------------------------------------------------------------- */
4218
    /*      Compute stepping increment.                                     */
4219
    /* -------------------------------------------------------------------- */
4220
    const double dfSrcXInc = dfXSize / static_cast<double>(nBufXSize);
4✔
4221
    const double dfSrcYInc = dfYSize / static_cast<double>(nBufYSize);
4✔
4222

4223
    constexpr double EPS = 1e-10;
4✔
4224
    /* -------------------------------------------------------------------- */
4225
    /*      Loop over buffer computing source locations.                    */
4226
    /* -------------------------------------------------------------------- */
4227
    for (iBufYOff = 0; iBufYOff < nBufYSize; iBufYOff++)
36✔
4228
    {
4229
        GPtrDiff_t iSrcOffset;
4230

4231
        // Add small epsilon to avoid some numeric precision issues.
4232
        const double dfSrcY = (iBufYOff + 0.5) * dfSrcYInc + dfYOff + EPS;
32✔
4233
        const int iSrcY = static_cast<int>(std::min(
32✔
4234
            std::max(0.0, dfSrcY), static_cast<double>(nRasterYSize - 1)));
32✔
4235

4236
        GPtrDiff_t iBufOffset = static_cast<GPtrDiff_t>(iBufYOff) *
32✔
4237
                                static_cast<GPtrDiff_t>(nLineSpace);
4238

4239
        for (iBufXOff = 0; iBufXOff < nBufXSize; iBufXOff++)
302✔
4240
        {
4241
            const double dfSrcX = (iBufXOff + 0.5) * dfSrcXInc + dfXOff + EPS;
270✔
4242
            const int iSrcX = static_cast<int>(std::min(
270✔
4243
                std::max(0.0, dfSrcX), static_cast<double>(nRasterXSize - 1)));
270✔
4244

4245
            // FIXME: this code likely doesn't work if the dirty block gets
4246
            // flushed to disk before being completely written. In the meantime,
4247
            // bJustInitialize should probably be set to FALSE even if it is not
4248
            // ideal performance wise, and for lossy compression
4249

4250
            /* --------------------------------------------------------------------
4251
             */
4252
            /*      Ensure we have the appropriate block loaded. */
4253
            /* --------------------------------------------------------------------
4254
             */
4255
            if (iSrcX < nLBlockX * nBlockXSize ||
270✔
4256
                iSrcX - nBlockXSize >= nLBlockX * nBlockXSize ||
270✔
4257
                iSrcY < nLBlockY * nBlockYSize ||
266✔
4258
                iSrcY - nBlockYSize >= nLBlockY * nBlockYSize)
266✔
4259
            {
4260
                nLBlockX = iSrcX / nBlockXSize;
4✔
4261
                nLBlockY = iSrcY / nBlockYSize;
4✔
4262

4263
                const bool bJustInitialize =
4✔
4264
                    eRWFlag == GF_Write && nYOff <= nLBlockY * nBlockYSize &&
×
4265
                    nYOff + nYSize - nBlockYSize >= nLBlockY * nBlockYSize &&
×
4266
                    nXOff <= nLBlockX * nBlockXSize &&
4✔
4267
                    nXOff + nXSize - nBlockXSize >= nLBlockX * nBlockXSize;
×
4268
                /*bool bMemZeroBuffer = FALSE;
4269
                if( eRWFlag == GF_Write && !bJustInitialize &&
4270
                    nXOff <= nLBlockX * nBlockXSize &&
4271
                    nYOff <= nLBlockY * nBlockYSize &&
4272
                    (nXOff + nXSize >= (nLBlockX+1) * nBlockXSize ||
4273
                     (nXOff + nXSize == GetRasterXSize() &&
4274
                     (nLBlockX+1) * nBlockXSize > GetRasterXSize())) &&
4275
                    (nYOff + nYSize >= (nLBlockY+1) * nBlockYSize ||
4276
                     (nYOff + nYSize == GetRasterYSize() &&
4277
                     (nLBlockY+1) * nBlockYSize > GetRasterYSize())) )
4278
                {
4279
                    bJustInitialize = TRUE;
4280
                    bMemZeroBuffer = TRUE;
4281
                }*/
4282
                for (int iBand = 0; iBand < nBandCount; iBand++)
12✔
4283
                {
4284
                    GDALRasterBand *poBand = GetRasterBand(panBandMap[iBand]);
8✔
4285
                    if (nOverviewLevel >= 0)
8✔
4286
                        poBand = poBand->GetOverview(nOverviewLevel);
2✔
4287
                    poBlock = poBand->GetLockedBlockRef(nLBlockX, nLBlockY,
16✔
4288
                                                        bJustInitialize);
8✔
4289
                    if (poBlock == nullptr)
8✔
4290
                    {
4291
                        eErr = CE_Failure;
×
4292
                        goto CleanupAndReturn;
×
4293
                    }
4294

4295
                    if (eRWFlag == GF_Write)
8✔
4296
                        poBlock->MarkDirty();
×
4297

4298
                    if (papoBlocks[iBand] != nullptr)
8✔
4299
                        papoBlocks[iBand]->DropLock();
×
4300

4301
                    papoBlocks[iBand] = poBlock;
8✔
4302

4303
                    papabySrcBlock[iBand] =
8✔
4304
                        static_cast<GByte *>(poBlock->GetDataRef());
8✔
4305
                    /*if( bMemZeroBuffer )
4306
                    {
4307
                        memset(papabySrcBlock[iBand], 0,
4308
                            static_cast<GPtrDiff_t>(nBandDataSize) * nBlockXSize
4309
                    * nBlockYSize);
4310
                    }*/
4311
                }
4312
            }
4313

4314
            /* --------------------------------------------------------------------
4315
             */
4316
            /*      Copy over this pixel of data. */
4317
            /* --------------------------------------------------------------------
4318
             */
4319
            iSrcOffset = (static_cast<GPtrDiff_t>(iSrcX) -
270✔
4320
                          static_cast<GPtrDiff_t>(nLBlockX) * nBlockXSize +
270✔
4321
                          (static_cast<GPtrDiff_t>(iSrcY) -
270✔
4322
                           static_cast<GPtrDiff_t>(nLBlockY) * nBlockYSize) *
270✔
4323
                              nBlockXSize) *
270✔
4324
                         nBandDataSize;
270✔
4325

4326
            for (int iBand = 0; iBand < nBandCount; iBand++)
980✔
4327
            {
4328
                GByte *pabySrcBlock = papabySrcBlock[iBand];
710✔
4329
                GPtrDiff_t iBandBufOffset =
710✔
4330
                    iBufOffset + static_cast<GPtrDiff_t>(iBand) *
710✔
4331
                                     static_cast<GPtrDiff_t>(nBandSpace);
4332

4333
                if (eDataType == eBufType)
710✔
4334
                {
4335
                    if (eRWFlag == GF_Read)
710✔
4336
                        memcpy(static_cast<GByte *>(pData) + iBandBufOffset,
710✔
4337
                               pabySrcBlock + iSrcOffset, nBandDataSize);
710✔
4338
                    else
4339
                        memcpy(pabySrcBlock + iSrcOffset,
×
4340
                               static_cast<const GByte *>(pData) +
4341
                                   iBandBufOffset,
×
4342
                               nBandDataSize);
4343
                }
4344
                else
4345
                {
4346
                    /* type to type conversion ... ouch, this is expensive way
4347
                       of handling single words */
4348

4349
                    if (eRWFlag == GF_Read)
×
4350
                        GDALCopyWords64(pabySrcBlock + iSrcOffset, eDataType, 0,
×
4351
                                        static_cast<GByte *>(pData) +
4352
                                            iBandBufOffset,
×
4353
                                        eBufType, 0, 1);
4354
                    else
4355
                        GDALCopyWords64(static_cast<const GByte *>(pData) +
×
4356
                                            iBandBufOffset,
×
4357
                                        eBufType, 0, pabySrcBlock + iSrcOffset,
×
4358
                                        eDataType, 0, 1);
4359
                }
4360
            }
4361

4362
            iBufOffset += static_cast<int>(nPixelSpace);
270✔
4363
        }
4364
    }
4365

4366
    /* -------------------------------------------------------------------- */
4367
    /*      CleanupAndReturn.                                               */
4368
    /* -------------------------------------------------------------------- */
4369
CleanupAndReturn:
4✔
4370
    CPLFree(papabySrcBlock);
4✔
4371
    if (papoBlocks != nullptr)
4✔
4372
    {
4373
        for (int iBand = 0; iBand < nBandCount; iBand++)
12✔
4374
        {
4375
            if (papoBlocks[iBand] != nullptr)
8✔
4376
                papoBlocks[iBand]->DropLock();
8✔
4377
        }
4378
        CPLFree(papoBlocks);
4✔
4379
    }
4380

4381
    return eErr;
4✔
4382
}
4383

4384
//! @endcond
4385

4386
/************************************************************************/
4387
/*                  GDALCopyWholeRasterGetSwathSize()                   */
4388
/************************************************************************/
4389

4390
static void GDALCopyWholeRasterGetSwathSize(GDALRasterBand *poSrcPrototypeBand,
2,863✔
4391
                                            GDALRasterBand *poDstPrototypeBand,
4392
                                            int nBandCount,
4393
                                            int bDstIsCompressed,
4394
                                            int bInterleave, int *pnSwathCols,
4395
                                            int *pnSwathLines)
4396
{
4397
    GDALDataType eDT = poDstPrototypeBand->GetRasterDataType();
2,863✔
4398
    int nSrcBlockXSize = 0;
2,863✔
4399
    int nSrcBlockYSize = 0;
2,863✔
4400
    int nBlockXSize = 0;
2,863✔
4401
    int nBlockYSize = 0;
2,863✔
4402

4403
    int nXSize = poSrcPrototypeBand->GetXSize();
2,863✔
4404
    int nYSize = poSrcPrototypeBand->GetYSize();
2,863✔
4405

4406
    poSrcPrototypeBand->GetBlockSize(&nSrcBlockXSize, &nSrcBlockYSize);
2,863✔
4407
    poDstPrototypeBand->GetBlockSize(&nBlockXSize, &nBlockYSize);
2,863✔
4408

4409
    const int nMaxBlockXSize = std::max(nBlockXSize, nSrcBlockXSize);
2,863✔
4410
    const int nMaxBlockYSize = std::max(nBlockYSize, nSrcBlockYSize);
2,863✔
4411

4412
    int nPixelSize = GDALGetDataTypeSizeBytes(eDT);
2,863✔
4413
    if (bInterleave)
2,863✔
4414
        nPixelSize *= nBandCount;
1,357✔
4415

4416
    // aim for one row of blocks.  Do not settle for less.
4417
    int nSwathCols = nXSize;
2,863✔
4418
    int nSwathLines = nMaxBlockYSize;
2,863✔
4419

4420
    const char *pszSrcCompression =
4421
        poSrcPrototypeBand->GetMetadataItem("COMPRESSION", "IMAGE_STRUCTURE");
2,863✔
4422
    if (pszSrcCompression == nullptr)
2,863✔
4423
    {
4424
        auto poSrcDS = poSrcPrototypeBand->GetDataset();
2,837✔
4425
        if (poSrcDS)
2,837✔
4426
            pszSrcCompression =
4427
                poSrcDS->GetMetadataItem("COMPRESSION", "IMAGE_STRUCTURE");
2,831✔
4428
    }
4429

4430
    /* -------------------------------------------------------------------- */
4431
    /*      What will our swath size be?                                    */
4432
    /* -------------------------------------------------------------------- */
4433
    // When writing interleaved data in a compressed format, we want to be sure
4434
    // that each block will only be written once, so the swath size must not be
4435
    // greater than the block cache.
4436
    const char *pszSwathSize = CPLGetConfigOption("GDAL_SWATH_SIZE", nullptr);
2,863✔
4437
    int nTargetSwathSize;
4438
    if (pszSwathSize != nullptr)
2,863✔
4439
        nTargetSwathSize = static_cast<int>(
×
4440
            std::min(GIntBig(INT_MAX), CPLAtoGIntBig(pszSwathSize)));
×
4441
    else
4442
    {
4443
        // As a default, take one 1/4 of the cache size.
4444
        nTargetSwathSize = static_cast<int>(
2,863✔
4445
            std::min(GIntBig(INT_MAX), GDALGetCacheMax64() / 4));
2,863✔
4446

4447
        // but if the minimum idal swath buf size is less, then go for it to
4448
        // avoid unnecessarily abusing RAM usage.
4449
        // but try to use 10 MB at least.
4450
        GIntBig nIdealSwathBufSize =
2,863✔
4451
            static_cast<GIntBig>(nSwathCols) * nSwathLines * nPixelSize;
2,863✔
4452
        int nMinTargetSwathSize = 10 * 1000 * 1000;
2,863✔
4453

4454
        if ((poSrcPrototypeBand->GetSuggestedBlockAccessPattern() &
2,863✔
4455
             GSBAP_LARGEST_CHUNK_POSSIBLE) != 0)
2,863✔
4456
        {
4457
            nMinTargetSwathSize = nTargetSwathSize;
2✔
4458
        }
4459

4460
        if (nIdealSwathBufSize < nTargetSwathSize &&
2,863✔
4461
            nIdealSwathBufSize < nMinTargetSwathSize)
2,853✔
4462
        {
4463
            nIdealSwathBufSize = nMinTargetSwathSize;
2,850✔
4464
        }
4465

4466
        if (pszSrcCompression != nullptr &&
2,863✔
4467
            EQUAL(pszSrcCompression, "JPEG2000") &&
157✔
4468
            (!bDstIsCompressed || ((nSrcBlockXSize % nBlockXSize) == 0 &&
×
4469
                                   (nSrcBlockYSize % nBlockYSize) == 0)))
×
4470
        {
4471
            nIdealSwathBufSize =
2✔
4472
                std::max(nIdealSwathBufSize, static_cast<GIntBig>(nSwathCols) *
4✔
4473
                                                 nSrcBlockYSize * nPixelSize);
2✔
4474
        }
4475
        if (nTargetSwathSize > nIdealSwathBufSize)
2,863✔
4476
            nTargetSwathSize = static_cast<int>(
2,849✔
4477
                std::min(GIntBig(INT_MAX), nIdealSwathBufSize));
2,849✔
4478
    }
4479

4480
    if (nTargetSwathSize < 1000000)
2,863✔
4481
        nTargetSwathSize = 1000000;
8✔
4482

4483
    /* But let's check that  */
4484
    if (bDstIsCompressed && bInterleave &&
3,071✔
4485
        nTargetSwathSize > GDALGetCacheMax64())
208✔
4486
    {
4487
        CPLError(CE_Warning, CPLE_AppDefined,
×
4488
                 "When translating into a compressed interleave format, "
4489
                 "the block cache size (" CPL_FRMT_GIB ") "
4490
                 "should be at least the size of the swath (%d) "
4491
                 "(GDAL_SWATH_SIZE config. option)",
4492
                 GDALGetCacheMax64(), nTargetSwathSize);
4493
    }
4494

4495
#define IS_DIVIDER_OF(x, y) ((y) % (x) == 0)
4496
#define ROUND_TO(x, y) (((x) / (y)) * (y))
4497

4498
    // if both input and output datasets are tiled, that the tile dimensions
4499
    // are "compatible", try to stick  to a swath dimension that is a multiple
4500
    // of input and output block dimensions.
4501
    if (nBlockXSize != nXSize && nSrcBlockXSize != nXSize &&
2,863✔
4502
        IS_DIVIDER_OF(nBlockXSize, nMaxBlockXSize) &&
33✔
4503
        IS_DIVIDER_OF(nSrcBlockXSize, nMaxBlockXSize) &&
33✔
4504
        IS_DIVIDER_OF(nBlockYSize, nMaxBlockYSize) &&
33✔
4505
        IS_DIVIDER_OF(nSrcBlockYSize, nMaxBlockYSize))
33✔
4506
    {
4507
        if (static_cast<GIntBig>(nMaxBlockXSize) * nMaxBlockYSize *
33✔
4508
                nPixelSize <=
33✔
4509
            static_cast<GIntBig>(nTargetSwathSize))
33✔
4510
        {
4511
            nSwathCols = nTargetSwathSize / (nMaxBlockYSize * nPixelSize);
33✔
4512
            nSwathCols = ROUND_TO(nSwathCols, nMaxBlockXSize);
33✔
4513
            if (nSwathCols == 0)
33✔
4514
                nSwathCols = nMaxBlockXSize;
×
4515
            if (nSwathCols > nXSize)
33✔
4516
                nSwathCols = nXSize;
31✔
4517
            nSwathLines = nMaxBlockYSize;
33✔
4518

4519
            if (static_cast<GIntBig>(nSwathCols) * nSwathLines * nPixelSize >
33✔
4520
                static_cast<GIntBig>(nTargetSwathSize))
33✔
4521
            {
4522
                nSwathCols = nXSize;
×
4523
                nSwathLines = nBlockYSize;
×
4524
            }
4525
        }
4526
    }
4527

4528
    const GIntBig nMemoryPerCol = static_cast<GIntBig>(nSwathCols) * nPixelSize;
2,863✔
4529
    const GIntBig nSwathBufSize = nMemoryPerCol * nSwathLines;
2,863✔
4530
    if (nSwathBufSize > static_cast<GIntBig>(nTargetSwathSize))
2,863✔
4531
    {
4532
        nSwathLines = static_cast<int>(nTargetSwathSize / nMemoryPerCol);
1✔
4533
        if (nSwathLines == 0)
1✔
4534
            nSwathLines = 1;
1✔
4535

4536
        CPLDebug(
1✔
4537
            "GDAL",
4538
            "GDALCopyWholeRasterGetSwathSize(): adjusting to %d line swath "
4539
            "since requirement (" CPL_FRMT_GIB " bytes) exceed target swath "
4540
            "size (%d bytes) (GDAL_SWATH_SIZE config. option)",
4541
            nSwathLines, nBlockYSize * nMemoryPerCol, nTargetSwathSize);
1✔
4542
    }
4543
    // If we are processing single scans, try to handle several at once.
4544
    // If we are handling swaths already, only grow the swath if a row
4545
    // of blocks is substantially less than our target buffer size.
4546
    else if (nSwathLines == 1 ||
2,862✔
4547
             nMemoryPerCol * nSwathLines <
2,401✔
4548
                 static_cast<GIntBig>(nTargetSwathSize) / 10)
2,401✔
4549
    {
4550
        nSwathLines = std::min(
2,835✔
4551
            nYSize,
4552
            std::max(1, static_cast<int>(nTargetSwathSize / nMemoryPerCol)));
2,835✔
4553

4554
        /* If possible try to align to source and target block height */
4555
        if ((nSwathLines % nMaxBlockYSize) != 0 &&
2,835✔
4556
            nSwathLines > nMaxBlockYSize &&
987✔
4557
            IS_DIVIDER_OF(nBlockYSize, nMaxBlockYSize) &&
987✔
4558
            IS_DIVIDER_OF(nSrcBlockYSize, nMaxBlockYSize))
959✔
4559
            nSwathLines = ROUND_TO(nSwathLines, nMaxBlockYSize);
177✔
4560
    }
4561

4562
    if (pszSrcCompression != nullptr && EQUAL(pszSrcCompression, "JPEG2000") &&
2,863✔
4563
        (!bDstIsCompressed || (IS_DIVIDER_OF(nBlockXSize, nSrcBlockXSize) &&
×
4564
                               IS_DIVIDER_OF(nBlockYSize, nSrcBlockYSize))))
×
4565
    {
4566
        // Typical use case: converting from Pleaiades that is 2048x2048 tiled.
4567
        if (nSwathLines < nSrcBlockYSize)
2✔
4568
        {
4569
            nSwathLines = nSrcBlockYSize;
×
4570

4571
            // Number of pixels that can be read/write simultaneously.
4572
            nSwathCols = nTargetSwathSize / (nSrcBlockXSize * nPixelSize);
×
4573
            nSwathCols = ROUND_TO(nSwathCols, nSrcBlockXSize);
×
4574
            if (nSwathCols == 0)
×
4575
                nSwathCols = nSrcBlockXSize;
×
4576
            if (nSwathCols > nXSize)
×
4577
                nSwathCols = nXSize;
×
4578

4579
            CPLDebug(
×
4580
                "GDAL",
4581
                "GDALCopyWholeRasterGetSwathSize(): because of compression and "
4582
                "too high block, "
4583
                "use partial width at one time");
4584
        }
4585
        else if ((nSwathLines % nSrcBlockYSize) != 0)
2✔
4586
        {
4587
            /* Round on a multiple of nSrcBlockYSize */
4588
            nSwathLines = ROUND_TO(nSwathLines, nSrcBlockYSize);
×
4589
            CPLDebug(
×
4590
                "GDAL",
4591
                "GDALCopyWholeRasterGetSwathSize(): because of compression, "
4592
                "round nSwathLines to block height : %d",
4593
                nSwathLines);
4594
        }
4595
    }
4596
    else if (bDstIsCompressed)
2,861✔
4597
    {
4598
        if (nSwathLines < nBlockYSize)
374✔
4599
        {
4600
            nSwathLines = nBlockYSize;
142✔
4601

4602
            // Number of pixels that can be read/write simultaneously.
4603
            nSwathCols = nTargetSwathSize / (nSwathLines * nPixelSize);
142✔
4604
            nSwathCols = ROUND_TO(nSwathCols, nBlockXSize);
142✔
4605
            if (nSwathCols == 0)
142✔
4606
                nSwathCols = nBlockXSize;
×
4607
            if (nSwathCols > nXSize)
142✔
4608
                nSwathCols = nXSize;
142✔
4609

4610
            CPLDebug(
142✔
4611
                "GDAL",
4612
                "GDALCopyWholeRasterGetSwathSize(): because of compression and "
4613
                "too high block, "
4614
                "use partial width at one time");
4615
        }
4616
        else if ((nSwathLines % nBlockYSize) != 0)
232✔
4617
        {
4618
            // Round on a multiple of nBlockYSize.
4619
            nSwathLines = ROUND_TO(nSwathLines, nBlockYSize);
9✔
4620
            CPLDebug(
9✔
4621
                "GDAL",
4622
                "GDALCopyWholeRasterGetSwathSize(): because of compression, "
4623
                "round nSwathLines to block height : %d",
4624
                nSwathLines);
4625
        }
4626
    }
4627

4628
    *pnSwathCols = nSwathCols;
2,863✔
4629
    *pnSwathLines = nSwathLines;
2,863✔
4630
}
2,863✔
4631

4632
/************************************************************************/
4633
/*                     GDALDatasetCopyWholeRaster()                     */
4634
/************************************************************************/
4635

4636
/**
4637
 * \brief Copy all dataset raster data.
4638
 *
4639
 * This function copies the complete raster contents of one dataset to
4640
 * another similarly configured dataset.  The source and destination
4641
 * dataset must have the same number of bands, and the same width
4642
 * and height.  The bands do not have to have the same data type.
4643
 *
4644
 * This function is primarily intended to support implementation of
4645
 * driver specific CreateCopy() functions.  It implements efficient copying,
4646
 * in particular "chunking" the copy in substantial blocks and, if appropriate,
4647
 * performing the transfer in a pixel interleaved fashion.
4648
 *
4649
 * Currently the only papszOptions value supported are :
4650
 * <ul>
4651
 * <li>"INTERLEAVE=PIXEL/BAND" to force pixel (resp. band) interleaved read and
4652
 * write access pattern (this does not modify the layout of the destination
4653
 * data)</li> <li>"COMPRESSED=YES" to force alignment on target dataset block
4654
 * sizes to achieve best compression.</li> <li>"SKIP_HOLES=YES" to skip chunks
4655
 * for which GDALGetDataCoverageStatus() returns GDAL_DATA_COVERAGE_STATUS_EMPTY
4656
 * (GDAL &gt;= 2.2)</li>
4657
 * </ul>
4658
 * More options may be supported in the future.
4659
 *
4660
 * @param hSrcDS the source dataset
4661
 * @param hDstDS the destination dataset
4662
 * @param papszOptions transfer hints in "StringList" Name=Value format.
4663
 * @param pfnProgress progress reporting function.
4664
 * @param pProgressData callback data for progress function.
4665
 *
4666
 * @return CE_None on success, or CE_Failure on failure.
4667
 */
4668

4669
CPLErr CPL_STDCALL GDALDatasetCopyWholeRaster(GDALDatasetH hSrcDS,
2,830✔
4670
                                              GDALDatasetH hDstDS,
4671
                                              CSLConstList papszOptions,
4672
                                              GDALProgressFunc pfnProgress,
4673
                                              void *pProgressData)
4674

4675
{
4676
    VALIDATE_POINTER1(hSrcDS, "GDALDatasetCopyWholeRaster", CE_Failure);
2,830✔
4677
    VALIDATE_POINTER1(hDstDS, "GDALDatasetCopyWholeRaster", CE_Failure);
2,830✔
4678

4679
    GDALDataset *poSrcDS = GDALDataset::FromHandle(hSrcDS);
2,830✔
4680
    GDALDataset *poDstDS = GDALDataset::FromHandle(hDstDS);
2,830✔
4681

4682
    if (pfnProgress == nullptr)
2,830✔
4683
        pfnProgress = GDALDummyProgress;
3✔
4684

4685
    /* -------------------------------------------------------------------- */
4686
    /*      Confirm the datasets match in size and band counts.             */
4687
    /* -------------------------------------------------------------------- */
4688
    const int nXSize = poDstDS->GetRasterXSize();
2,830✔
4689
    const int nYSize = poDstDS->GetRasterYSize();
2,830✔
4690
    const int nBandCount = poDstDS->GetRasterCount();
2,830✔
4691

4692
    if (poSrcDS->GetRasterXSize() != nXSize ||
2,830✔
4693
        poSrcDS->GetRasterYSize() != nYSize ||
5,660✔
4694
        poSrcDS->GetRasterCount() != nBandCount)
2,830✔
4695
    {
4696
        CPLError(CE_Failure, CPLE_AppDefined,
×
4697
                 "Input and output dataset sizes or band counts do not\n"
4698
                 "match in GDALDatasetCopyWholeRaster()");
4699
        return CE_Failure;
×
4700
    }
4701

4702
    /* -------------------------------------------------------------------- */
4703
    /*      Report preliminary (0) progress.                                */
4704
    /* -------------------------------------------------------------------- */
4705
    if (!pfnProgress(0.0, nullptr, pProgressData))
2,830✔
4706
    {
4707
        CPLError(CE_Failure, CPLE_UserInterrupt,
1✔
4708
                 "User terminated CreateCopy()");
4709
        return CE_Failure;
1✔
4710
    }
4711

4712
    /* -------------------------------------------------------------------- */
4713
    /*      Get our prototype band, and assume the others are similarly     */
4714
    /*      configured.                                                     */
4715
    /* -------------------------------------------------------------------- */
4716
    if (nBandCount == 0)
2,829✔
4717
        return CE_None;
×
4718

4719
    GDALRasterBand *poSrcPrototypeBand = poSrcDS->GetRasterBand(1);
2,829✔
4720
    GDALRasterBand *poDstPrototypeBand = poDstDS->GetRasterBand(1);
2,829✔
4721
    GDALDataType eDT = poDstPrototypeBand->GetRasterDataType();
2,829✔
4722

4723
    /* -------------------------------------------------------------------- */
4724
    /*      Do we want to try and do the operation in a pixel               */
4725
    /*      interleaved fashion?                                            */
4726
    /* -------------------------------------------------------------------- */
4727
    bool bInterleave = false;
2,829✔
4728
    const char *pszInterleave =
4729
        poSrcDS->GetMetadataItem("INTERLEAVE", "IMAGE_STRUCTURE");
2,829✔
4730
    if (pszInterleave != nullptr &&
2,829✔
4731
        (EQUAL(pszInterleave, "PIXEL") || EQUAL(pszInterleave, "LINE")))
1,138✔
4732
        bInterleave = true;
175✔
4733

4734
    pszInterleave = poDstDS->GetMetadataItem("INTERLEAVE", "IMAGE_STRUCTURE");
2,829✔
4735
    if (pszInterleave != nullptr &&
2,829✔
4736
        (EQUAL(pszInterleave, "PIXEL") || EQUAL(pszInterleave, "LINE")))
2,074✔
4737
        bInterleave = true;
1,310✔
4738

4739
    pszInterleave = CSLFetchNameValue(papszOptions, "INTERLEAVE");
2,829✔
4740
    if (pszInterleave != nullptr && EQUAL(pszInterleave, "PIXEL"))
2,829✔
4741
        bInterleave = true;
5✔
4742
    else if (pszInterleave != nullptr && EQUAL(pszInterleave, "BAND"))
2,824✔
4743
        bInterleave = false;
7✔
4744
    // attributes is specific to the TileDB driver
4745
    else if (pszInterleave != nullptr && EQUAL(pszInterleave, "ATTRIBUTES"))
2,817✔
4746
        bInterleave = true;
4✔
4747
    else if (pszInterleave != nullptr)
2,813✔
4748
    {
4749
        CPLError(CE_Warning, CPLE_NotSupported,
×
4750
                 "Unsupported value for option INTERLEAVE");
4751
    }
4752

4753
    // If the destination is compressed, we must try to write blocks just once,
4754
    // to save disk space (GTiff case for example), and to avoid data loss
4755
    // (JPEG compression for example).
4756
    bool bDstIsCompressed = false;
2,829✔
4757
    const char *pszDstCompressed =
4758
        CSLFetchNameValue(papszOptions, "COMPRESSED");
2,829✔
4759
    if (pszDstCompressed != nullptr && CPLTestBool(pszDstCompressed))
2,829✔
4760
        bDstIsCompressed = true;
352✔
4761

4762
    /* -------------------------------------------------------------------- */
4763
    /*      What will our swath size be?                                    */
4764
    /* -------------------------------------------------------------------- */
4765

4766
    int nSwathCols = 0;
2,829✔
4767
    int nSwathLines = 0;
2,829✔
4768
    GDALCopyWholeRasterGetSwathSize(poSrcPrototypeBand, poDstPrototypeBand,
2,829✔
4769
                                    nBandCount, bDstIsCompressed, bInterleave,
4770
                                    &nSwathCols, &nSwathLines);
4771

4772
    int nPixelSize = GDALGetDataTypeSizeBytes(eDT);
2,829✔
4773
    if (bInterleave)
2,829✔
4774
        nPixelSize *= nBandCount;
1,357✔
4775

4776
    void *pSwathBuf = VSI_MALLOC3_VERBOSE(nSwathCols, nSwathLines, nPixelSize);
2,829✔
4777
    if (pSwathBuf == nullptr)
2,829✔
4778
    {
4779
        return CE_Failure;
×
4780
    }
4781

4782
    CPLDebug("GDAL",
2,829✔
4783
             "GDALDatasetCopyWholeRaster(): %d*%d swaths, bInterleave=%d",
4784
             nSwathCols, nSwathLines, static_cast<int>(bInterleave));
4785

4786
    // Advise the source raster that we are going to read it completely
4787
    // Note: this might already have been done by GDALCreateCopy() in the
4788
    // likely case this function is indirectly called by it
4789
    poSrcDS->AdviseRead(0, 0, nXSize, nYSize, nXSize, nYSize, eDT, nBandCount,
2,829✔
4790
                        nullptr, nullptr);
2,829✔
4791

4792
    /* ==================================================================== */
4793
    /*      Band oriented (uninterleaved) case.                             */
4794
    /* ==================================================================== */
4795
    CPLErr eErr = CE_None;
2,829✔
4796
    const bool bCheckHoles =
4797
        CPLTestBool(CSLFetchNameValueDef(papszOptions, "SKIP_HOLES", "NO"));
2,829✔
4798

4799
    if (!bInterleave)
2,829✔
4800
    {
4801
        GDALRasterIOExtraArg sExtraArg;
4802
        INIT_RASTERIO_EXTRA_ARG(sExtraArg);
1,472✔
4803
        CPL_IGNORE_RET_VAL(sExtraArg.pfnProgress);  // to make cppcheck happy
1,472✔
4804

4805
        const GIntBig nTotalBlocks = static_cast<GIntBig>(nBandCount) *
4,416✔
4806
                                     DIV_ROUND_UP(nYSize, nSwathLines) *
1,472✔
4807
                                     DIV_ROUND_UP(nXSize, nSwathCols);
1,472✔
4808
        GIntBig nBlocksDone = 0;
1,472✔
4809

4810
        for (int iBand = 0; iBand < nBandCount && eErr == CE_None; iBand++)
3,576✔
4811
        {
4812
            int nBand = iBand + 1;
2,104✔
4813

4814
            for (int iY = 0; iY < nYSize && eErr == CE_None; iY += nSwathLines)
4,360✔
4815
            {
4816
                int nThisLines = nSwathLines;
2,256✔
4817

4818
                if (iY + nThisLines > nYSize)
2,256✔
4819
                    nThisLines = nYSize - iY;
268✔
4820

4821
                for (int iX = 0; iX < nXSize && eErr == CE_None;
4,512✔
4822
                     iX += nSwathCols)
2,256✔
4823
                {
4824
                    int nThisCols = nSwathCols;
2,256✔
4825

4826
                    if (iX + nThisCols > nXSize)
2,256✔
4827
                        nThisCols = nXSize - iX;
×
4828

4829
                    int nStatus = GDAL_DATA_COVERAGE_STATUS_DATA;
2,256✔
4830
                    if (bCheckHoles)
2,256✔
4831
                    {
4832
                        nStatus = poSrcDS->GetRasterBand(nBand)
4833
                                      ->GetDataCoverageStatus(
950✔
4834
                                          iX, iY, nThisCols, nThisLines,
4835
                                          GDAL_DATA_COVERAGE_STATUS_DATA);
4836
                    }
4837
                    if (nStatus & GDAL_DATA_COVERAGE_STATUS_DATA)
2,256✔
4838
                    {
4839
                        sExtraArg.pfnProgress = GDALScaledProgress;
2,252✔
4840
                        sExtraArg.pProgressData = GDALCreateScaledProgress(
4,504✔
4841
                            nBlocksDone / static_cast<double>(nTotalBlocks),
2,252✔
4842
                            (nBlocksDone + 0.5) /
2,252✔
4843
                                static_cast<double>(nTotalBlocks),
2,252✔
4844
                            pfnProgress, pProgressData);
4845
                        if (sExtraArg.pProgressData == nullptr)
2,252✔
4846
                            sExtraArg.pfnProgress = nullptr;
1,298✔
4847

4848
                        eErr = poSrcDS->RasterIO(GF_Read, iX, iY, nThisCols,
2,252✔
4849
                                                 nThisLines, pSwathBuf,
4850
                                                 nThisCols, nThisLines, eDT, 1,
4851
                                                 &nBand, 0, 0, 0, &sExtraArg);
4852

4853
                        GDALDestroyScaledProgress(sExtraArg.pProgressData);
2,252✔
4854

4855
                        if (eErr == CE_None)
2,252✔
4856
                            eErr = poDstDS->RasterIO(
2,248✔
4857
                                GF_Write, iX, iY, nThisCols, nThisLines,
4858
                                pSwathBuf, nThisCols, nThisLines, eDT, 1,
4859
                                &nBand, 0, 0, 0, nullptr);
4860
                    }
4861

4862
                    nBlocksDone++;
2,256✔
4863
                    if (eErr == CE_None &&
4,473✔
4864
                        !pfnProgress(nBlocksDone /
2,217✔
4865
                                         static_cast<double>(nTotalBlocks),
2,217✔
4866
                                     nullptr, pProgressData))
4867
                    {
4868
                        eErr = CE_Failure;
2✔
4869
                        CPLError(CE_Failure, CPLE_UserInterrupt,
2✔
4870
                                 "User terminated CreateCopy()");
4871
                    }
4872
                }
4873
            }
4874
        }
4875
    }
4876

4877
    /* ==================================================================== */
4878
    /*      Pixel interleaved case.                                         */
4879
    /* ==================================================================== */
4880
    else /* if( bInterleave ) */
4881
    {
4882
        GDALRasterIOExtraArg sExtraArg;
4883
        INIT_RASTERIO_EXTRA_ARG(sExtraArg);
1,357✔
4884
        CPL_IGNORE_RET_VAL(sExtraArg.pfnProgress);  // to make cppcheck happy
1,357✔
4885

4886
        const GIntBig nTotalBlocks =
1,357✔
4887
            static_cast<GIntBig>(DIV_ROUND_UP(nYSize, nSwathLines)) *
1,357✔
4888
            DIV_ROUND_UP(nXSize, nSwathCols);
1,357✔
4889
        GIntBig nBlocksDone = 0;
1,357✔
4890

4891
        for (int iY = 0; iY < nYSize && eErr == CE_None; iY += nSwathLines)
2,945✔
4892
        {
4893
            int nThisLines = nSwathLines;
1,588✔
4894

4895
            if (iY + nThisLines > nYSize)
1,588✔
4896
                nThisLines = nYSize - iY;
209✔
4897

4898
            for (int iX = 0; iX < nXSize && eErr == CE_None; iX += nSwathCols)
3,181✔
4899
            {
4900
                int nThisCols = nSwathCols;
1,593✔
4901

4902
                if (iX + nThisCols > nXSize)
1,593✔
4903
                    nThisCols = nXSize - iX;
3✔
4904

4905
                int nStatus = GDAL_DATA_COVERAGE_STATUS_DATA;
1,593✔
4906
                if (bCheckHoles)
1,593✔
4907
                {
4908
                    nStatus = 0;
1,362✔
4909
                    for (int iBand = 0; iBand < nBandCount; iBand++)
1,415✔
4910
                    {
4911
                        nStatus |= poSrcDS->GetRasterBand(iBand + 1)
1,396✔
4912
                                       ->GetDataCoverageStatus(
1,396✔
4913
                                           iX, iY, nThisCols, nThisLines,
4914
                                           GDAL_DATA_COVERAGE_STATUS_DATA);
4915
                        if (nStatus & GDAL_DATA_COVERAGE_STATUS_DATA)
1,396✔
4916
                            break;
1,343✔
4917
                    }
4918
                }
4919
                if (nStatus & GDAL_DATA_COVERAGE_STATUS_DATA)
1,593✔
4920
                {
4921
                    sExtraArg.pfnProgress = GDALScaledProgress;
1,574✔
4922
                    sExtraArg.pProgressData = GDALCreateScaledProgress(
3,148✔
4923
                        nBlocksDone / static_cast<double>(nTotalBlocks),
1,574✔
4924
                        (nBlocksDone + 0.5) / static_cast<double>(nTotalBlocks),
1,574✔
4925
                        pfnProgress, pProgressData);
4926
                    if (sExtraArg.pProgressData == nullptr)
1,574✔
4927
                        sExtraArg.pfnProgress = nullptr;
344✔
4928

4929
                    eErr = poSrcDS->RasterIO(GF_Read, iX, iY, nThisCols,
1,574✔
4930
                                             nThisLines, pSwathBuf, nThisCols,
4931
                                             nThisLines, eDT, nBandCount,
4932
                                             nullptr, 0, 0, 0, &sExtraArg);
4933

4934
                    GDALDestroyScaledProgress(sExtraArg.pProgressData);
1,574✔
4935

4936
                    if (eErr == CE_None)
1,574✔
4937
                        eErr = poDstDS->RasterIO(
1,573✔
4938
                            GF_Write, iX, iY, nThisCols, nThisLines, pSwathBuf,
4939
                            nThisCols, nThisLines, eDT, nBandCount, nullptr, 0,
4940
                            0, 0, nullptr);
4941
                }
4942

4943
                nBlocksDone++;
1,593✔
4944
                if (eErr == CE_None &&
3,182✔
4945
                    !pfnProgress(nBlocksDone /
1,589✔
4946
                                     static_cast<double>(nTotalBlocks),
1,589✔
4947
                                 nullptr, pProgressData))
4948
                {
4949
                    eErr = CE_Failure;
1✔
4950
                    CPLError(CE_Failure, CPLE_UserInterrupt,
1✔
4951
                             "User terminated CreateCopy()");
4952
                }
4953
            }
4954
        }
4955
    }
4956

4957
    /* -------------------------------------------------------------------- */
4958
    /*      Cleanup                                                         */
4959
    /* -------------------------------------------------------------------- */
4960
    CPLFree(pSwathBuf);
2,829✔
4961

4962
    return eErr;
2,829✔
4963
}
4964

4965
/************************************************************************/
4966
/*                     GDALRasterBandCopyWholeRaster()                  */
4967
/************************************************************************/
4968

4969
/**
4970
 * \brief Copy a whole raster band
4971
 *
4972
 * This function copies the complete raster contents of one band to
4973
 * another similarly configured band.  The source and destination
4974
 * bands must have the same width and height.  The bands do not have
4975
 * to have the same data type.
4976
 *
4977
 * It implements efficient copying, in particular "chunking" the copy in
4978
 * substantial blocks.
4979
 *
4980
 * Currently the only papszOptions value supported are :
4981
 * <ul>
4982
 * <li>"COMPRESSED=YES" to force alignment on target dataset block sizes to
4983
 * achieve best compression.</li>
4984
 * <li>"SKIP_HOLES=YES" to skip chunks for which GDALGetDataCoverageStatus()
4985
 * returns GDAL_DATA_COVERAGE_STATUS_EMPTY (GDAL &gt;= 2.2)</li>
4986
 * </ul>
4987
 *
4988
 * @param hSrcBand the source band
4989
 * @param hDstBand the destination band
4990
 * @param papszOptions transfer hints in "StringList" Name=Value format.
4991
 * @param pfnProgress progress reporting function.
4992
 * @param pProgressData callback data for progress function.
4993
 *
4994
 * @return CE_None on success, or CE_Failure on failure.
4995
 */
4996

4997
CPLErr CPL_STDCALL GDALRasterBandCopyWholeRaster(
    GDALRasterBandH hSrcBand, GDALRasterBandH hDstBand,
    const char *const *const papszOptions, GDALProgressFunc pfnProgress,
    void *pProgressData)
{
    VALIDATE_POINTER1(hSrcBand, "GDALRasterBandCopyWholeRaster", CE_Failure);
    VALIDATE_POINTER1(hDstBand, "GDALRasterBandCopyWholeRaster", CE_Failure);

    GDALRasterBand *const poSource = GDALRasterBand::FromHandle(hSrcBand);
    GDALRasterBand *const poTarget = GDALRasterBand::FromHandle(hDstBand);

    // A null callback is allowed: substitute the dummy one so that the
    // rest of the function can invoke it unconditionally.
    if (!pfnProgress)
        pfnProgress = GDALDummyProgress;

    /* -------------------------------------------------------------------- */
    /*      Source and destination bands must have the same dimensions.     */
    /* -------------------------------------------------------------------- */
    const int nXSize = poSource->GetXSize();
    const int nYSize = poSource->GetYSize();

    if (!(poTarget->GetXSize() == nXSize && poTarget->GetYSize() == nYSize))
    {
        CPLError(CE_Failure, CPLE_AppDefined,
                 "Input and output band sizes do not\n"
                 "match in GDALRasterBandCopyWholeRaster()");
        return CE_Failure;
    }

    /* -------------------------------------------------------------------- */
    /*      Initial (0) progress report. A false return value from the      */
    /*      callback is treated as a cancellation request.                  */
    /* -------------------------------------------------------------------- */
    if (!pfnProgress(0.0, nullptr, pProgressData))
    {
        CPLError(CE_Failure, CPLE_UserInterrupt,
                 "User terminated CreateCopy()");
        return CE_Failure;
    }

    // Pixels transit through the swath buffer in the destination data type.
    const GDALDataType eBufType = poTarget->GetRasterDataType();

    // When the destination is compressed, blocks should be written only
    // once, to save disk space (GTiff case for example) and to avoid data
    // loss (JPEG compression for example). The flag is forwarded to the
    // swath size computation below.
    const char *const pszCompressed =
        CSLFetchNameValue(papszOptions, "COMPRESSED");
    const bool bDstIsCompressed =
        pszCompressed != nullptr && CPLTestBool(pszCompressed);

    /* -------------------------------------------------------------------- */
    /*      Determine the swath size and allocate the transfer buffer.      */
    /* -------------------------------------------------------------------- */
    int nSwathCols = 0;
    int nSwathLines = 0;
    GDALCopyWholeRasterGetSwathSize(poSource, poTarget, 1, bDstIsCompressed,
                                    FALSE, &nSwathCols, &nSwathLines);

    const int nBytesPerPixel = GDALGetDataTypeSizeBytes(eBufType);

    void *const pSwathBuf =
        VSI_MALLOC3_VERBOSE(nSwathCols, nSwathLines, nBytesPerPixel);
    if (!pSwathBuf)
    {
        return CE_Failure;
    }

    CPLDebug("GDAL", "GDALRasterBandCopyWholeRaster(): %d*%d swaths",
             nSwathCols, nSwathLines);

    // With SKIP_HOLES, swaths reported by the source as containing no data
    // are neither read nor written.
    const bool bSkipHoles =
        CPLTestBool(CSLFetchNameValueDef(papszOptions, "SKIP_HOLES", "NO"));

    // Advise the source raster that we are going to read it completely.
    poSource->AdviseRead(0, 0, nXSize, nYSize, nXSize, nYSize, eBufType,
                         nullptr);

    /* ==================================================================== */
    /*      Copy swath by swath: rows of swaths, then columns within them.  */
    /* ==================================================================== */
    CPLErr eErr = CE_None;

    for (int iY = 0; eErr == CE_None && iY < nYSize; iY += nSwathLines)
    {
        // The last row of swaths may be shorter than a full swath.
        const int nRows = std::min(nSwathLines, nYSize - iY);

        for (int iX = 0; eErr == CE_None && iX < nXSize; iX += nSwathCols)
        {
            // Likewise for the last column of swaths.
            const int nCols = std::min(nSwathCols, nXSize - iX);

            int nCoverage = GDAL_DATA_COVERAGE_STATUS_DATA;
            if (bSkipHoles)
            {
                nCoverage = poSource->GetDataCoverageStatus(
                    iX, iY, nCols, nRows, GDAL_DATA_COVERAGE_STATUS_DATA);
            }

            if (nCoverage & GDAL_DATA_COVERAGE_STATUS_DATA)
            {
                eErr = poSource->RasterIO(GF_Read, iX, iY, nCols, nRows,
                                          pSwathBuf, nCols, nRows, eBufType,
                                          0, 0, nullptr);

                if (eErr == CE_None)
                {
                    eErr = poTarget->RasterIO(GF_Write, iX, iY, nCols, nRows,
                                              pSwathBuf, nCols, nRows,
                                              eBufType, 0, 0, nullptr);
                }
            }

            // The reported ratio only depends on the number of lines
            // reached so far, not on the column position.
            if (eErr == CE_None &&
                !pfnProgress((iY + nRows) / static_cast<float>(nYSize),
                             nullptr, pProgressData))
            {
                eErr = CE_Failure;
                CPLError(CE_Failure, CPLE_UserInterrupt,
                         "User terminated CreateCopy()");
            }
        }
    }

    /* -------------------------------------------------------------------- */
    /*      Cleanup                                                         */
    /* -------------------------------------------------------------------- */
    CPLFree(pSwathBuf);

    return eErr;
}
5129

5130
/************************************************************************/
5131
/*                      GDALCopyRasterIOExtraArg ()                     */
5132
/************************************************************************/
5133

5134
void GDALCopyRasterIOExtraArg(GDALRasterIOExtraArg *psDestArg,
323,368✔
5135
                              GDALRasterIOExtraArg *psSrcArg)
5136
{
5137
    INIT_RASTERIO_EXTRA_ARG(*psDestArg);
323,368✔
5138
    if (psSrcArg)
323,368✔
5139
    {
5140
        psDestArg->eResampleAlg = psSrcArg->eResampleAlg;
323,368✔
5141
        psDestArg->pfnProgress = psSrcArg->pfnProgress;
323,368✔
5142
        psDestArg->pProgressData = psSrcArg->pProgressData;
323,368✔
5143
        psDestArg->bFloatingPointWindowValidity =
323,368✔
5144
            psSrcArg->bFloatingPointWindowValidity;
323,368✔
5145
        if (psSrcArg->bFloatingPointWindowValidity)
323,368✔
5146
        {
5147
            psDestArg->dfXOff = psSrcArg->dfXOff;
3,118✔
5148
            psDestArg->dfYOff = psSrcArg->dfYOff;
3,118✔
5149
            psDestArg->dfXSize = psSrcArg->dfXSize;
3,118✔
5150
            psDestArg->dfYSize = psSrcArg->dfYSize;
3,118✔
5151
        }
5152
    }
5153
}
323,368✔
5154

5155
/************************************************************************/
5156
/*                         HasOnlyNoData()                              */
5157
/************************************************************************/
5158

5159
/** Return whether a sample must be considered equal to the nodata value.
 *
 * Generic version: plain equality.
 */
template <class T> static inline bool IsEqualToNoData(T value, T noDataValue)
{
    return value == noDataValue;
}

/** float specialization: when the nodata value is NaN, any NaN sample
 * matches it (NaN never compares equal with operator==).
 */
template <> bool IsEqualToNoData<float>(float value, float noDataValue)
{
    return std::isnan(noDataValue) ? std::isnan(value) : value == noDataValue;
}

/** double specialization: same NaN handling as the float one. */
template <> bool IsEqualToNoData<double>(double value, double noDataValue)
{
    return std::isnan(noDataValue) ? std::isnan(value) : value == noDataValue;
}

/** Return whether all samples of a window of a buffer are equal to nodata.
 *
 * Sample (iX, iY, iBand) is read at index
 * (iY * nLineStride + iX) * nComponents + iBand of pBuffer.
 *
 * @param pBuffer      first sample of the window.
 * @param noDataValue  nodata value, already expressed in type T.
 * @param nWidth       number of pixels per line to examine.
 * @param nHeight      number of lines to examine.
 * @param nLineStride  distance, in pixels, between the start of two
 *                     consecutive lines.
 * @param nComponents  number of interleaved samples per pixel.
 * @return true if every examined sample matches noDataValue according to
 *         IsEqualToNoData(). An empty window (zero width, height or
 *         component count) yields true.
 */
template <class T>
static bool HasOnlyNoDataT(const T *pBuffer, T noDataValue, size_t nWidth,
                           size_t nHeight, size_t nLineStride,
                           size_t nComponents)
{
    // An empty window has no sample differing from nodata. Returning early
    // is required for correctness: nWidth - 1 / nHeight - 1 below would
    // otherwise wrap around and the probes would read out of bounds.
    if (nWidth == 0 || nHeight == 0 || nComponents == 0)
        return true;

    // Fast test: probe the 4 corners and the middle pixel, which is often
    // enough to reject a window that contains valid data.
    const size_t nLastCol = nWidth - 1;
    const size_t nLastRow = nHeight - 1;
    const size_t anProbePixels[] = {
        0,                                             // top-left
        nLastCol,                                      // top-right
        nLastRow / 2 * nLineStride + nLastCol / 2,     // middle
        nLastRow * nLineStride,                        // bottom-left
        nLastRow * nLineStride + nLastCol,             // bottom-right
    };
    for (size_t iBand = 0; iBand < nComponents; iBand++)
    {
        for (const size_t nPixel : anProbePixels)
        {
            if (!IsEqualToNoData(pBuffer[nPixel * nComponents + iBand],
                                 noDataValue))
            {
                return false;
            }
        }
    }

    // Exhaustive test of all samples, line by line.
    const size_t nSamplesPerLine = nWidth * nComponents;
    for (size_t iY = 0; iY < nHeight; iY++)
    {
        const T *pBufferLine = pBuffer + iY * nLineStride * nComponents;
        for (size_t iX = 0; iX < nSamplesPerLine; iX++)
        {
            if (!IsEqualToNoData(pBufferLine[iX], noDataValue))
            {
                return false;
            }
        }
    }
    return true;
}
5217

5218
/************************************************************************/
5219
/*                    GDALBufferHasOnlyNoData()                         */
5220
/************************************************************************/
5221

5222
bool GDALBufferHasOnlyNoData(const void *pBuffer, double dfNoDataValue,
38,310✔
5223
                             size_t nWidth, size_t nHeight, size_t nLineStride,
5224
                             size_t nComponents, int nBitsPerSample,
5225
                             GDALBufferSampleFormat nSampleFormat)
5226
{
5227
    // In the case where the nodata is 0, we can compare several bytes at
5228
    // once. Select the largest natural integer type for the architecture.
5229
#if SIZEOF_VOIDP >= 8 || defined(__x86_64__)
5230
    // We test __x86_64__ for x32 arch where SIZEOF_VOIDP == 4
5231
    typedef std::uint64_t WordType;
5232
#else
5233
    typedef std::uint32_t WordType;
5234
#endif
5235
    if (dfNoDataValue == 0.0 && nWidth == nLineStride &&
38,310✔
5236
        // Do not use this optimized code path for floating point numbers,
5237
        // as it can't detect negative zero.
5238
        nSampleFormat != GSF_FLOATING_POINT)
5239
    {
5240
        const GByte *pabyBuffer = static_cast<const GByte *>(pBuffer);
26,150✔
5241
        const size_t nSize =
26,150✔
5242
            (nWidth * nHeight * nComponents * nBitsPerSample + 7) / 8;
26,150✔
5243
        size_t i = 0;
26,150✔
5244
        const size_t nInitialIters =
5245
            std::min(sizeof(WordType) -
52,300✔
5246
                         static_cast<size_t>(
26,150✔
5247
                             reinterpret_cast<std::uintptr_t>(pabyBuffer) %
5248
                             sizeof(WordType)),
5249
                     nSize);
26,150✔
5250
        for (; i < nInitialIters; i++)
217,715✔
5251
        {
5252
            if (pabyBuffer[i])
195,563✔
5253
                return false;
3,998✔
5254
        }
5255
        for (; i + sizeof(WordType) - 1 < nSize; i += sizeof(WordType))
16,452,000✔
5256
        {
5257
            if (*(reinterpret_cast<const WordType *>(pabyBuffer + i)))
16,436,900✔
5258
                return false;
7,086✔
5259
        }
5260
        for (; i < nSize; i++)
52,415✔
5261
        {
5262
            if (pabyBuffer[i])
37,354✔
5263
                return false;
5✔
5264
        }
5265
        return true;
15,061✔
5266
    }
5267

5268
    if (nBitsPerSample == 8 && nSampleFormat == GSF_UNSIGNED_INT)
12,160✔
5269
    {
5270
        return GDALIsValueInRange<uint8_t>(dfNoDataValue) &&
22,264✔
5271
               HasOnlyNoDataT(static_cast<const uint8_t *>(pBuffer),
11,132✔
5272
                              static_cast<uint8_t>(dfNoDataValue), nWidth,
11,132✔
5273
                              nHeight, nLineStride, nComponents);
11,132✔
5274
    }
5275
    if (nBitsPerSample == 8 && nSampleFormat == GSF_SIGNED_INT)
1,028✔
5276
    {
5277
        // Use unsigned implementation by converting the nodatavalue to
5278
        // unsigned
5279
        return GDALIsValueInRange<int8_t>(dfNoDataValue) &&
63✔
5280
               HasOnlyNoDataT(
31✔
5281
                   static_cast<const uint8_t *>(pBuffer),
5282
                   static_cast<uint8_t>(static_cast<int8_t>(dfNoDataValue)),
31✔
5283
                   nWidth, nHeight, nLineStride, nComponents);
32✔
5284
    }
5285
    if (nBitsPerSample == 16 && nSampleFormat == GSF_UNSIGNED_INT)
996✔
5286
    {
5287
        return GDALIsValueInRange<uint16_t>(dfNoDataValue) &&
21✔
5288
               HasOnlyNoDataT(static_cast<const uint16_t *>(pBuffer),
10✔
5289
                              static_cast<uint16_t>(dfNoDataValue), nWidth,
10✔
5290
                              nHeight, nLineStride, nComponents);
11✔
5291
    }
5292
    if (nBitsPerSample == 16 && nSampleFormat == GSF_SIGNED_INT)
985✔
5293
    {
5294
        // Use unsigned implementation by converting the nodatavalue to
5295
        // unsigned
5296
        return GDALIsValueInRange<int16_t>(dfNoDataValue) &&
109✔
5297
               HasOnlyNoDataT(
54✔
5298
                   static_cast<const uint16_t *>(pBuffer),
5299
                   static_cast<uint16_t>(static_cast<int16_t>(dfNoDataValue)),
54✔
5300
                   nWidth, nHeight, nLineStride, nComponents);
55✔
5301
    }
5302
    if (nBitsPerSample == 32 && nSampleFormat == GSF_UNSIGNED_INT)
930✔
5303
    {
5304
        return GDALIsValueInRange<uint32_t>(dfNoDataValue) &&
73✔
5305
               HasOnlyNoDataT(static_cast<const uint32_t *>(pBuffer),
36✔
5306
                              static_cast<uint32_t>(dfNoDataValue), nWidth,
5307
                              nHeight, nLineStride, nComponents);
37✔
5308
    }
5309
    if (nBitsPerSample == 32 && nSampleFormat == GSF_SIGNED_INT)
893✔
5310
    {
5311
        // Use unsigned implementation by converting the nodatavalue to
5312
        // unsigned
5313
        return GDALIsValueInRange<int32_t>(dfNoDataValue) &&
19✔
5314
               HasOnlyNoDataT(
9✔
5315
                   static_cast<const uint32_t *>(pBuffer),
5316
                   static_cast<uint32_t>(static_cast<int32_t>(dfNoDataValue)),
9✔
5317
                   nWidth, nHeight, nLineStride, nComponents);
10✔
5318
    }
5319
    if (nBitsPerSample == 64 && nSampleFormat == GSF_UNSIGNED_INT)
883✔
5320
    {
5321
        return GDALIsValueInRange<uint64_t>(dfNoDataValue) &&
56✔
5322
               HasOnlyNoDataT(static_cast<const uint64_t *>(pBuffer),
28✔
5323
                              static_cast<uint64_t>(dfNoDataValue), nWidth,
5324
                              nHeight, nLineStride, nComponents);
28✔
5325
    }
5326
    if (nBitsPerSample == 64 && nSampleFormat == GSF_SIGNED_INT)
855✔
5327
    {
5328
        // Use unsigned implementation by converting the nodatavalue to
5329
        // unsigned
5330
        return GDALIsValueInRange<int64_t>(dfNoDataValue) &&
×
5331
               HasOnlyNoDataT(
×
5332
                   static_cast<const uint64_t *>(pBuffer),
5333
                   static_cast<uint64_t>(static_cast<int64_t>(dfNoDataValue)),
×
5334
                   nWidth, nHeight, nLineStride, nComponents);
×
5335
    }
5336
    if (nBitsPerSample == 32 && nSampleFormat == GSF_FLOATING_POINT)
855✔
5337
    {
5338
        return (std::isnan(dfNoDataValue) ||
689✔
5339
                GDALIsValueInRange<float>(dfNoDataValue)) &&
1,377✔
5340
               HasOnlyNoDataT(static_cast<const float *>(pBuffer),
688✔
5341
                              static_cast<float>(dfNoDataValue), nWidth,
5342
                              nHeight, nLineStride, nComponents);
689✔
5343
    }
5344
    if (nBitsPerSample == 64 && nSampleFormat == GSF_FLOATING_POINT)
166✔
5345
    {
5346
        return HasOnlyNoDataT(static_cast<const double *>(pBuffer),
166✔
5347
                              dfNoDataValue, nWidth, nHeight, nLineStride,
5348
                              nComponents);
166✔
5349
    }
5350
    return false;
×
5351
}
5352

5353
#ifdef HAVE_SSE2
5354

5355
/************************************************************************/
5356
/*                    GDALDeinterleave3Byte()                           */
5357
/************************************************************************/
5358

5359
#if defined(__GNUC__) && !defined(__clang__)
5360
__attribute__((optimize("no-tree-vectorize")))
5361
#endif
5362
static void
5363
GDALDeinterleave3Byte(const GByte *CPL_RESTRICT pabySrc,
71,267✔
5364
                      GByte *CPL_RESTRICT pabyDest0,
5365
                      GByte *CPL_RESTRICT pabyDest1,
5366
                      GByte *CPL_RESTRICT pabyDest2, size_t nIters)
5367
#ifdef USE_NEON_OPTIMIZATIONS
5368
{
5369
    return GDALDeinterleave3Byte_SSSE3(pabySrc, pabyDest0, pabyDest1, pabyDest2,
5370
                                       nIters);
5371
}
5372
#else
5373
{
5374
#ifdef HAVE_SSSE3_AT_COMPILE_TIME
5375
    if (CPLHaveRuntimeSSSE3())
71,267✔
5376
    {
5377
        return GDALDeinterleave3Byte_SSSE3(pabySrc, pabyDest0, pabyDest1,
71,285✔
5378
                                           pabyDest2, nIters);
71,283✔
5379
    }
5380
#endif
5381

5382
    size_t i = 0;
2✔
5383
    if (((reinterpret_cast<uintptr_t>(pabySrc) |
2✔
5384
          reinterpret_cast<uintptr_t>(pabyDest0) |
2✔
5385
          reinterpret_cast<uintptr_t>(pabyDest1) |
2✔
5386
          reinterpret_cast<uintptr_t>(pabyDest2)) %
2✔
5387
         sizeof(unsigned int)) == 0)
5388
    {
5389
        // Slightly better than GCC autovectorizer
5390
        for (size_t j = 0; i + 3 < nIters; i += 4, ++j)
17✔
5391
        {
5392
            unsigned int word0 =
15✔
5393
                *reinterpret_cast<const unsigned int *>(pabySrc + 3 * i);
15✔
5394
            unsigned int word1 =
15✔
5395
                *reinterpret_cast<const unsigned int *>(pabySrc + 3 * i + 4);
15✔
5396
            unsigned int word2 =
15✔
5397
                *reinterpret_cast<const unsigned int *>(pabySrc + 3 * i + 8);
15✔
5398
            reinterpret_cast<unsigned int *>(pabyDest0)[j] =
15✔
5399
                (word0 & 0xff) | ((word0 >> 24) << 8) | (word1 & 0x00ff0000) |
15✔
5400
                ((word2 >> 8) << 24);
15✔
5401
            reinterpret_cast<unsigned int *>(pabyDest1)[j] =
15✔
5402
                ((word0 >> 8) & 0xff) | ((word1 & 0xff) << 8) |
15✔
5403
                (((word1 >> 24)) << 16) | ((word2 >> 16) << 24);
15✔
5404
            pabyDest2[j * 4] = static_cast<GByte>(word0 >> 16);
15✔
5405
            pabyDest2[j * 4 + 1] = static_cast<GByte>(word1 >> 8);
15✔
5406
            pabyDest2[j * 4 + 2] = static_cast<GByte>(word2);
15✔
5407
            pabyDest2[j * 4 + 3] = static_cast<GByte>(word2 >> 24);
15✔
5408
        }
5409
    }
5410
#if defined(__clang__)
5411
#pragma clang loop vectorize(disable)
5412
#endif
5413
    for (; i < nIters; ++i)
3✔
5414
    {
5415
        pabyDest0[i] = pabySrc[3 * i + 0];
1✔
5416
        pabyDest1[i] = pabySrc[3 * i + 1];
1✔
5417
        pabyDest2[i] = pabySrc[3 * i + 2];
1✔
5418
    }
5419
}
5420
#endif
5421

5422
/************************************************************************/
5423
/*                    GDALDeinterleave4Byte()                           */
5424
/************************************************************************/
5425

5426
#if !defined(__GNUC__) || defined(__clang__)
5427

5428
/************************************************************************/
5429
/*                         deinterleave()                               */
5430
/************************************************************************/
5431

5432
template <bool SHIFT, bool MASK>
5433
inline __m128i deinterleave(__m128i &xmm0_ori, __m128i &xmm1_ori,
5434
                            __m128i &xmm2_ori, __m128i &xmm3_ori)
5435
{
5436
    // Set higher 24bit of each int32 packed word to 0
5437
    if (SHIFT)
5438
    {
5439
        xmm0_ori = _mm_srli_epi32(xmm0_ori, 8);
5440
        xmm1_ori = _mm_srli_epi32(xmm1_ori, 8);
5441
        xmm2_ori = _mm_srli_epi32(xmm2_ori, 8);
5442
        xmm3_ori = _mm_srli_epi32(xmm3_ori, 8);
5443
    }
5444
    __m128i xmm0;
5445
    __m128i xmm1;
5446
    __m128i xmm2;
5447
    __m128i xmm3;
5448
    if (MASK)
5449
    {
5450
        const __m128i xmm_mask = _mm_set1_epi32(0xff);
5451
        xmm0 = _mm_and_si128(xmm0_ori, xmm_mask);
5452
        xmm1 = _mm_and_si128(xmm1_ori, xmm_mask);
5453
        xmm2 = _mm_and_si128(xmm2_ori, xmm_mask);
5454
        xmm3 = _mm_and_si128(xmm3_ori, xmm_mask);
5455
    }
5456
    else
5457
    {
5458
        xmm0 = xmm0_ori;
5459
        xmm1 = xmm1_ori;
5460
        xmm2 = xmm2_ori;
5461
        xmm3 = xmm3_ori;
5462
    }
5463
    // Pack int32 to int16
5464
    xmm0 = _mm_packs_epi32(xmm0, xmm1);
5465
    xmm2 = _mm_packs_epi32(xmm2, xmm3);
5466
    // Pack int16 to uint8
5467
    xmm0 = _mm_packus_epi16(xmm0, xmm2);
5468
    return xmm0;
5469
}
5470

5471
static void GDALDeinterleave4Byte(const GByte *CPL_RESTRICT pabySrc,
5472
                                  GByte *CPL_RESTRICT pabyDest0,
5473
                                  GByte *CPL_RESTRICT pabyDest1,
5474
                                  GByte *CPL_RESTRICT pabyDest2,
5475
                                  GByte *CPL_RESTRICT pabyDest3, size_t nIters)
5476
#ifdef USE_NEON_OPTIMIZATIONS
5477
{
5478
    return GDALDeinterleave4Byte_SSSE3(pabySrc, pabyDest0, pabyDest1, pabyDest2,
5479
                                       pabyDest3, nIters);
5480
}
5481
#else
5482
{
5483
#ifdef HAVE_SSSE3_AT_COMPILE_TIME
5484
    if (CPLHaveRuntimeSSSE3())
5485
    {
5486
        return GDALDeinterleave4Byte_SSSE3(pabySrc, pabyDest0, pabyDest1,
5487
                                           pabyDest2, pabyDest3, nIters);
5488
    }
5489
#endif
5490

5491
    // Not the optimal SSE2-only code, as gcc auto-vectorizer manages to
5492
    // do something slightly better.
5493
    size_t i = 0;
5494
    for (; i + 15 < nIters; i += 16)
5495
    {
5496
        __m128i xmm0_ori = _mm_loadu_si128(
5497
            reinterpret_cast<__m128i const *>(pabySrc + 4 * i + 0));
5498
        __m128i xmm1_ori = _mm_loadu_si128(
5499
            reinterpret_cast<__m128i const *>(pabySrc + 4 * i + 16));
5500
        __m128i xmm2_ori = _mm_loadu_si128(
5501
            reinterpret_cast<__m128i const *>(pabySrc + 4 * i + 32));
5502
        __m128i xmm3_ori = _mm_loadu_si128(
5503
            reinterpret_cast<__m128i const *>(pabySrc + 4 * i + 48));
5504

5505
        _mm_storeu_si128(
5506
            reinterpret_cast<__m128i *>(pabyDest0 + i),
5507
            deinterleave<false, true>(xmm0_ori, xmm1_ori, xmm2_ori, xmm3_ori));
5508
        _mm_storeu_si128(
5509
            reinterpret_cast<__m128i *>(pabyDest1 + i),
5510
            deinterleave<true, true>(xmm0_ori, xmm1_ori, xmm2_ori, xmm3_ori));
5511
        _mm_storeu_si128(
5512
            reinterpret_cast<__m128i *>(pabyDest2 + i),
5513
            deinterleave<true, true>(xmm0_ori, xmm1_ori, xmm2_ori, xmm3_ori));
5514
        _mm_storeu_si128(
5515
            reinterpret_cast<__m128i *>(pabyDest3 + i),
5516
            deinterleave<true, false>(xmm0_ori, xmm1_ori, xmm2_ori, xmm3_ori));
5517
    }
5518

5519
#if defined(__clang__)
5520
#pragma clang loop vectorize(disable)
5521
#endif
5522
    for (; i < nIters; ++i)
5523
    {
5524
        pabyDest0[i] = pabySrc[4 * i + 0];
5525
        pabyDest1[i] = pabySrc[4 * i + 1];
5526
        pabyDest2[i] = pabySrc[4 * i + 2];
5527
        pabyDest3[i] = pabySrc[4 * i + 3];
5528
    }
5529
}
5530
#endif
5531
#else
5532
// GCC autovectorizer does an excellent job
5533
__attribute__((optimize("tree-vectorize"))) static void GDALDeinterleave4Byte(
53,129✔
5534
    const GByte *CPL_RESTRICT pabySrc, GByte *CPL_RESTRICT pabyDest0,
5535
    GByte *CPL_RESTRICT pabyDest1, GByte *CPL_RESTRICT pabyDest2,
5536
    GByte *CPL_RESTRICT pabyDest3, size_t nIters)
5537
{
5538
    for (size_t i = 0; i < nIters; ++i)
526,223,000✔
5539
    {
5540
        pabyDest0[i] = pabySrc[4 * i + 0];
526,170,000✔
5541
        pabyDest1[i] = pabySrc[4 * i + 1];
526,170,000✔
5542
        pabyDest2[i] = pabySrc[4 * i + 2];
526,170,000✔
5543
        pabyDest3[i] = pabySrc[4 * i + 3];
526,170,000✔
5544
    }
5545
}
53,129✔
5546
#endif
5547

5548
#else
5549

5550
/************************************************************************/
5551
/*                    GDALDeinterleave3Byte()                           */
5552
/************************************************************************/
5553

5554
// TODO: Enabling below could help on non-Intel architectures where GCC knows
5555
// how to auto-vectorize
5556
// #if defined(__GNUC__)
5557
//__attribute__((optimize("tree-vectorize")))
5558
// #endif
5559
static void GDALDeinterleave3Byte(const GByte *CPL_RESTRICT pabySrc,
5560
                                  GByte *CPL_RESTRICT pabyDest0,
5561
                                  GByte *CPL_RESTRICT pabyDest1,
5562
                                  GByte *CPL_RESTRICT pabyDest2, size_t nIters)
5563
{
5564
    for (size_t i = 0; i < nIters; ++i)
5565
    {
5566
        pabyDest0[i] = pabySrc[3 * i + 0];
5567
        pabyDest1[i] = pabySrc[3 * i + 1];
5568
        pabyDest2[i] = pabySrc[3 * i + 2];
5569
    }
5570
}
5571

5572
/************************************************************************/
5573
/*                    GDALDeinterleave4Byte()                           */
5574
/************************************************************************/
5575

5576
// TODO: Enabling below could help on non-Intel architectures where gcc knows
5577
// how to auto-vectorize
5578
// #if defined(__GNUC__)
5579
//__attribute__((optimize("tree-vectorize")))
5580
// #endif
5581
static void GDALDeinterleave4Byte(const GByte *CPL_RESTRICT pabySrc,
5582
                                  GByte *CPL_RESTRICT pabyDest0,
5583
                                  GByte *CPL_RESTRICT pabyDest1,
5584
                                  GByte *CPL_RESTRICT pabyDest2,
5585
                                  GByte *CPL_RESTRICT pabyDest3, size_t nIters)
5586
{
5587
    for (size_t i = 0; i < nIters; ++i)
5588
    {
5589
        pabyDest0[i] = pabySrc[4 * i + 0];
5590
        pabyDest1[i] = pabySrc[4 * i + 1];
5591
        pabyDest2[i] = pabySrc[4 * i + 2];
5592
        pabyDest3[i] = pabySrc[4 * i + 3];
5593
    }
5594
}
5595

5596
#endif
5597

5598
/************************************************************************/
5599
/*                      GDALDeinterleave()                              */
5600
/************************************************************************/
5601

5602
/*! Copy values from a pixel-interleave buffer to multiple per-component
5603
    buffers.
5604

5605
    In pseudo-code
5606
    \verbatim
5607
    for(size_t i = 0; i < nIters; ++i)
5608
        for(int iComp = 0; iComp < nComponents; iComp++ )
5609
            ppDestBuffer[iComp][i] = pSourceBuffer[nComponents * i + iComp]
5610
    \endverbatim
5611

5612
    The implementation is optimized for a few cases, like de-interleaving
5613
    of 3 or 4-components Byte buffers.
5614

5615
    \since GDAL 3.6
5616
 */
5617
void GDALDeinterleave(const void *pSourceBuffer, GDALDataType eSourceDT,
125,161✔
5618
                      int nComponents, void **ppDestBuffer,
5619
                      GDALDataType eDestDT, size_t nIters)
5620
{
5621
    if (eSourceDT == eDestDT)
125,161✔
5622
    {
5623
        if (eSourceDT == GDT_Byte || eSourceDT == GDT_Int8)
125,137✔
5624
        {
5625
            if (nComponents == 3)
124,404✔
5626
            {
5627
                const GByte *CPL_RESTRICT pabySrc =
71,264✔
5628
                    static_cast<const GByte *>(pSourceBuffer);
5629
                GByte *CPL_RESTRICT pabyDest0 =
71,264✔
5630
                    static_cast<GByte *>(ppDestBuffer[0]);
5631
                GByte *CPL_RESTRICT pabyDest1 =
71,264✔
5632
                    static_cast<GByte *>(ppDestBuffer[1]);
5633
                GByte *CPL_RESTRICT pabyDest2 =
71,264✔
5634
                    static_cast<GByte *>(ppDestBuffer[2]);
5635
                GDALDeinterleave3Byte(pabySrc, pabyDest0, pabyDest1, pabyDest2,
71,264✔
5636
                                      nIters);
5637
                return;
71,283✔
5638
            }
5639
            else if (nComponents == 4)
53,140✔
5640
            {
5641
                const GByte *CPL_RESTRICT pabySrc =
53,129✔
5642
                    static_cast<const GByte *>(pSourceBuffer);
5643
                GByte *CPL_RESTRICT pabyDest0 =
53,129✔
5644
                    static_cast<GByte *>(ppDestBuffer[0]);
5645
                GByte *CPL_RESTRICT pabyDest1 =
53,129✔
5646
                    static_cast<GByte *>(ppDestBuffer[1]);
5647
                GByte *CPL_RESTRICT pabyDest2 =
53,129✔
5648
                    static_cast<GByte *>(ppDestBuffer[2]);
5649
                GByte *CPL_RESTRICT pabyDest3 =
53,129✔
5650
                    static_cast<GByte *>(ppDestBuffer[3]);
5651
                GDALDeinterleave4Byte(pabySrc, pabyDest0, pabyDest1, pabyDest2,
53,129✔
5652
                                      pabyDest3, nIters);
5653
                return;
53,129✔
5654
            }
11✔
5655
        }
5656
#if ((defined(__GNUC__) && !defined(__clang__)) ||                             \
5657
     defined(__INTEL_CLANG_COMPILER)) &&                                       \
5658
    defined(HAVE_SSE2) && defined(HAVE_SSSE3_AT_COMPILE_TIME)
5659
        else if ((eSourceDT == GDT_Int16 || eSourceDT == GDT_UInt16) &&
1,466✔
5660
                 CPLHaveRuntimeSSSE3())
733✔
5661
        {
5662
            if (nComponents == 3)
733✔
5663
            {
5664
                const GUInt16 *CPL_RESTRICT panSrc =
239✔
5665
                    static_cast<const GUInt16 *>(pSourceBuffer);
5666
                GUInt16 *CPL_RESTRICT panDest0 =
239✔
5667
                    static_cast<GUInt16 *>(ppDestBuffer[0]);
5668
                GUInt16 *CPL_RESTRICT panDest1 =
239✔
5669
                    static_cast<GUInt16 *>(ppDestBuffer[1]);
5670
                GUInt16 *CPL_RESTRICT panDest2 =
239✔
5671
                    static_cast<GUInt16 *>(ppDestBuffer[2]);
5672
                GDALDeinterleave3UInt16_SSSE3(panSrc, panDest0, panDest1,
239✔
5673
                                              panDest2, nIters);
5674
                return;
239✔
5675
            }
5676
#if !defined(__INTEL_CLANG_COMPILER)
5677
            // ICC autovectorizer doesn't do a good job, at least with icx
5678
            // 2022.1.0.20220316
5679
            else if (nComponents == 4)
494✔
5680
            {
5681
                const GUInt16 *CPL_RESTRICT panSrc =
494✔
5682
                    static_cast<const GUInt16 *>(pSourceBuffer);
5683
                GUInt16 *CPL_RESTRICT panDest0 =
494✔
5684
                    static_cast<GUInt16 *>(ppDestBuffer[0]);
5685
                GUInt16 *CPL_RESTRICT panDest1 =
494✔
5686
                    static_cast<GUInt16 *>(ppDestBuffer[1]);
5687
                GUInt16 *CPL_RESTRICT panDest2 =
494✔
5688
                    static_cast<GUInt16 *>(ppDestBuffer[2]);
5689
                GUInt16 *CPL_RESTRICT panDest3 =
494✔
5690
                    static_cast<GUInt16 *>(ppDestBuffer[3]);
5691
                GDALDeinterleave4UInt16_SSSE3(panSrc, panDest0, panDest1,
494✔
5692
                                              panDest2, panDest3, nIters);
5693
                return;
494✔
5694
            }
5695
#endif
5696
        }
5697
#endif
5698
    }
5699

5700
    const int nSourceDTSize = GDALGetDataTypeSizeBytes(eSourceDT);
35✔
5701
    const int nDestDTSize = GDALGetDataTypeSizeBytes(eDestDT);
22✔
5702
    for (int iComp = 0; iComp < nComponents; iComp++)
87✔
5703
    {
5704
        GDALCopyWords64(static_cast<const GByte *>(pSourceBuffer) +
65✔
5705
                            iComp * nSourceDTSize,
65✔
5706
                        eSourceDT, nComponents * nSourceDTSize,
5707
                        ppDestBuffer[iComp], eDestDT, nDestDTSize, nIters);
65✔
5708
    }
5709
}
5710

5711
/************************************************************************/
5712
/*                    GDALTranspose2DSingleToSingle()                   */
5713
/************************************************************************/
5714
/**
5715
 * Transpose a 2D array of non-complex values, in an efficient (cache-oblivious) way.
5716
 *
5717
 * @param pSrc Source array of height = nSrcHeight and width = nSrcWidth.
5718
 * @param pDst Destination transposed array of height = nSrcWidth and width = nSrcHeight.
5719
 * @param nSrcWidth Width of pSrc array.
5720
 * @param nSrcHeight Height of pSrc array.
5721
 */
5722

5723
template <class DST, class SRC>
5724
void GDALTranspose2DSingleToSingle(const SRC *CPL_RESTRICT pSrc,
145✔
5725
                                   DST *CPL_RESTRICT pDst, size_t nSrcWidth,
5726
                                   size_t nSrcHeight)
5727
{
5728
    constexpr size_t blocksize = 32;
145✔
5729
    for (size_t i = 0; i < nSrcHeight; i += blocksize)
315✔
5730
    {
5731
        const size_t max_k = std::min(i + blocksize, nSrcHeight);
170✔
5732
        for (size_t j = 0; j < nSrcWidth; j += blocksize)
390✔
5733
        {
5734
            // transpose the block beginning at [i,j]
5735
            const size_t max_l = std::min(j + blocksize, nSrcWidth);
220✔
5736
            for (size_t k = i; k < max_k; ++k)
2,509✔
5737
            {
5738
                for (size_t l = j; l < max_l; ++l)
41,017✔
5739
                {
5740
                    GDALCopyWord(pSrc[l + k * nSrcWidth],
38,728✔
5741
                                 pDst[k + l * nSrcHeight]);
38,728✔
5742
                }
5743
            }
5744
        }
5745
    }
5746
}
145✔
5747

5748
/************************************************************************/
5749
/*                   GDALTranspose2DComplexToComplex()                  */
5750
/************************************************************************/
5751
/**
5752
 * Transpose a 2D array of complex values into an array of complex values,
5753
 * in a efficient (cache-oblivious) way.
5754
 *
5755
 * @param pSrc Source array of height = nSrcHeight and width = nSrcWidth.
5756
 * @param pDst Destination transposed array of height = nSrcWidth and width = nSrcHeight.
5757
 * @param nSrcWidth Width of pSrc array.
5758
 * @param nSrcHeight Height of pSrc array.
5759
 */
5760
template <class DST, class SRC>
5761
void GDALTranspose2DComplexToComplex(const SRC *CPL_RESTRICT pSrc,
25✔
5762
                                     DST *CPL_RESTRICT pDst, size_t nSrcWidth,
5763
                                     size_t nSrcHeight)
5764
{
5765
    constexpr size_t blocksize = 32;
25✔
5766
    for (size_t i = 0; i < nSrcHeight; i += blocksize)
50✔
5767
    {
5768
        const size_t max_k = std::min(i + blocksize, nSrcHeight);
25✔
5769
        for (size_t j = 0; j < nSrcWidth; j += blocksize)
50✔
5770
        {
5771
            // transpose the block beginning at [i,j]
5772
            const size_t max_l = std::min(j + blocksize, nSrcWidth);
25✔
5773
            for (size_t k = i; k < max_k; ++k)
75✔
5774
            {
5775
                for (size_t l = j; l < max_l; ++l)
200✔
5776
                {
5777
                    GDALCopyWord(pSrc[2 * (l + k * nSrcWidth) + 0],
150✔
5778
                                 pDst[2 * (k + l * nSrcHeight) + 0]);
150✔
5779
                    GDALCopyWord(pSrc[2 * (l + k * nSrcWidth) + 1],
150✔
5780
                                 pDst[2 * (k + l * nSrcHeight) + 1]);
150✔
5781
                }
5782
            }
5783
        }
5784
    }
5785
}
25✔
5786

5787
/************************************************************************/
5788
/*                   GDALTranspose2DComplexToSingle()                  */
5789
/************************************************************************/
5790
/**
5791
 * Transpose a 2D array of complex values into an array of non-complex values,
5792
 * in a efficient (cache-oblivious) way.
5793
 *
5794
 * @param pSrc Source array of height = nSrcHeight and width = nSrcWidth.
5795
 * @param pDst Destination transposed array of height = nSrcWidth and width = nSrcHeight.
5796
 * @param nSrcWidth Width of pSrc array.
5797
 * @param nSrcHeight Height of pSrc array.
5798
 */
5799
template <class DST, class SRC>
5800
void GDALTranspose2DComplexToSingle(const SRC *CPL_RESTRICT pSrc,
55✔
5801
                                    DST *CPL_RESTRICT pDst, size_t nSrcWidth,
5802
                                    size_t nSrcHeight)
5803
{
5804
    constexpr size_t blocksize = 32;
55✔
5805
    for (size_t i = 0; i < nSrcHeight; i += blocksize)
110✔
5806
    {
5807
        const size_t max_k = std::min(i + blocksize, nSrcHeight);
55✔
5808
        for (size_t j = 0; j < nSrcWidth; j += blocksize)
110✔
5809
        {
5810
            // transpose the block beginning at [i,j]
5811
            const size_t max_l = std::min(j + blocksize, nSrcWidth);
55✔
5812
            for (size_t k = i; k < max_k; ++k)
165✔
5813
            {
5814
                for (size_t l = j; l < max_l; ++l)
440✔
5815
                {
5816
                    GDALCopyWord(pSrc[2 * (l + k * nSrcWidth) + 0],
330✔
5817
                                 pDst[k + l * nSrcHeight]);
330✔
5818
                }
5819
            }
5820
        }
5821
    }
5822
}
55✔
5823

5824
/************************************************************************/
5825
/*                   GDALTranspose2DSingleToComplex()                  */
5826
/************************************************************************/
5827
/**
5828
 * Transpose a 2D array of non-complex values into an array of complex values,
5829
 * in a efficient (cache-oblivious) way.
5830
 *
5831
 * @param pSrc Source array of height = nSrcHeight and width = nSrcWidth.
5832
 * @param pDst Destination transposed array of height = nSrcWidth and width = nSrcHeight.
5833
 * @param nSrcWidth Width of pSrc array.
5834
 * @param nSrcHeight Height of pSrc array.
5835
 */
5836
template <class DST, class SRC>
5837
void GDALTranspose2DSingleToComplex(const SRC *CPL_RESTRICT pSrc,
55✔
5838
                                    DST *CPL_RESTRICT pDst, size_t nSrcWidth,
5839
                                    size_t nSrcHeight)
5840
{
5841
    constexpr size_t blocksize = 32;
55✔
5842
    for (size_t i = 0; i < nSrcHeight; i += blocksize)
110✔
5843
    {
5844
        const size_t max_k = std::min(i + blocksize, nSrcHeight);
55✔
5845
        for (size_t j = 0; j < nSrcWidth; j += blocksize)
110✔
5846
        {
5847
            // transpose the block beginning at [i,j]
5848
            const size_t max_l = std::min(j + blocksize, nSrcWidth);
55✔
5849
            for (size_t k = i; k < max_k; ++k)
165✔
5850
            {
5851
                for (size_t l = j; l < max_l; ++l)
440✔
5852
                {
5853
                    GDALCopyWord(pSrc[l + k * nSrcWidth],
330✔
5854
                                 pDst[2 * (k + l * nSrcHeight) + 0]);
330✔
5855
                    pDst[2 * (k + l * nSrcHeight) + 1] = 0;
330✔
5856
                }
5857
            }
5858
        }
5859
    }
5860
}
55✔
5861

5862
/************************************************************************/
5863
/*                        GDALTranspose2D()                             */
5864
/************************************************************************/
5865

5866
template <class DST, bool DST_IS_COMPLEX>
5867
static void GDALTranspose2D(const void *pSrc, GDALDataType eSrcType, DST *pDst,
280✔
5868
                            size_t nSrcWidth, size_t nSrcHeight)
5869
{
5870
#define CALL_GDALTranspose2D_internal(SRC_TYPE)                                \
5871
    do                                                                         \
5872
    {                                                                          \
5873
        if constexpr (DST_IS_COMPLEX)                                          \
5874
        {                                                                      \
5875
            GDALTranspose2DSingleToComplex(                                    \
5876
                static_cast<const SRC_TYPE *>(pSrc), pDst, nSrcWidth,          \
5877
                nSrcHeight);                                                   \
5878
        }                                                                      \
5879
        else                                                                   \
5880
        {                                                                      \
5881
            GDALTranspose2DSingleToSingle(static_cast<const SRC_TYPE *>(pSrc), \
5882
                                          pDst, nSrcWidth, nSrcHeight);        \
5883
        }                                                                      \
5884
    } while (0)
5885

5886
#define CALL_GDALTranspose2DComplex_internal(SRC_TYPE)                         \
5887
    do                                                                         \
5888
    {                                                                          \
5889
        if constexpr (DST_IS_COMPLEX)                                          \
5890
        {                                                                      \
5891
            GDALTranspose2DComplexToComplex(                                   \
5892
                static_cast<const SRC_TYPE *>(pSrc), pDst, nSrcWidth,          \
5893
                nSrcHeight);                                                   \
5894
        }                                                                      \
5895
        else                                                                   \
5896
        {                                                                      \
5897
            GDALTranspose2DComplexToSingle(                                    \
5898
                static_cast<const SRC_TYPE *>(pSrc), pDst, nSrcWidth,          \
5899
                nSrcHeight);                                                   \
5900
        }                                                                      \
5901
    } while (0)
5902

5903
    // clang-format off
5904
    switch (eSrcType)
280✔
5905
    {
5906
        case GDT_Byte:     CALL_GDALTranspose2D_internal(uint8_t); break;
16✔
5907
        case GDT_Int8:     CALL_GDALTranspose2D_internal(int8_t); break;
15✔
5908
        case GDT_UInt16:   CALL_GDALTranspose2D_internal(uint16_t); break;
24✔
5909
        case GDT_Int16:    CALL_GDALTranspose2D_internal(int16_t); break;
16✔
5910
        case GDT_UInt32:   CALL_GDALTranspose2D_internal(uint32_t); break;
24✔
5911
        case GDT_Int32:    CALL_GDALTranspose2D_internal(int32_t); break;
16✔
5912
        case GDT_UInt64:   CALL_GDALTranspose2D_internal(uint64_t); break;
16✔
5913
        case GDT_Int64:    CALL_GDALTranspose2D_internal(int64_t); break;
16✔
5914
        case GDT_Float16:  CALL_GDALTranspose2D_internal(GFloat16); break;
16✔
5915
        case GDT_Float32:  CALL_GDALTranspose2D_internal(float); break;
17✔
5916
        case GDT_Float64:  CALL_GDALTranspose2D_internal(double); break;
24✔
5917
        case GDT_CInt16:   CALL_GDALTranspose2DComplex_internal(int16_t); break;
16✔
5918
        case GDT_CInt32:   CALL_GDALTranspose2DComplex_internal(int32_t); break;
16✔
5919
        case GDT_CFloat16: CALL_GDALTranspose2DComplex_internal(GFloat16); break;
16✔
5920
        case GDT_CFloat32: CALL_GDALTranspose2DComplex_internal(float); break;
16✔
5921
        case GDT_CFloat64: CALL_GDALTranspose2DComplex_internal(double); break;
16✔
5922
        case GDT_Unknown:
×
5923
        case GDT_TypeCount:
5924
            break;
×
5925
    }
5926
        // clang-format on
5927

5928
#undef CALL_GDALTranspose2D_internal
5929
#undef CALL_GDALTranspose2DComplex_internal
5930
}
280✔
5931

5932
/************************************************************************/
5933
/*                      GDALInterleave2Byte()                           */
5934
/************************************************************************/
5935

5936
#if defined(HAVE_SSE2) &&                                                      \
5937
    (!defined(__GNUC__) || defined(__INTEL_CLANG_COMPILER))
5938

5939
// ICC autovectorizer doesn't do a good job at generating good SSE code,
5940
// at least with icx 2024.0.2.20231213, but it nicely unrolls the below loop.
5941
#if defined(__GNUC__)
5942
__attribute__((noinline))
5943
#endif
5944
static void
5945
GDALInterleave2Byte(const uint8_t *CPL_RESTRICT pSrc,
5946
                    uint8_t *CPL_RESTRICT pDst, size_t nIters)
5947
{
5948
    size_t i = 0;
5949
    constexpr size_t VALS_PER_ITER = 16;
5950
    for (i = 0; i + VALS_PER_ITER <= nIters; i += VALS_PER_ITER)
5951
    {
5952
        __m128i xmm0 =
5953
            _mm_loadu_si128(reinterpret_cast<__m128i const *>(pSrc + i));
5954
        __m128i xmm1 = _mm_loadu_si128(
5955
            reinterpret_cast<__m128i const *>(pSrc + i + nIters));
5956
        _mm_storeu_si128(reinterpret_cast<__m128i *>(pDst + 2 * i),
5957
                         _mm_unpacklo_epi8(xmm0, xmm1));
5958
        _mm_storeu_si128(
5959
            reinterpret_cast<__m128i *>(pDst + 2 * i + VALS_PER_ITER),
5960
            _mm_unpackhi_epi8(xmm0, xmm1));
5961
    }
5962
#if defined(__clang__)
5963
#pragma clang loop vectorize(disable)
5964
#endif
5965
    for (; i < nIters; ++i)
5966
    {
5967
        pDst[2 * i + 0] = pSrc[i + 0 * nIters];
5968
        pDst[2 * i + 1] = pSrc[i + 1 * nIters];
5969
    }
5970
}
5971

5972
#else
5973

5974
#if defined(__GNUC__) && !defined(__clang__)
5975
__attribute__((optimize("tree-vectorize")))
5976
#endif
5977
#if defined(__GNUC__)
5978
__attribute__((noinline))
5979
#endif
5980
#if defined(__clang__) && !defined(__INTEL_CLANG_COMPILER)
5981
// clang++ -O2 -fsanitize=undefined fails to vectorize, ignore that warning
5982
#pragma clang diagnostic push
5983
#pragma clang diagnostic ignored "-Wpass-failed"
5984
#endif
5985
static void
5986
GDALInterleave2Byte(const uint8_t *CPL_RESTRICT pSrc,
4✔
5987
                    uint8_t *CPL_RESTRICT pDst, size_t nIters)
5988
{
5989
#if defined(__clang__) && !defined(__INTEL_CLANG_COMPILER)
5990
#pragma clang loop vectorize(enable)
5991
#endif
5992
    for (size_t i = 0; i < nIters; ++i)
44✔
5993
    {
5994
        pDst[2 * i + 0] = pSrc[i + 0 * nIters];
40✔
5995
        pDst[2 * i + 1] = pSrc[i + 1 * nIters];
40✔
5996
    }
5997
}
4✔
5998
#if defined(__clang__) && !defined(__INTEL_CLANG_COMPILER)
5999
#pragma clang diagnostic pop
6000
#endif
6001

6002
#endif
6003

6004
/************************************************************************/
6005
/*                      GDALInterleave4Byte()                           */
6006
/************************************************************************/
6007

6008
#if defined(HAVE_SSE2) &&                                                      \
6009
    (!defined(__GNUC__) || defined(__INTEL_CLANG_COMPILER))
6010

6011
// ICC autovectorizer doesn't do a good job at generating good SSE code,
6012
// at least with icx 2024.0.2.20231213, but it nicely unrolls the below loop.
6013
#if defined(__GNUC__)
6014
__attribute__((noinline))
6015
#endif
6016
static void
6017
GDALInterleave4Byte(const uint8_t *CPL_RESTRICT pSrc,
6018
                    uint8_t *CPL_RESTRICT pDst, size_t nIters)
6019
{
6020
    size_t i = 0;
6021
    constexpr size_t VALS_PER_ITER = 16;
6022
    for (i = 0; i + VALS_PER_ITER <= nIters; i += VALS_PER_ITER)
6023
    {
6024
        __m128i xmm0 = _mm_loadu_si128(
6025
            reinterpret_cast<__m128i const *>(pSrc + i + 0 * nIters));
6026
        __m128i xmm1 = _mm_loadu_si128(
6027
            reinterpret_cast<__m128i const *>(pSrc + i + 1 * nIters));
6028
        __m128i xmm2 = _mm_loadu_si128(
6029
            reinterpret_cast<__m128i const *>(pSrc + i + 2 * nIters));
6030
        __m128i xmm3 = _mm_loadu_si128(
6031
            reinterpret_cast<__m128i const *>(pSrc + i + 3 * nIters));
6032
        auto tmp0 = _mm_unpacklo_epi8(
6033
            xmm0,
6034
            xmm1);  // (xmm0_0, xmm1_0, xmm0_1, xmm1_1, xmm0_2, xmm1_2, ...)
6035
        auto tmp1 = _mm_unpackhi_epi8(
6036
            xmm0,
6037
            xmm1);  // (xmm0_8, xmm1_8, xmm0_9, xmm1_9, xmm0_10, xmm1_10, ...)
6038
        auto tmp2 = _mm_unpacklo_epi8(
6039
            xmm2,
6040
            xmm3);  // (xmm2_0, xmm3_0, xmm2_1, xmm3_1, xmm2_2, xmm3_2, ...)
6041
        auto tmp3 = _mm_unpackhi_epi8(
6042
            xmm2,
6043
            xmm3);  // (xmm2_8, xmm3_8, xmm2_9, xmm3_9, xmm2_10, xmm3_10, ...)
6044
        auto tmp2_0 = _mm_unpacklo_epi16(
6045
            tmp0,
6046
            tmp2);  // (xmm0_0, xmm1_0, xmm2_0, xmm3_0, xmm0_1, xmm1_1, xmm2_1, xmm3_1, ...)
6047
        auto tmp2_1 = _mm_unpackhi_epi16(tmp0, tmp2);
6048
        auto tmp2_2 = _mm_unpacklo_epi16(tmp1, tmp3);
6049
        auto tmp2_3 = _mm_unpackhi_epi16(tmp1, tmp3);
6050
        _mm_storeu_si128(
6051
            reinterpret_cast<__m128i *>(pDst + 4 * i + 0 * VALS_PER_ITER),
6052
            tmp2_0);
6053
        _mm_storeu_si128(
6054
            reinterpret_cast<__m128i *>(pDst + 4 * i + 1 * VALS_PER_ITER),
6055
            tmp2_1);
6056
        _mm_storeu_si128(
6057
            reinterpret_cast<__m128i *>(pDst + 4 * i + 2 * VALS_PER_ITER),
6058
            tmp2_2);
6059
        _mm_storeu_si128(
6060
            reinterpret_cast<__m128i *>(pDst + 4 * i + 3 * VALS_PER_ITER),
6061
            tmp2_3);
6062
    }
6063
#if defined(__clang__)
6064
#pragma clang loop vectorize(disable)
6065
#endif
6066
    for (; i < nIters; ++i)
6067
    {
6068
        pDst[4 * i + 0] = pSrc[i + 0 * nIters];
6069
        pDst[4 * i + 1] = pSrc[i + 1 * nIters];
6070
        pDst[4 * i + 2] = pSrc[i + 2 * nIters];
6071
        pDst[4 * i + 3] = pSrc[i + 3 * nIters];
6072
    }
6073
}
6074

6075
#else
6076

6077
#if defined(__GNUC__) && !defined(__clang__)
6078
__attribute__((optimize("tree-vectorize")))
6079
#endif
6080
#if defined(__GNUC__)
6081
__attribute__((noinline))
6082
#endif
6083
#if defined(__clang__) && !defined(__INTEL_CLANG_COMPILER)
6084
// clang++ -O2 -fsanitize=undefined fails to vectorize, ignore that warning
6085
#pragma clang diagnostic push
6086
#pragma clang diagnostic ignored "-Wpass-failed"
6087
#endif
6088
static void
6089
GDALInterleave4Byte(const uint8_t *CPL_RESTRICT pSrc,
2✔
6090
                    uint8_t *CPL_RESTRICT pDst, size_t nIters)
6091
{
6092
#if defined(__clang__) && !defined(__INTEL_CLANG_COMPILER)
6093
#pragma clang loop vectorize(enable)
6094
#endif
6095
    for (size_t i = 0; i < nIters; ++i)
36✔
6096
    {
6097
        pDst[4 * i + 0] = pSrc[i + 0 * nIters];
34✔
6098
        pDst[4 * i + 1] = pSrc[i + 1 * nIters];
34✔
6099
        pDst[4 * i + 2] = pSrc[i + 2 * nIters];
34✔
6100
        pDst[4 * i + 3] = pSrc[i + 3 * nIters];
34✔
6101
    }
6102
}
2✔
6103
#if defined(__clang__) && !defined(__INTEL_CLANG_COMPILER)
6104
#pragma clang diagnostic pop
6105
#endif
6106

6107
#endif
6108

6109
/************************************************************************/
6110
/*                        GDALTranspose2D()                             */
6111
/************************************************************************/
6112

6113
/**
6114
 * Transpose a 2D array in a efficient (cache-oblivious) way.
6115
 *
6116
 * @param pSrc Source array of width = nSrcWidth and height = nSrcHeight.
6117
 * @param eSrcType Data type of pSrc.
6118
 * @param pDst Destination transposed array of width = nSrcHeight and height = nSrcWidth.
6119
 * @param eDstType Data type of pDst.
6120
 * @param nSrcWidth Width of pSrc array.
6121
 * @param nSrcHeight Height of pSrc array.
6122
 * @since GDAL 3.11
6123
 */
6124

6125
void GDALTranspose2D(const void *pSrc, GDALDataType eSrcType, void *pDst,
305✔
6126
                     GDALDataType eDstType, size_t nSrcWidth, size_t nSrcHeight)
6127
{
6128
    if (eSrcType == eDstType && (eSrcType == GDT_Byte || eSrcType == GDT_Int8))
305✔
6129
    {
6130
        if (nSrcHeight == 2)
25✔
6131
        {
6132
            GDALInterleave2Byte(static_cast<const uint8_t *>(pSrc),
4✔
6133
                                static_cast<uint8_t *>(pDst), nSrcWidth);
6134
            return;
4✔
6135
        }
6136
        if (nSrcHeight == 4)
21✔
6137
        {
6138
            GDALInterleave4Byte(static_cast<const uint8_t *>(pSrc),
2✔
6139
                                static_cast<uint8_t *>(pDst), nSrcWidth);
6140
            return;
2✔
6141
        }
6142
#if (defined(HAVE_SSSE3_AT_COMPILE_TIME) &&                                    \
6143
     (defined(__x86_64) || defined(_M_X64)))
6144
        if (CPLHaveRuntimeSSSE3())
19✔
6145
        {
6146
            GDALTranspose2D_Byte_SSSE3(static_cast<const uint8_t *>(pSrc),
19✔
6147
                                       static_cast<uint8_t *>(pDst), nSrcWidth,
6148
                                       nSrcHeight);
6149
            return;
19✔
6150
        }
6151
#elif defined(USE_NEON_OPTIMIZATIONS)
6152
        {
6153
            GDALTranspose2D_Byte_SSSE3(static_cast<const uint8_t *>(pSrc),
6154
                                       static_cast<uint8_t *>(pDst), nSrcWidth,
6155
                                       nSrcHeight);
6156
            return;
6157
        }
6158
#endif
6159
    }
6160

6161
#define CALL_GDALTranspose2D_internal(DST_TYPE, DST_IS_COMPLEX)                \
6162
    GDALTranspose2D<DST_TYPE, DST_IS_COMPLEX>(                                 \
6163
        pSrc, eSrcType, static_cast<DST_TYPE *>(pDst), nSrcWidth, nSrcHeight)
6164

6165
    // clang-format off
6166
    switch (eDstType)
280✔
6167
    {
6168
        case GDT_Byte:     CALL_GDALTranspose2D_internal(uint8_t, false); break;
15✔
6169
        case GDT_Int8:     CALL_GDALTranspose2D_internal(int8_t, false); break;
15✔
6170
        case GDT_UInt16:   CALL_GDALTranspose2D_internal(uint16_t, false); break;
24✔
6171
        case GDT_Int16:    CALL_GDALTranspose2D_internal(int16_t, false); break;
16✔
6172
        case GDT_UInt32:   CALL_GDALTranspose2D_internal(uint32_t, false); break;
24✔
6173
        case GDT_Int32:    CALL_GDALTranspose2D_internal(int32_t, false); break;
16✔
6174
        case GDT_UInt64:   CALL_GDALTranspose2D_internal(uint64_t, false); break;
16✔
6175
        case GDT_Int64:    CALL_GDALTranspose2D_internal(int64_t, false); break;
16✔
6176
        case GDT_Float16:  CALL_GDALTranspose2D_internal(GFloat16, false); break;
16✔
6177
        case GDT_Float32:  CALL_GDALTranspose2D_internal(float, false); break;
17✔
6178
        case GDT_Float64:  CALL_GDALTranspose2D_internal(double, false); break;
25✔
6179
        case GDT_CInt16:   CALL_GDALTranspose2D_internal(int16_t, true); break;
16✔
6180
        case GDT_CInt32:   CALL_GDALTranspose2D_internal(int32_t, true); break;
16✔
6181
        case GDT_CFloat16: CALL_GDALTranspose2D_internal(GFloat16, true); break;
16✔
6182
        case GDT_CFloat32: CALL_GDALTranspose2D_internal(float, true); break;
16✔
6183
        case GDT_CFloat64: CALL_GDALTranspose2D_internal(double, true); break;
16✔
6184
        case GDT_Unknown:
×
6185
        case GDT_TypeCount:
6186
            break;
×
6187
    }
6188
        // clang-format on
6189

6190
#undef CALL_GDALTranspose2D_internal
6191
}
6192

6193
/************************************************************************/
6194
/*                     ExtractBitAndConvertTo255()                      */
6195
/************************************************************************/
6196

6197
#if defined(__GNUC__) || defined(_MSC_VER)
6198
// Signedness of char implementation dependent, so be explicit.
6199
// Assumes 2-complement integer types and sign extension of right shifting
6200
// GCC guarantees such:
6201
// https://gcc.gnu.org/onlinedocs/gcc/Integers-implementation.html#Integers-implementation
6202
static inline GByte ExtractBitAndConvertTo255(GByte byVal, int nBit)
156,490✔
6203
{
6204
    return static_cast<GByte>(static_cast<signed char>(byVal << (7 - nBit)) >>
156,490✔
6205
                              7);
156,490✔
6206
}
6207
#else
6208
// Portable way
6209
static inline GByte ExtractBitAndConvertTo255(GByte byVal, int nBit)
6210
{
6211
    return (byVal & (1 << nBit)) ? 255 : 0;
6212
}
6213
#endif
6214

6215
/************************************************************************/
6216
/*                   ExpandEightPackedBitsToByteAt255()                 */
6217
/************************************************************************/
6218

6219
// Expand the 8 bits of byVal into 8 bytes of abyOutput, starting with the
// most significant bit; each byte is the result of
// ExtractBitAndConvertTo255() for the corresponding bit.
static inline void ExpandEightPackedBitsToByteAt255(GByte byVal,
                                                    GByte abyOutput[8])
{
    for (int iOut = 0; iOut < 8; ++iOut)
    {
        // Output position iOut holds bit number (7 - iOut)
        abyOutput[iOut] = ExtractBitAndConvertTo255(byVal, 7 - iOut);
    }
}
19,377✔
6231

6232
/************************************************************************/
6233
/*                GDALExpandPackedBitsToByteAt0Or255()                  */
6234
/************************************************************************/
6235

6236
/** Expand packed-bits (ordered from most-significant bit to least one)
6237
  into a byte each, where a bit at 0 is expanded to a byte at 0, and a bit
6238
  at 1 to a byte at 255.
6239

6240
 The function does (in a possibly more optimized way) the following:
6241
 \code{.cpp}
6242
 for (size_t i = 0; i < nInputBits; ++i )
6243
 {
6244
     pabyOutput[i] = (pabyInput[i / 8] & (1 << (7 - (i % 8)))) ? 255 : 0;
6245
 }
6246
 \endcode
6247

6248
 @param pabyInput Input array of (nInputBits + 7) / 8 bytes.
6249
 @param pabyOutput Output array of nInputBits bytes.
6250
 @param nInputBits Number of valid bits in pabyInput.
6251

6252
 @since 3.11
6253
*/
6254

6255
void GDALExpandPackedBitsToByteAt0Or255(const GByte *CPL_RESTRICT pabyInput,
44,405✔
6256
                                        GByte *CPL_RESTRICT pabyOutput,
6257
                                        size_t nInputBits)
6258
{
6259
    const size_t nInputWholeBytes = nInputBits / 8;
44,405✔
6260
    size_t iByte = 0;
44,405✔
6261

6262
#ifdef HAVE_SSE2
6263
    // Mask to isolate each bit
6264
    const __m128i bit_mask = _mm_set_epi8(1, 2, 4, 8, 16, 32, 64, -128, 1, 2, 4,
44,405✔
6265
                                          8, 16, 32, 64, -128);
6266
    const __m128i zero = _mm_setzero_si128();
44,405✔
6267
    const __m128i all_ones = _mm_set1_epi8(-1);
44,405✔
6268
#ifdef __SSSE3__
6269
    const __m128i dispatch_two_bytes =
6270
        _mm_set_epi8(1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0);
6271
#endif
6272
    constexpr size_t SSE_REG_SIZE = sizeof(bit_mask);
44,405✔
6273
    for (; iByte + SSE_REG_SIZE <= nInputWholeBytes; iByte += SSE_REG_SIZE)
132,354✔
6274
    {
6275
        __m128i reg_ori = _mm_loadu_si128(
87,949✔
6276
            reinterpret_cast<const __m128i *>(pabyInput + iByte));
87,949✔
6277

6278
        constexpr int NUM_PROCESSED_BYTES_PER_REG = 2;
87,949✔
6279
        for (size_t k = 0; k < SSE_REG_SIZE / NUM_PROCESSED_BYTES_PER_REG; ++k)
791,541✔
6280
        {
6281
            // Given reg_ori = (A, B, ... 14 other bytes ...),
6282
            // expand to (A, A, A, A, A, A, A, A, B, B, B, B, B, B, B, B)
6283
#ifdef __SSSE3__
6284
            __m128i reg = _mm_shuffle_epi8(reg_ori, dispatch_two_bytes);
6285
#else
6286
            __m128i reg = _mm_unpacklo_epi8(reg_ori, reg_ori);
703,592✔
6287
            reg = _mm_unpacklo_epi16(reg, reg);
703,592✔
6288
            reg = _mm_unpacklo_epi32(reg, reg);
703,592✔
6289
#endif
6290

6291
            // Test if bits of interest are set
6292
            reg = _mm_and_si128(reg, bit_mask);
703,592✔
6293

6294
            // Now test if those bits are set, by comparing to zero. So the
6295
            // result will be that bytes where bits are set will be at 0, and
6296
            // ones where they are cleared will be at 0xFF. So the inverse of
6297
            // the end result we want!
6298
            reg = _mm_cmpeq_epi8(reg, zero);
703,592✔
6299

6300
            // Invert the result
6301
            reg = _mm_andnot_si128(reg, all_ones);
703,592✔
6302

6303
            _mm_storeu_si128(reinterpret_cast<__m128i *>(pabyOutput), reg);
6304

6305
            pabyOutput += SSE_REG_SIZE;
703,592✔
6306

6307
            // Right-shift of 2 bytes
6308
            reg_ori = _mm_bsrli_si128(reg_ori, NUM_PROCESSED_BYTES_PER_REG);
703,592✔
6309
        }
6310
    }
6311

6312
#endif  // HAVE_SSE2
6313

6314
    for (; iByte < nInputWholeBytes; ++iByte)
63,782✔
6315
    {
6316
        ExpandEightPackedBitsToByteAt255(pabyInput[iByte], pabyOutput);
19,377✔
6317
        pabyOutput += 8;
19,377✔
6318
    }
6319
    for (int iBit = 0; iBit < static_cast<int>(nInputBits % 8); ++iBit)
45,879✔
6320
    {
6321
        *pabyOutput = ExtractBitAndConvertTo255(pabyInput[iByte], 7 - iBit);
1,474✔
6322
        ++pabyOutput;
1,474✔
6323
    }
6324
}
44,405✔
6325

6326
/************************************************************************/
6327
/*                   ExpandEightPackedBitsToByteAt1()                   */
6328
/************************************************************************/
6329

6330
// Expand the 8 bits of byVal into 8 bytes of abyOutput, starting with the
// most significant bit; each byte is set to the value (0 or 1) of the
// corresponding bit.
static inline void ExpandEightPackedBitsToByteAt1(GByte byVal,
                                                  GByte abyOutput[8])
{
    for (int iOut = 0; iOut < 8; ++iOut)
    {
        // Output position iOut holds bit number (7 - iOut)
        abyOutput[iOut] = (byVal >> (7 - iOut)) & 0x1;
    }
}
136,113✔
6342

6343
/************************************************************************/
6344
/*                GDALExpandPackedBitsToByteAt0Or1()                    */
6345
/************************************************************************/
6346

6347
/** Expand packed-bits (ordered from most-significant bit to least one)
6348
  into a byte each, where a bit at 0 is expanded to a byte at 0, and a bit
6349
  at 1 to a byte at 1.
6350

6351
 The function does (in a possibly more optimized way) the following:
6352
 \code{.cpp}
6353
 for (size_t i = 0; i < nInputBits; ++i )
6354
 {
6355
     pabyOutput[i] = (pabyInput[i / 8] & (1 << (7 - (i % 8)))) ? 1 : 0;
6356
 }
6357
 \endcode
6358

6359
 @param pabyInput Input array of (nInputBits + 7) / 8 bytes.
6360
 @param pabyOutput Output array of nInputBits bytes.
6361
 @param nInputBits Number of valid bits in pabyInput.
6362

6363
 @since 3.11
6364
*/
6365

6366
void GDALExpandPackedBitsToByteAt0Or1(const GByte *CPL_RESTRICT pabyInput,
7,041✔
6367
                                      GByte *CPL_RESTRICT pabyOutput,
6368
                                      size_t nInputBits)
6369
{
6370
    const size_t nInputWholeBytes = nInputBits / 8;
7,041✔
6371
    size_t iByte = 0;
7,041✔
6372
    for (; iByte < nInputWholeBytes; ++iByte)
143,154✔
6373
    {
6374
        ExpandEightPackedBitsToByteAt1(pabyInput[iByte], pabyOutput);
136,113✔
6375
        pabyOutput += 8;
136,113✔
6376
    }
6377
    for (int iBit = 0; iBit < static_cast<int>(nInputBits % 8); ++iBit)
18,902✔
6378
    {
6379
        *pabyOutput = (pabyInput[iByte] >> (7 - iBit)) & 0x1;
11,861✔
6380
        ++pabyOutput;
11,861✔
6381
    }
6382
}
7,041✔
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2025 Coveralls, Inc