Skip to content

Commit

Permalink
Warper: optimize speed of Lanczos resampling in Byte case without any…
Browse files Browse the repository at this point in the history
… masking/alpha
  • Loading branch information
rouault committed Oct 18, 2024
1 parent 1650f86 commit 7727dba
Showing 1 changed file with 134 additions and 30 deletions.
164 changes: 134 additions & 30 deletions alg/gdalwarpkernel.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3838,13 +3838,15 @@ static bool GWKResampleOptimizedLanczos(const GDALWarpKernel *poWK, int iBand,
const double dfYScale = poWK->dfYScale;

// Space for saved X weights.
double *padfWeightsX = psWrkStruct->padfWeightsX;
double *padfWeightsY = psWrkStruct->padfWeightsY;
double *const padfWeightsXShifted =
psWrkStruct->padfWeightsX - poWK->nFiltInitX;
double *const padfWeightsYShifted =
psWrkStruct->padfWeightsY - poWK->nFiltInitY;

// Space for saving a row of pixels.
double *padfRowDensity = psWrkStruct->padfRowDensity;
double *padfRowReal = psWrkStruct->padfRowReal;
double *padfRowImag = psWrkStruct->padfRowImag;
double *const padfRowDensity = psWrkStruct->padfRowDensity;
double *const padfRowReal = psWrkStruct->padfRowReal;
double *const padfRowImag = psWrkStruct->padfRowImag;

// Skip sampling over edge of image.
int jMin = poWK->nFiltInitY;
Expand Down Expand Up @@ -3873,7 +3875,7 @@ static bool GWKResampleOptimizedLanczos(const GDALWarpKernel *poWK, int iBand,
Naive version:
for (int i = iMin; i <= iMax; ++i)
{
psWrkStruct->padfWeightsX[i - poWK->nFiltInitX] =
psWrkStruct->padfWeightsXShifted[i] =
GWKLanczosSinc((i - dfDeltaX) * dfXScale);
}
Expand Down Expand Up @@ -3918,7 +3920,7 @@ static bool GWKResampleOptimizedLanczos(const GDALWarpKernel *poWK, int iBand,
const double dfSinPiXScaleOver3 = psWrkStruct->dfSinPiXScaleOver3;
const double dfCosPiXScale = psWrkStruct->dfCosPiXScale;
const double dfSinPiXScale = psWrkStruct->dfSinPiXScale;
psWrkStruct->padfWeightsX[iMin - poWK->nFiltInitX] =
padfWeightsXShifted[iMin] =
dfX == 0 ? 1.0 : dfSin * dfSinOver3 / (dfPIX * dfPIXover3);
for (int i = iMin + 1; i <= iMax; ++i)
{
Expand All @@ -3928,7 +3930,7 @@ static bool GWKResampleOptimizedLanczos(const GDALWarpKernel *poWK, int iBand,
dfSin * dfCosPiXScale + dfCos * dfSinPiXScale;
const double dfNewSinOver3 = dfSinOver3 * dfCosPiXScaleOver3 +
dfCosOver3 * dfSinPiXScaleOver3;
psWrkStruct->padfWeightsX[i - poWK->nFiltInitX] =
padfWeightsXShifted[i] =
dfX == 0 ? 1.0
: dfNewSin * dfNewSinOver3 / (dfPIX * dfPIX / 3);
const double dfNewCos =
Expand Down Expand Up @@ -3998,10 +4000,9 @@ static bool GWKResampleOptimizedLanczos(const GDALWarpKernel *poWK, int iBand,
{
const double dfX = i - dfDeltaX;
if (dfX == 0.0)
padfWeightsX[i - poWK->nFiltInitX] = 1.0;
padfWeightsXShifted[i] = 1.0;
else
padfWeightsX[i - poWK->nFiltInitX] =
padfCst[(i + 3) % 3] / (dfX * dfX);
padfWeightsXShifted[i] = padfCst[(i + 3) % 3] / (dfX * dfX);
#if DEBUG_VERBOSE
// TODO(schwehr): AlmostEqual.
// CPLAssert(fabs(padfWeightsX[i-poWK->nFiltInitX] -
Expand All @@ -4026,7 +4027,7 @@ static bool GWKResampleOptimizedLanczos(const GDALWarpKernel *poWK, int iBand,
Naive version:
for (int j = jMin; j <= jMax; ++j)
{
psWrkStruct->padfWeightsY[j - poWK->nFiltInitY] =
padfWeightsYShifted[j] =
GWKLanczosSinc((j - dfDeltaY) * dfYScale);
}
*/
Expand Down Expand Up @@ -4054,7 +4055,7 @@ static bool GWKResampleOptimizedLanczos(const GDALWarpKernel *poWK, int iBand,
const double dfSinPiYScaleOver3 = psWrkStruct->dfSinPiYScaleOver3;
const double dfCosPiYScale = psWrkStruct->dfCosPiYScale;
const double dfSinPiYScale = psWrkStruct->dfSinPiYScale;
psWrkStruct->padfWeightsY[jMin - poWK->nFiltInitY] =
padfWeightsYShifted[jMin] =
dfY == 0 ? 1.0 : dfSin * dfSinOver3 / (dfPIY * dfPIYover3);
for (int j = jMin + 1; j <= iMax; ++j)
{
Expand All @@ -4064,7 +4065,7 @@ static bool GWKResampleOptimizedLanczos(const GDALWarpKernel *poWK, int iBand,
dfSin * dfCosPiYScale + dfCos * dfSinPiYScale;
const double dfNewSinOver3 = dfSinOver3 * dfCosPiYScaleOver3 +
dfCosOver3 * dfSinPiYScaleOver3;
psWrkStruct->padfWeightsY[j - poWK->nFiltInitY] =
padfWeightsYShifted[j] =
dfY == 0 ? 1.0
: dfNewSin * dfNewSinOver3 / (dfPIY * dfPIY / 3);
const double dfNewCos =
Expand Down Expand Up @@ -4117,13 +4118,12 @@ static bool GWKResampleOptimizedLanczos(const GDALWarpKernel *poWK, int iBand,
{
const double dfY = j - dfDeltaY;
if (dfY == 0.0)
padfWeightsY[j - poWK->nFiltInitY] = 1.0;
padfWeightsYShifted[j] = 1.0;
else
padfWeightsY[j - poWK->nFiltInitY] =
padfCst[(j + 3) % 3] / (dfY * dfY);
padfWeightsYShifted[j] = padfCst[(j + 3) % 3] / (dfY * dfY);
#if DEBUG_VERBOSE
// TODO(schwehr): AlmostEqual.
// CPLAssert(fabs(padfWeightsY[j-poWK->nFiltInitY] -
// CPLAssert(fabs(padfWeightsYShifted[j] -
// GWKLanczosSinc(dfY, 3.0)) < 1e-10);
#endif
}
Expand All @@ -4133,30 +4133,135 @@ static bool GWKResampleOptimizedLanczos(const GDALWarpKernel *poWK, int iBand,
}
}

GPtrDiff_t iRowOffset =
iSrcOffset + static_cast<GPtrDiff_t>(jMin - 1) * nSrcXSize + iMin;

// If we have no density information, we can simply compute the
// accumulated weight.
if (padfRowDensity == nullptr)
{
double dfRowAccWeight = 0.0;
for (int i = iMin; i <= iMax; ++i)
{
dfRowAccWeight += padfWeightsX[i - poWK->nFiltInitX];
dfRowAccWeight += padfWeightsXShifted[i];
}
double dfColAccWeight = 0.0;
for (int j = jMin; j <= jMax; ++j)
{
dfColAccWeight += padfWeightsY[j - poWK->nFiltInitY];
dfColAccWeight += padfWeightsYShifted[j];
}
dfAccumulatorWeight = dfRowAccWeight * dfColAccWeight;
}

const bool bIsNonComplex = !GDALDataTypeIsComplex(poWK->eWorkingDataType);

// Loop over pixel rows in the kernel.

if (poWK->eWorkingDataType == GDT_Byte && !poWK->panUnifiedSrcValid &&
!poWK->papanBandSrcValid && !poWK->pafUnifiedSrcDensity &&
!padfRowDensity)
{
// Optimization for Byte case without any masking/alpha

if (dfAccumulatorWeight < 0.000001)
{
*pdfDensity = 0.0;
return false;
}

const GByte *pSrc =
reinterpret_cast<const GByte *>(poWK->papabySrcImage[iBand]);
pSrc += iSrcOffset + static_cast<GPtrDiff_t>(jMin) * nSrcXSize;

#if defined(__x86_64) || defined(_M_X64)
if (iMax - iMin + 1 == 6)
{
// This is just an optimized version of the general case in
// the else clause.

pSrc += iMin;
int j = jMin;
const auto fourXWeights =
XMMReg4Double::Load4Val(padfWeightsXShifted + iMin);

// Process 2 lines at the same time.
for (; j < jMax; j += 2)
{
const XMMReg4Double v_acc =
XMMReg4Double::Load4Val(pSrc) * fourXWeights;
const XMMReg4Double v_acc2 =
XMMReg4Double::Load4Val(pSrc + nSrcXSize) * fourXWeights;
const double dfRowAcc = v_acc.GetHorizSum();
const double dfRowAccEnd =
pSrc[4] * padfWeightsXShifted[iMin + 4] +
pSrc[5] * padfWeightsXShifted[iMin + 5];
dfAccumulatorReal +=
(dfRowAcc + dfRowAccEnd) * padfWeightsYShifted[j];
const double dfRowAcc2 = v_acc2.GetHorizSum();
const double dfRowAcc2End =
pSrc[nSrcXSize + 4] * padfWeightsXShifted[iMin + 4] +
pSrc[nSrcXSize + 5] * padfWeightsXShifted[iMin + 5];
dfAccumulatorReal +=
(dfRowAcc2 + dfRowAcc2End) * padfWeightsYShifted[j + 1];
pSrc += 2 * nSrcXSize;
}
if (j == jMax)
{
// Process last line if there's an odd number of them.

const XMMReg4Double v_acc =
XMMReg4Double::Load4Val(pSrc) * fourXWeights;
const double dfRowAcc = v_acc.GetHorizSum();
const double dfRowAccEnd =
pSrc[4] * padfWeightsXShifted[iMin + 4] +
pSrc[5] * padfWeightsXShifted[iMin + 5];
dfAccumulatorReal +=
(dfRowAcc + dfRowAccEnd) * padfWeightsYShifted[j];
}
}
else
#endif
{
for (int j = jMin; j <= jMax; ++j)
{
int i = iMin;
double dfRowAcc1 = 0.0;
double dfRowAcc2 = 0.0;
// A bit of loop unrolling
for (; i < iMax; i += 2)
{
dfRowAcc1 += pSrc[i] * padfWeightsXShifted[i];
dfRowAcc2 += pSrc[i + 1] * padfWeightsXShifted[i + 1];
}
if (i == iMax)
{
// Process last column if there's an odd number of them.
dfRowAcc1 += pSrc[i] * padfWeightsXShifted[i];
}

dfAccumulatorReal +=
(dfRowAcc1 + dfRowAcc2) * padfWeightsYShifted[j];
pSrc += nSrcXSize;
}
}

// Calculate the output taking into account weighting.
if (dfAccumulatorWeight < 0.99999 || dfAccumulatorWeight > 1.00001)
{
const double dfInvAcc = 1.0 / dfAccumulatorWeight;
*pdfReal = dfAccumulatorReal * dfInvAcc;
*pdfDensity = 1.0;
}
else
{
*pdfReal = dfAccumulatorReal;
*pdfDensity = 1.0;
}

return true;
}

GPtrDiff_t iRowOffset =
iSrcOffset + static_cast<GPtrDiff_t>(jMin - 1) * nSrcXSize + iMin;

int nCountValid = 0;
const bool bIsNonComplex = !GDALDataTypeIsComplex(poWK->eWorkingDataType);

for (int j = jMin; j <= jMax; ++j)
{
iRowOffset += nSrcXSize;
Expand All @@ -4170,7 +4275,7 @@ static bool GWKResampleOptimizedLanczos(const GDALWarpKernel *poWK, int iBand,
padfRowDensity, padfRowReal, padfRowImag))
continue;

const double dfWeight1 = padfWeightsY[j - poWK->nFiltInitY];
const double dfWeight1 = padfWeightsYShifted[j];

// Iterate over pixels in row.
if (padfRowDensity != nullptr)
Expand All @@ -4184,8 +4289,7 @@ static bool GWKResampleOptimizedLanczos(const GDALWarpKernel *poWK, int iBand,
nCountValid++;

// Use a cached set of weights for this row.
const double dfWeight2 =
dfWeight1 * padfWeightsX[i - poWK->nFiltInitX];
const double dfWeight2 = dfWeight1 * padfWeightsXShifted[i];

// Accumulate!
dfAccumulatorReal += padfRowReal[i - iMin] * dfWeight2;
Expand All @@ -4199,7 +4303,7 @@ static bool GWKResampleOptimizedLanczos(const GDALWarpKernel *poWK, int iBand,
double dfRowAccReal = 0.0;
for (int i = iMin; i <= iMax; ++i)
{
const double dfWeight2 = padfWeightsX[i - poWK->nFiltInitX];
const double dfWeight2 = padfWeightsXShifted[i];

// Accumulate!
dfRowAccReal += padfRowReal[i - iMin] * dfWeight2;
Expand All @@ -4213,7 +4317,7 @@ static bool GWKResampleOptimizedLanczos(const GDALWarpKernel *poWK, int iBand,
double dfRowAccImag = 0.0;
for (int i = iMin; i <= iMax; ++i)
{
const double dfWeight2 = padfWeightsX[i - poWK->nFiltInitX];
const double dfWeight2 = padfWeightsXShifted[i];

// Accumulate!
dfRowAccReal += padfRowReal[i - iMin] * dfWeight2;
Expand Down

0 comments on commit 7727dba

Please sign in to comment.