Skip to content

Commit efad694

Browse files
committed
Some improvements to the existing SSE3 optimization of the bilateral filter for the 8UC3 case. Perf tests now take 6120 ms instead of the previous 7250 ms (1.18x speed-up).
1 parent 242a6de commit efad694

File tree

1 file changed

+12
-8
lines changed

1 file changed

+12
-8
lines changed

modules/imgproc/src/smooth.cpp

+12-8
Original file line numberDiff line numberDiff line change
@@ -1787,21 +1787,25 @@ class BilateralFilter_8u_Invoker :
17871787
#if CV_SSE3
17881788
if( haveSSE3 )
17891789
{
1790+
const __m128i izero = _mm_setzero_si128();
17901791
const __m128 _b0 = _mm_set1_ps(static_cast<float>(b0));
17911792
const __m128 _g0 = _mm_set1_ps(static_cast<float>(g0));
17921793
const __m128 _r0 = _mm_set1_ps(static_cast<float>(r0));
17931794
const __m128 _signMask = _mm_load_ps((const float*)bufSignMask);
17941795

17951796
for( ; k <= maxk - 4; k += 4 )
17961797
{
1797-
const uchar* sptr_k = sptr + j + space_ofs[k];
1798-
const uchar* sptr_k1 = sptr + j + space_ofs[k+1];
1799-
const uchar* sptr_k2 = sptr + j + space_ofs[k+2];
1800-
const uchar* sptr_k3 = sptr + j + space_ofs[k+3];
1801-
1802-
__m128 _b = _mm_set_ps(sptr_k3[0],sptr_k2[0],sptr_k1[0],sptr_k[0]);
1803-
__m128 _g = _mm_set_ps(sptr_k3[1],sptr_k2[1],sptr_k1[1],sptr_k[1]);
1804-
__m128 _r = _mm_set_ps(sptr_k3[2],sptr_k2[2],sptr_k1[2],sptr_k[2]);
1798+
const int* const sptr_k0 = reinterpret_cast<const int*>(sptr + j + space_ofs[k]);
1799+
const int* const sptr_k1 = reinterpret_cast<const int*>(sptr + j + space_ofs[k+1]);
1800+
const int* const sptr_k2 = reinterpret_cast<const int*>(sptr + j + space_ofs[k+2]);
1801+
const int* const sptr_k3 = reinterpret_cast<const int*>(sptr + j + space_ofs[k+3]);
1802+
1803+
__m128 _b = _mm_cvtepi32_ps(_mm_unpacklo_epi16(_mm_unpacklo_epi8(_mm_cvtsi32_si128(sptr_k0[0]), izero), izero));
1804+
__m128 _g = _mm_cvtepi32_ps(_mm_unpacklo_epi16(_mm_unpacklo_epi8(_mm_cvtsi32_si128(sptr_k1[0]), izero), izero));
1805+
__m128 _r = _mm_cvtepi32_ps(_mm_unpacklo_epi16(_mm_unpacklo_epi8(_mm_cvtsi32_si128(sptr_k2[0]), izero), izero));
1806+
__m128 _z = _mm_cvtepi32_ps(_mm_unpacklo_epi16(_mm_unpacklo_epi8(_mm_cvtsi32_si128(sptr_k3[0]), izero), izero));
1807+
1808+
_MM_TRANSPOSE4_PS(_b, _g, _r, _z);
18051809

18061810
__m128 bt = _mm_andnot_ps(_signMask, _mm_sub_ps(_b,_b0));
18071811
__m128 gt = _mm_andnot_ps(_signMask, _mm_sub_ps(_g,_g0));

0 commit comments

Comments
 (0)