Skip to content

Commit 3eed5d8

Browse files
committed
additional improvements for 32f
1 parent efad694 commit 3eed5d8

File tree

1 file changed

+23
-19
lines changed

1 file changed

+23
-19
lines changed

modules/imgproc/src/smooth.cpp

+23 -19
Original file line number | Diff line number | Diff line change
@@ -1965,6 +1965,7 @@ class BilateralFilter_32f_Invoker :
19651965
#if CV_SSE3
19661966
if( haveSSE3 )
19671967
{
1968+
__m128 psum = _mm_setzero_ps();
19681969
const __m128 _val0 = _mm_set1_ps(sptr[j]);
19691970
const __m128 _scale_index = _mm_set1_ps(scale_index);
19701971
const __m128 _signMask = _mm_load_ps((const float*)bufSignMask);
@@ -1990,11 +1991,12 @@ class BilateralFilter_32f_Invoker :
19901991

19911992
_sw = _mm_hadd_ps(_w, _val);
19921993
_sw = _mm_hadd_ps(_sw, _sw);
1993-
_mm_storel_pi((__m64*)bufSum32, _sw);
1994-
1995-
sum += bufSum32[1];
1996-
wsum += bufSum32[0];
1994+
psum = _mm_add_ps(_sw, psum);
19971995
}
1996+
_mm_storel_pi((__m64*)bufSum32, psum);
1997+
1998+
sum = bufSum32[1];
1999+
wsum = bufSum32[0];
19982000
}
19992001
#endif
20002002

@@ -2013,7 +2015,7 @@ class BilateralFilter_32f_Invoker :
20132015
}
20142016
else
20152017
{
2016-
assert( cn == 3 );
2018+
CV_Assert( cn == 3 );
20172019
for( j = 0; j < size.width*3; j += 3 )
20182020
{
20192021
float sum_b = 0, sum_g = 0, sum_r = 0, wsum = 0;
@@ -2022,6 +2024,7 @@ class BilateralFilter_32f_Invoker :
20222024
#if CV_SSE3
20232025
if( haveSSE3 )
20242026
{
2027+
__m128 sum = _mm_setzero_ps();
20252028
const __m128 _b0 = _mm_set1_ps(b0);
20262029
const __m128 _g0 = _mm_set1_ps(g0);
20272030
const __m128 _r0 = _mm_set1_ps(r0);
@@ -2032,14 +2035,16 @@ class BilateralFilter_32f_Invoker :
20322035
{
20332036
__m128 _sw = _mm_loadu_ps(space_weight + k);
20342037

2035-
const float* sptr_k = sptr + j + space_ofs[k];
2036-
const float* sptr_k1 = sptr + j + space_ofs[k+1];
2037-
const float* sptr_k2 = sptr + j + space_ofs[k+2];
2038-
const float* sptr_k3 = sptr + j + space_ofs[k+3];
2038+
const float* const sptr_k0 = sptr + j + space_ofs[k];
2039+
const float* const sptr_k1 = sptr + j + space_ofs[k+1];
2040+
const float* const sptr_k2 = sptr + j + space_ofs[k+2];
2041+
const float* const sptr_k3 = sptr + j + space_ofs[k+3];
20392042

2040-
__m128 _b = _mm_set_ps(sptr_k3[0], sptr_k2[0], sptr_k1[0], sptr_k[0]);
2041-
__m128 _g = _mm_set_ps(sptr_k3[1], sptr_k2[1], sptr_k1[1], sptr_k[1]);
2042-
__m128 _r = _mm_set_ps(sptr_k3[2], sptr_k2[2], sptr_k1[2], sptr_k[2]);
2043+
__m128 _b = _mm_loadu_ps(sptr_k0);
2044+
__m128 _g = _mm_loadu_ps(sptr_k1);
2045+
__m128 _r = _mm_loadu_ps(sptr_k2);
2046+
__m128 _z = _mm_loadu_ps(sptr_k3);
2047+
_MM_TRANSPOSE4_PS(_b, _g, _r, _z);
20432048

20442049
__m128 _bt = _mm_andnot_ps(_signMask,_mm_sub_ps(_b,_b0));
20452050
__m128 _gt = _mm_andnot_ps(_signMask,_mm_sub_ps(_g,_g0));
@@ -2064,14 +2069,13 @@ class BilateralFilter_32f_Invoker :
20642069
_g = _mm_hadd_ps(_g, _r);
20652070

20662071
_w = _mm_hadd_ps(_w, _g);
2067-
_mm_store_ps(bufSum32, _w);
2068-
2069-
wsum += bufSum32[0];
2070-
sum_b += bufSum32[1];
2071-
sum_g += bufSum32[2];
2072-
sum_r += bufSum32[3];
2072+
sum = _mm_add_ps(sum, _w);
20732073
}
2074-
2074+
_mm_store_ps(bufSum32, sum);
2075+
wsum = bufSum32[0];
2076+
sum_b = bufSum32[1];
2077+
sum_g = bufSum32[2];
2078+
sum_r = bufSum32[3];
20752079
}
20762080
#endif
20772081

0 commit comments

Comments (0)