Skip to content

Commit d620ef0

Browse files
Andrey Kamaev authored and OpenCV Buildbot committed
Merge pull request opencv#577 from ilya-lavrenov:BilateralFilter
2 parents 7b79eaf + 3eed5d8 commit d620ef0

File tree

1 file changed: +34 -26 lines changed

modules/imgproc/src/smooth.cpp

+34 -26
Original file line number | Diff line number | Diff line change
@@ -1787,21 +1787,25 @@ class BilateralFilter_8u_Invoker :
17871787
#if CV_SSE3
17881788
if( haveSSE3 )
17891789
{
1790+
const __m128i izero = _mm_setzero_si128();
17901791
const __m128 _b0 = _mm_set1_ps(static_cast<float>(b0));
17911792
const __m128 _g0 = _mm_set1_ps(static_cast<float>(g0));
17921793
const __m128 _r0 = _mm_set1_ps(static_cast<float>(r0));
17931794
const __m128 _signMask = _mm_load_ps((const float*)bufSignMask);
17941795

17951796
for( ; k <= maxk - 4; k += 4 )
17961797
{
1797-
const uchar* sptr_k = sptr + j + space_ofs[k];
1798-
const uchar* sptr_k1 = sptr + j + space_ofs[k+1];
1799-
const uchar* sptr_k2 = sptr + j + space_ofs[k+2];
1800-
const uchar* sptr_k3 = sptr + j + space_ofs[k+3];
1798+
const int* const sptr_k0 = reinterpret_cast<const int*>(sptr + j + space_ofs[k]);
1799+
const int* const sptr_k1 = reinterpret_cast<const int*>(sptr + j + space_ofs[k+1]);
1800+
const int* const sptr_k2 = reinterpret_cast<const int*>(sptr + j + space_ofs[k+2]);
1801+
const int* const sptr_k3 = reinterpret_cast<const int*>(sptr + j + space_ofs[k+3]);
18011802

1802-
__m128 _b = _mm_set_ps(sptr_k3[0],sptr_k2[0],sptr_k1[0],sptr_k[0]);
1803-
__m128 _g = _mm_set_ps(sptr_k3[1],sptr_k2[1],sptr_k1[1],sptr_k[1]);
1804-
__m128 _r = _mm_set_ps(sptr_k3[2],sptr_k2[2],sptr_k1[2],sptr_k[2]);
1803+
__m128 _b = _mm_cvtepi32_ps(_mm_unpacklo_epi16(_mm_unpacklo_epi8(_mm_cvtsi32_si128(sptr_k0[0]), izero), izero));
1804+
__m128 _g = _mm_cvtepi32_ps(_mm_unpacklo_epi16(_mm_unpacklo_epi8(_mm_cvtsi32_si128(sptr_k1[0]), izero), izero));
1805+
__m128 _r = _mm_cvtepi32_ps(_mm_unpacklo_epi16(_mm_unpacklo_epi8(_mm_cvtsi32_si128(sptr_k2[0]), izero), izero));
1806+
__m128 _z = _mm_cvtepi32_ps(_mm_unpacklo_epi16(_mm_unpacklo_epi8(_mm_cvtsi32_si128(sptr_k3[0]), izero), izero));
1807+
1808+
_MM_TRANSPOSE4_PS(_b, _g, _r, _z);
18051809

18061810
__m128 bt = _mm_andnot_ps(_signMask, _mm_sub_ps(_b,_b0));
18071811
__m128 gt = _mm_andnot_ps(_signMask, _mm_sub_ps(_g,_g0));
@@ -1961,6 +1965,7 @@ class BilateralFilter_32f_Invoker :
19611965
#if CV_SSE3
19621966
if( haveSSE3 )
19631967
{
1968+
__m128 psum = _mm_setzero_ps();
19641969
const __m128 _val0 = _mm_set1_ps(sptr[j]);
19651970
const __m128 _scale_index = _mm_set1_ps(scale_index);
19661971
const __m128 _signMask = _mm_load_ps((const float*)bufSignMask);
@@ -1986,11 +1991,12 @@ class BilateralFilter_32f_Invoker :
19861991

19871992
_sw = _mm_hadd_ps(_w, _val);
19881993
_sw = _mm_hadd_ps(_sw, _sw);
1989-
_mm_storel_pi((__m64*)bufSum32, _sw);
1990-
1991-
sum += bufSum32[1];
1992-
wsum += bufSum32[0];
1994+
psum = _mm_add_ps(_sw, psum);
19931995
}
1996+
_mm_storel_pi((__m64*)bufSum32, psum);
1997+
1998+
sum = bufSum32[1];
1999+
wsum = bufSum32[0];
19942000
}
19952001
#endif
19962002

@@ -2009,7 +2015,7 @@ class BilateralFilter_32f_Invoker :
20092015
}
20102016
else
20112017
{
2012-
assert( cn == 3 );
2018+
CV_Assert( cn == 3 );
20132019
for( j = 0; j < size.width*3; j += 3 )
20142020
{
20152021
float sum_b = 0, sum_g = 0, sum_r = 0, wsum = 0;
@@ -2018,6 +2024,7 @@ class BilateralFilter_32f_Invoker :
20182024
#if CV_SSE3
20192025
if( haveSSE3 )
20202026
{
2027+
__m128 sum = _mm_setzero_ps();
20212028
const __m128 _b0 = _mm_set1_ps(b0);
20222029
const __m128 _g0 = _mm_set1_ps(g0);
20232030
const __m128 _r0 = _mm_set1_ps(r0);
@@ -2028,14 +2035,16 @@ class BilateralFilter_32f_Invoker :
20282035
{
20292036
__m128 _sw = _mm_loadu_ps(space_weight + k);
20302037

2031-
const float* sptr_k = sptr + j + space_ofs[k];
2032-
const float* sptr_k1 = sptr + j + space_ofs[k+1];
2033-
const float* sptr_k2 = sptr + j + space_ofs[k+2];
2034-
const float* sptr_k3 = sptr + j + space_ofs[k+3];
2038+
const float* const sptr_k0 = sptr + j + space_ofs[k];
2039+
const float* const sptr_k1 = sptr + j + space_ofs[k+1];
2040+
const float* const sptr_k2 = sptr + j + space_ofs[k+2];
2041+
const float* const sptr_k3 = sptr + j + space_ofs[k+3];
20352042

2036-
__m128 _b = _mm_set_ps(sptr_k3[0], sptr_k2[0], sptr_k1[0], sptr_k[0]);
2037-
__m128 _g = _mm_set_ps(sptr_k3[1], sptr_k2[1], sptr_k1[1], sptr_k[1]);
2038-
__m128 _r = _mm_set_ps(sptr_k3[2], sptr_k2[2], sptr_k1[2], sptr_k[2]);
2043+
__m128 _b = _mm_loadu_ps(sptr_k0);
2044+
__m128 _g = _mm_loadu_ps(sptr_k1);
2045+
__m128 _r = _mm_loadu_ps(sptr_k2);
2046+
__m128 _z = _mm_loadu_ps(sptr_k3);
2047+
_MM_TRANSPOSE4_PS(_b, _g, _r, _z);
20392048

20402049
__m128 _bt = _mm_andnot_ps(_signMask,_mm_sub_ps(_b,_b0));
20412050
__m128 _gt = _mm_andnot_ps(_signMask,_mm_sub_ps(_g,_g0));
@@ -2060,14 +2069,13 @@ class BilateralFilter_32f_Invoker :
20602069
_g = _mm_hadd_ps(_g, _r);
20612070

20622071
_w = _mm_hadd_ps(_w, _g);
2063-
_mm_store_ps(bufSum32, _w);
2064-
2065-
wsum += bufSum32[0];
2066-
sum_b += bufSum32[1];
2067-
sum_g += bufSum32[2];
2068-
sum_r += bufSum32[3];
2072+
sum = _mm_add_ps(sum, _w);
20692073
}
2070-
2074+
_mm_store_ps(bufSum32, sum);
2075+
wsum = bufSum32[0];
2076+
sum_b = bufSum32[1];
2077+
sum_g = bufSum32[2];
2078+
sum_r = bufSum32[3];
20712079
}
20722080
#endif
20732081

0 commit comments

Comments
 (0)