@@ -1965,6 +1965,7 @@ class BilateralFilter_32f_Invoker :
1965
1965
#if CV_SSE3
1966
1966
if ( haveSSE3 )
1967
1967
{
1968
+ __m128 psum = _mm_setzero_ps ();
1968
1969
const __m128 _val0 = _mm_set1_ps (sptr[j]);
1969
1970
const __m128 _scale_index = _mm_set1_ps (scale_index);
1970
1971
const __m128 _signMask = _mm_load_ps ((const float *)bufSignMask);
@@ -1990,11 +1991,12 @@ class BilateralFilter_32f_Invoker :
1990
1991
1991
1992
_sw = _mm_hadd_ps (_w, _val);
1992
1993
_sw = _mm_hadd_ps (_sw, _sw);
1993
- _mm_storel_pi ((__m64*)bufSum32, _sw);
1994
-
1995
- sum += bufSum32[1 ];
1996
- wsum += bufSum32[0 ];
1994
+ psum = _mm_add_ps (_sw, psum);
1997
1995
}
1996
+ _mm_storel_pi ((__m64*)bufSum32, psum);
1997
+
1998
+ sum = bufSum32[1 ];
1999
+ wsum = bufSum32[0 ];
1998
2000
}
1999
2001
#endif
2000
2002
@@ -2013,7 +2015,7 @@ class BilateralFilter_32f_Invoker :
2013
2015
}
2014
2016
else
2015
2017
{
2016
- assert ( cn == 3 );
2018
+ CV_Assert ( cn == 3 );
2017
2019
for ( j = 0 ; j < size.width *3 ; j += 3 )
2018
2020
{
2019
2021
float sum_b = 0 , sum_g = 0 , sum_r = 0 , wsum = 0 ;
@@ -2022,6 +2024,7 @@ class BilateralFilter_32f_Invoker :
2022
2024
#if CV_SSE3
2023
2025
if ( haveSSE3 )
2024
2026
{
2027
+ __m128 sum = _mm_setzero_ps ();
2025
2028
const __m128 _b0 = _mm_set1_ps (b0);
2026
2029
const __m128 _g0 = _mm_set1_ps (g0);
2027
2030
const __m128 _r0 = _mm_set1_ps (r0);
@@ -2032,14 +2035,16 @@ class BilateralFilter_32f_Invoker :
2032
2035
{
2033
2036
__m128 _sw = _mm_loadu_ps (space_weight + k);
2034
2037
2035
- const float * sptr_k = sptr + j + space_ofs[k];
2036
- const float * sptr_k1 = sptr + j + space_ofs[k+1 ];
2037
- const float * sptr_k2 = sptr + j + space_ofs[k+2 ];
2038
- const float * sptr_k3 = sptr + j + space_ofs[k+3 ];
2038
+ const float * const sptr_k0 = sptr + j + space_ofs[k];
2039
+ const float * const sptr_k1 = sptr + j + space_ofs[k+1 ];
2040
+ const float * const sptr_k2 = sptr + j + space_ofs[k+2 ];
2041
+ const float * const sptr_k3 = sptr + j + space_ofs[k+3 ];
2039
2042
2040
- __m128 _b = _mm_set_ps (sptr_k3[0 ], sptr_k2[0 ], sptr_k1[0 ], sptr_k[0 ]);
2041
- __m128 _g = _mm_set_ps (sptr_k3[1 ], sptr_k2[1 ], sptr_k1[1 ], sptr_k[1 ]);
2042
- __m128 _r = _mm_set_ps (sptr_k3[2 ], sptr_k2[2 ], sptr_k1[2 ], sptr_k[2 ]);
2043
+ __m128 _b = _mm_loadu_ps (sptr_k0);
2044
+ __m128 _g = _mm_loadu_ps (sptr_k1);
2045
+ __m128 _r = _mm_loadu_ps (sptr_k2);
2046
+ __m128 _z = _mm_loadu_ps (sptr_k3);
2047
+ _MM_TRANSPOSE4_PS (_b, _g, _r, _z);
2043
2048
2044
2049
__m128 _bt = _mm_andnot_ps (_signMask,_mm_sub_ps (_b,_b0));
2045
2050
__m128 _gt = _mm_andnot_ps (_signMask,_mm_sub_ps (_g,_g0));
@@ -2064,14 +2069,13 @@ class BilateralFilter_32f_Invoker :
2064
2069
_g = _mm_hadd_ps (_g, _r);
2065
2070
2066
2071
_w = _mm_hadd_ps (_w, _g);
2067
- _mm_store_ps (bufSum32, _w);
2068
-
2069
- wsum += bufSum32[0 ];
2070
- sum_b += bufSum32[1 ];
2071
- sum_g += bufSum32[2 ];
2072
- sum_r += bufSum32[3 ];
2072
+ sum = _mm_add_ps (sum, _w);
2073
2073
}
2074
-
2074
+ _mm_store_ps (bufSum32, sum);
2075
+ wsum = bufSum32[0 ];
2076
+ sum_b = bufSum32[1 ];
2077
+ sum_g = bufSum32[2 ];
2078
+ sum_r = bufSum32[3 ];
2075
2079
}
2076
2080
#endif
2077
2081
0 commit comments