@@ -1787,21 +1787,25 @@ class BilateralFilter_8u_Invoker :
1787
1787
#if CV_SSE3
1788
1788
if ( haveSSE3 )
1789
1789
{
1790
+ const __m128i izero = _mm_setzero_si128 ();
1790
1791
const __m128 _b0 = _mm_set1_ps (static_cast <float >(b0));
1791
1792
const __m128 _g0 = _mm_set1_ps (static_cast <float >(g0));
1792
1793
const __m128 _r0 = _mm_set1_ps (static_cast <float >(r0));
1793
1794
const __m128 _signMask = _mm_load_ps ((const float *)bufSignMask);
1794
1795
1795
1796
for ( ; k <= maxk - 4 ; k += 4 )
1796
1797
{
1797
- const uchar* sptr_k = sptr + j + space_ofs[k];
1798
- const uchar* sptr_k1 = sptr + j + space_ofs[k+1 ];
1799
- const uchar* sptr_k2 = sptr + j + space_ofs[k+2 ];
1800
- const uchar* sptr_k3 = sptr + j + space_ofs[k+3 ];
1798
+ const int * const sptr_k0 = reinterpret_cast < const int *>( sptr + j + space_ofs[k]) ;
1799
+ const int * const sptr_k1 = reinterpret_cast < const int *>( sptr + j + space_ofs[k+1 ]) ;
1800
+ const int * const sptr_k2 = reinterpret_cast < const int *>( sptr + j + space_ofs[k+2 ]) ;
1801
+ const int * const sptr_k3 = reinterpret_cast < const int *>( sptr + j + space_ofs[k+3 ]) ;
1801
1802
1802
- __m128 _b = _mm_set_ps (sptr_k3[0 ],sptr_k2[0 ],sptr_k1[0 ],sptr_k[0 ]);
1803
- __m128 _g = _mm_set_ps (sptr_k3[1 ],sptr_k2[1 ],sptr_k1[1 ],sptr_k[1 ]);
1804
- __m128 _r = _mm_set_ps (sptr_k3[2 ],sptr_k2[2 ],sptr_k1[2 ],sptr_k[2 ]);
1803
+ __m128 _b = _mm_cvtepi32_ps (_mm_unpacklo_epi16 (_mm_unpacklo_epi8 (_mm_cvtsi32_si128 (sptr_k0[0 ]), izero), izero));
1804
+ __m128 _g = _mm_cvtepi32_ps (_mm_unpacklo_epi16 (_mm_unpacklo_epi8 (_mm_cvtsi32_si128 (sptr_k1[0 ]), izero), izero));
1805
+ __m128 _r = _mm_cvtepi32_ps (_mm_unpacklo_epi16 (_mm_unpacklo_epi8 (_mm_cvtsi32_si128 (sptr_k2[0 ]), izero), izero));
1806
+ __m128 _z = _mm_cvtepi32_ps (_mm_unpacklo_epi16 (_mm_unpacklo_epi8 (_mm_cvtsi32_si128 (sptr_k3[0 ]), izero), izero));
1807
+
1808
+ _MM_TRANSPOSE4_PS (_b, _g, _r, _z);
1805
1809
1806
1810
__m128 bt = _mm_andnot_ps (_signMask, _mm_sub_ps (_b,_b0));
1807
1811
__m128 gt = _mm_andnot_ps (_signMask, _mm_sub_ps (_g,_g0));
@@ -1961,6 +1965,7 @@ class BilateralFilter_32f_Invoker :
1961
1965
#if CV_SSE3
1962
1966
if ( haveSSE3 )
1963
1967
{
1968
+ __m128 psum = _mm_setzero_ps ();
1964
1969
const __m128 _val0 = _mm_set1_ps (sptr[j]);
1965
1970
const __m128 _scale_index = _mm_set1_ps (scale_index);
1966
1971
const __m128 _signMask = _mm_load_ps ((const float *)bufSignMask);
@@ -1986,11 +1991,12 @@ class BilateralFilter_32f_Invoker :
1986
1991
1987
1992
_sw = _mm_hadd_ps (_w, _val);
1988
1993
_sw = _mm_hadd_ps (_sw, _sw);
1989
- _mm_storel_pi ((__m64*)bufSum32, _sw);
1990
-
1991
- sum += bufSum32[1 ];
1992
- wsum += bufSum32[0 ];
1994
+ psum = _mm_add_ps (_sw, psum);
1993
1995
}
1996
+ _mm_storel_pi ((__m64*)bufSum32, psum);
1997
+
1998
+ sum = bufSum32[1 ];
1999
+ wsum = bufSum32[0 ];
1994
2000
}
1995
2001
#endif
1996
2002
@@ -2009,7 +2015,7 @@ class BilateralFilter_32f_Invoker :
2009
2015
}
2010
2016
else
2011
2017
{
2012
- assert ( cn == 3 );
2018
+ CV_Assert ( cn == 3 );
2013
2019
for ( j = 0 ; j < size.width *3 ; j += 3 )
2014
2020
{
2015
2021
float sum_b = 0 , sum_g = 0 , sum_r = 0 , wsum = 0 ;
@@ -2018,6 +2024,7 @@ class BilateralFilter_32f_Invoker :
2018
2024
#if CV_SSE3
2019
2025
if ( haveSSE3 )
2020
2026
{
2027
+ __m128 sum = _mm_setzero_ps ();
2021
2028
const __m128 _b0 = _mm_set1_ps (b0);
2022
2029
const __m128 _g0 = _mm_set1_ps (g0);
2023
2030
const __m128 _r0 = _mm_set1_ps (r0);
@@ -2028,14 +2035,16 @@ class BilateralFilter_32f_Invoker :
2028
2035
{
2029
2036
__m128 _sw = _mm_loadu_ps (space_weight + k);
2030
2037
2031
- const float * sptr_k = sptr + j + space_ofs[k];
2032
- const float * sptr_k1 = sptr + j + space_ofs[k+1 ];
2033
- const float * sptr_k2 = sptr + j + space_ofs[k+2 ];
2034
- const float * sptr_k3 = sptr + j + space_ofs[k+3 ];
2038
+ const float * const sptr_k0 = sptr + j + space_ofs[k];
2039
+ const float * const sptr_k1 = sptr + j + space_ofs[k+1 ];
2040
+ const float * const sptr_k2 = sptr + j + space_ofs[k+2 ];
2041
+ const float * const sptr_k3 = sptr + j + space_ofs[k+3 ];
2035
2042
2036
- __m128 _b = _mm_set_ps (sptr_k3[0 ], sptr_k2[0 ], sptr_k1[0 ], sptr_k[0 ]);
2037
- __m128 _g = _mm_set_ps (sptr_k3[1 ], sptr_k2[1 ], sptr_k1[1 ], sptr_k[1 ]);
2038
- __m128 _r = _mm_set_ps (sptr_k3[2 ], sptr_k2[2 ], sptr_k1[2 ], sptr_k[2 ]);
2043
+ __m128 _b = _mm_loadu_ps (sptr_k0);
2044
+ __m128 _g = _mm_loadu_ps (sptr_k1);
2045
+ __m128 _r = _mm_loadu_ps (sptr_k2);
2046
+ __m128 _z = _mm_loadu_ps (sptr_k3);
2047
+ _MM_TRANSPOSE4_PS (_b, _g, _r, _z);
2039
2048
2040
2049
__m128 _bt = _mm_andnot_ps (_signMask,_mm_sub_ps (_b,_b0));
2041
2050
__m128 _gt = _mm_andnot_ps (_signMask,_mm_sub_ps (_g,_g0));
@@ -2060,14 +2069,13 @@ class BilateralFilter_32f_Invoker :
2060
2069
_g = _mm_hadd_ps (_g, _r);
2061
2070
2062
2071
_w = _mm_hadd_ps (_w, _g);
2063
- _mm_store_ps (bufSum32, _w);
2064
-
2065
- wsum += bufSum32[0 ];
2066
- sum_b += bufSum32[1 ];
2067
- sum_g += bufSum32[2 ];
2068
- sum_r += bufSum32[3 ];
2072
+ sum = _mm_add_ps (sum, _w);
2069
2073
}
2070
-
2074
+ _mm_store_ps (bufSum32, sum);
2075
+ wsum = bufSum32[0 ];
2076
+ sum_b = bufSum32[1 ];
2077
+ sum_g = bufSum32[2 ];
2078
+ sum_r = bufSum32[3 ];
2071
2079
}
2072
2080
#endif
2073
2081
0 commit comments