@@ -12,10 +12,18 @@
// See the License for the specific language governing permissions and
// limitations under the License.

- #[cfg(feature = "sse")]
- #[link(name = "accumulate")]
- extern "C" {
-     fn accumulate_sse(src: *const f32, dst: *mut u8, n: u32);
+ use std::mem;
+
+ #[cfg(target_arch = "x86_64")]
+ use std::arch::x86_64::*;
+
+ #[cfg(target_arch = "x86")]
+ use std::arch::x86::*;
+
+ macro_rules! _mm_shuffle {
+     ($z:expr, $y:expr, $x:expr, $w:expr) => {
+         ($z << 6) | ($y << 4) | ($x << 2) | $w
+     };
}

#[cfg(feature = "sse")]
@@ -28,11 +36,33 @@ pub fn accumulate(src: &[f32]) -> Vec<u8> {
    // and so on
    let len = src.len();
    let n = (len + 3) & !3; // align data: round len up to a multiple of 4
-     let mut dst: Vec<u8> = Vec::with_capacity(n);
+     let mut dst: Vec<u8> = vec![0; n];
+
    unsafe {
-         accumulate_sse(src.as_ptr(), dst.as_mut_ptr(), n as u32);
+         let mut offset = _mm_setzero_ps(); // running total, broadcast to all lanes
+         let sign_mask = _mm_set1_ps(-0.); // only the sign bit set
+         let mask = _mm_set1_epi32(0x0c080400); // selects the low byte of each lane
+
+         for i in (0..n).step_by(4) {
+             let mut x = _mm_loadu_ps(&src[i]);
+             x = _mm_add_ps(x, _mm_castsi128_ps(_mm_slli_si128(_mm_castps_si128(x), 4))); // [a, a+b, b+c, c+d]
+             x = _mm_add_ps(x, _mm_shuffle_ps(_mm_setzero_ps(), x, 0x40)); // [a, a+b, a+b+c, a+b+c+d]
+             x = _mm_add_ps(x, offset); // carry in the previous chunks' total
+
+             let mut y = _mm_andnot_ps(sign_mask, x); // fabs(x)
+             y = _mm_min_ps(y, _mm_set1_ps(1.0));
+             y = _mm_mul_ps(y, _mm_set1_ps(255.0));
+
+             let mut z = _mm_cvttps_epi32(y);
+             z = _mm_shuffle_epi8(z, mask); // pack the four results into the low 4 bytes
+
+             _mm_store_ss(mem::transmute(&dst[i]), _mm_castsi128_ps(z)); // write those 4 bytes
+             offset = _mm_shuffle_ps(x, x, _mm_shuffle!(3, 3, 3, 3)); // broadcast lane 3
+         }
+
        dst.set_len(len); // we must return vec of the same length as src.len()
    }
+
    dst
}
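
For readers decoding the intrinsics: the loop computes a running prefix sum of `src` four lanes at a time. Two shift-and-add steps build the in-register prefix sums, and `offset` (lane 3 of the previous chunk, broadcast via `_mm_shuffle!(3, 3, 3, 3)`, which expands to `0xff`) carries the total across chunks. Each partial sum is then mapped through min(|s|, 1.0) * 255 and truncated to a byte. A minimal scalar sketch of the same computation, assuming only what the diff shows (the name `accumulate_scalar` and the example values below are ours, purely illustrative):

    // Scalar equivalent of the SIMD loop above (illustrative sketch,
    // not part of this commit).
    fn accumulate_scalar(src: &[f32]) -> Vec<u8> {
        let mut sum = 0.0f32;
        src.iter()
            .map(|&x| {
                sum += x; // running prefix sum
                (sum.abs().min(1.0) * 255.0) as u8 // fabs, clamp to 1.0, scale, truncate
            })
            .collect()
    }

    // accumulate_scalar(&[0.25, 0.25, -1.0, 2.0]) == vec![63, 127, 127, 255]
    // (prefix sums 0.25, 0.5, -0.5, 1.5 -> |s| clamped to 1.0, times 255, truncated)

The switch from `Vec::with_capacity(n)` to `vec![0; n]` matters here: the intrinsics write through `&dst[i]`, so the buffer must be initialized and indexable up to the rounded-up length `n` before `set_len(len)` trims it back to `src.len()`.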