Replace the C/gcc SSE accumulator with std::arch intrinsics

Removes build.rs, src/accumulate.c and the gcc build-dependency, and
reimplements the `sse` variant of `accumulate` directly in Rust.

Review amendments relative to the first version of this patch:
  * a trailing group of fewer than four floats is zero-padded before the
    128-bit load instead of being read past the end of `src`;
  * output bytes are written through `chunks_mut` rather than through a
    pointer transmuted from `&dst[i]`;
  * SSSE3 support is asserted before `_mm_shuffle_epi8` is executed;
  * the result is shortened with `Vec::truncate`, and the now-unused
    `use std::mem;` is dropped.

diff --git a/Cargo.toml b/Cargo.toml
index df49da2..19443ab 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -6,10 +6,6 @@ authors = [ "Raph Levien " ]
 keywords = ["font", "truetype", "ttf"]
 description = "A font renderer written (mostly) in pure, safe Rust"
 repository = "https://github.com/google/font-rs"
-build = "build.rs"
 
 [features]
 sse = []
-
-[target.'cfg(target_feature = "sse")'.build-dependencies]
-gcc = "0.3"
diff --git a/build.rs b/build.rs
deleted file mode 100644
index 9dbf570..0000000
--- a/build.rs
+++ /dev/null
@@ -1,11 +0,0 @@
-#[cfg(feature="sse")]
-extern crate gcc;
-
-fn main() {
-    #[cfg(feature="sse")]
-    gcc::Build::new()
-        .file("src/accumulate.c")
-        .flag("-march=native")
-        .flag("-std=c99")
-        .compile("libaccumulate.a");
-}
diff --git a/src/accumulate.c b/src/accumulate.c
deleted file mode 100644
index 2551e36..0000000
--- a/src/accumulate.c
+++ /dev/null
@@ -1,37 +0,0 @@
-// Copyright 2015 Google Inc. All rights reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-// SSE3 instrinsics for cumulative sum and conversion to pixels
-
-#include <stdint.h>
-#include <tmmintrin.h>
-
-void accumulate_sse(const float *in, uint8_t *out, uint32_t n) {
-  __m128 offset = _mm_setzero_ps();
-  __m128i mask = _mm_set1_epi32(0x0c080400);
-  __m128 sign_mask = _mm_set1_ps(-0.f);
-  for (uint32_t i = 0; i < n; i += 4) {
-    __m128 x = _mm_load_ps(&in[i]);
-    x = _mm_add_ps(x, _mm_castsi128_ps(_mm_slli_si128(_mm_castps_si128(x), 4)));
-    x = _mm_add_ps(x, _mm_shuffle_ps(_mm_setzero_ps(), x, 0x40));
-    x = _mm_add_ps(x, offset);
-    __m128 y = _mm_andnot_ps(sign_mask, x); // fabs(x)
-    y = _mm_min_ps(y, _mm_set1_ps(1.0f));
-    y = _mm_mul_ps(y, _mm_set1_ps(255.0f));
-    __m128i z = _mm_cvttps_epi32(y);
-    z = _mm_shuffle_epi8(z, mask);
-    _mm_store_ss((float *)&out[i], _mm_castsi128_ps(z));
-    offset = _mm_shuffle_ps(x, x, _MM_SHUFFLE(3, 3, 3, 3));
-  }
-}
diff --git a/src/accumulate.rs b/src/accumulate.rs
index 8766aa2..d41c536 100644
--- a/src/accumulate.rs
+++ b/src/accumulate.rs
@@ -12,10 +12,17 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#[cfg(feature = "sse")]
-#[link(name = "accumulate")]
-extern "C" {
-    fn accumulate_sse(src: *const f32, dst: *mut u8, n: u32);
+#[cfg(target_arch = "x86_64")]
+use std::arch::x86_64::*;
+
+#[cfg(target_arch = "x86")]
+use std::arch::x86::*;
+
+// Builds the immediate for `_mm_shuffle_ps`, like C's `_MM_SHUFFLE(z, y, x, w)`.
+macro_rules! _mm_shuffle {
+    ($z:expr, $y:expr, $x:expr, $w:expr) => {
+        ($z << 6) | ($y << 4) | ($x << 2) | $w
+    };
 }
 
 #[cfg(feature = "sse")]
@@ -28,11 +35,54 @@ pub fn accumulate(src: &[f32]) -> Vec<u8> {
     // and so on
     let len = src.len();
     let n = (len + 3) & !3; // align data
-    let mut dst: Vec<u8> = Vec::with_capacity(n);
+    // Zero-initialised so every output byte is valid before the loop fills it in.
+    let mut dst: Vec<u8> = vec![0; n];
+
+    // `_mm_shuffle_epi8` is an SSSE3 instruction: fail loudly rather than run it
+    // on a CPU without SSSE3.
+    assert!(
+        is_x86_feature_detected!("ssse3"),
+        "the `sse` feature requires a CPU with SSSE3 support"
+    );
+
+    // SAFETY: SSSE3 availability is asserted above, and every pointer handed to
+    // `_mm_loadu_ps` refers to four readable `f32`s (it has no alignment requirement).
     unsafe {
-        accumulate_sse(src.as_ptr(), dst.as_mut_ptr(), n as u32);
-        dst.set_len(len); // we must return vec of the same length as src.len()
+        let mut offset = _mm_setzero_ps();
+        let sign_mask = _mm_set1_ps(-0.);
+        // Gathers the low byte of each 32-bit lane into four consecutive bytes.
+        let mask = _mm_set1_epi32(0x0c080400);
+
+        for (chunk, out) in src.chunks(4).zip(dst.chunks_mut(4)) {
+            // `src` is not padded to a multiple of four, so a short final group
+            // is copied into a zero-filled buffer instead of being over-read.
+            let mut x = if chunk.len() == 4 {
+                _mm_loadu_ps(chunk.as_ptr())
+            } else {
+                let mut padded = [0.0f32; 4];
+                padded[..chunk.len()].copy_from_slice(chunk);
+                _mm_loadu_ps(padded.as_ptr())
+            };
+            // Prefix sum across the four lanes, then add the carry from the previous group.
+            x = _mm_add_ps(x, _mm_castsi128_ps(_mm_slli_si128(_mm_castps_si128(x), 4)));
+            x = _mm_add_ps(x, _mm_shuffle_ps(_mm_setzero_ps(), x, 0x40));
+            x = _mm_add_ps(x, offset);
+
+            let mut y = _mm_andnot_ps(sign_mask, x); // fabs(x)
+            y = _mm_min_ps(y, _mm_set1_ps(1.0));
+            y = _mm_mul_ps(y, _mm_set1_ps(255.0));
+
+            let mut z = _mm_cvttps_epi32(y);
+            z = _mm_shuffle_epi8(z, mask);
+
+            // Write the four packed bytes with a safe slice copy (x86 is little-endian).
+            out.copy_from_slice(&_mm_cvtsi128_si32(z).to_le_bytes());
+            // Broadcast the running total (lane 3) as the carry for the next group.
+            offset = _mm_shuffle_ps(x, x, _mm_shuffle!(3, 3, 3, 3));
+        }
     }
+
+    dst.truncate(len); // we must return vec of the same length as src.len()
     dst
 }
 